// Compute the hash value with the BK&DR algorithm to reduce collisions
// wenet/runtime/core/decoder/ctc_prefix_beam_search.h
struct PrefixHash {
  size_t operator()(const std::vector<int>& prefix) const {
    size_t hash_code = 0;
    // Note: the source comment below contains a typo; it should read "BK&DR hash code"
    // here we use KB&DR hash code
    for (int id : prefix) {
      hash_code = id + 31 * hash_code;
    }
    return hash_code;
  }
};

// Using PrefixHash
// wenet/runtime/core/decoder/ctc_prefix_beam_search.cc
std::unordered_map<std::vector<int>, PrefixScore, PrefixHash> next_hyps;
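To make the role of PrefixHash concrete, here is a small hypothetical illustration (PrefixScore is reduced to a bare score field and the token ids are invented): identical prefixes hash to the same bucket, so the beam search can find and update the score of an already-tracked prefix.

#include <unordered_map>
#include <vector>

struct PrefixScore { float score = 0.0f; };  // simplified stand-in for wenet's PrefixScore

int main() {
  std::unordered_map<std::vector<int>, PrefixScore, PrefixHash> next_hyps;
  std::vector<int> prefix = {23, 5, 108};   // token ids of one decoded prefix
  next_hyps[prefix].score = -1.2f;          // insert a new prefix
  next_hyps[{23, 5, 108}].score += -0.3f;   // same ids, same bucket: update in place
  // BK&DR hash of {23, 5, 108}: ((0 * 31 + 23) * 31 + 5) * 31 + 108 = 22366
  return 0;
}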
// wenet/runtime/core/decoder/torch_asr_model.cc
void TorchAsrModel::ForwardEncoderFunc(
    const std::vector<std::vector<float>>& chunk_feats,
    std::vector<std::vector<float>>* out_prob) {
  // Run one chunk-level inference of the encoder; each call consumes a chunk
  // made up of several frames of one utterance
  // Input chunk_feats: [frames, feature_dim]; together with the cache
  // cached_feature_ it forms the encoder input
  // Output out_prob: [num_frames, vocab_size], the encoder output after
  // log softmax and before CTC decoding
  // 1. Prepare libtorch required data, splice cached_feature_ and chunk_feats
  // The first dimension is for batchsize, which is 1.
  int num_frames = cached_feature_.size() + chunk_feats.size();
  const int feature_dim = chunk_feats[0].size();
  torch::Tensor feats =
      torch::zeros({1, num_frames, feature_dim}, torch::kFloat);
  ...
}
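The elided part of ForwardEncoderFunc() fills this tensor by splicing the cached frames with the new chunk. The following is a hedged sketch of that splice step, not the verbatim wenet code (SpliceFeats is a hypothetical helper): each feature row is wrapped with torch::from_blob and copied into the corresponding row of the [1, num_frames, feature_dim] tensor.

#include <vector>

#include <torch/torch.h>

// Hypothetical helper mirroring the splice step: copy the cached frames and
// then the new chunk frames, row by row, into a [1, num_frames, feature_dim]
// tensor.
torch::Tensor SpliceFeats(
    const std::vector<std::vector<float>>& cached_feature,
    const std::vector<std::vector<float>>& chunk_feats) {
  const int num_frames = cached_feature.size() + chunk_feats.size();
  const int feature_dim = chunk_feats[0].size();
  torch::Tensor feats =
      torch::zeros({1, num_frames, feature_dim}, torch::kFloat);
  int row_idx = 0;
  for (const auto* rows : {&cached_feature, &chunk_feats}) {
    for (const auto& row : *rows) {
      // from_blob only wraps the existing buffer; the assignment copies the
      // data into `feats`, so the temporary view never outlives its source.
      feats[0][row_idx++] = torch::from_blob(
          const_cast<float*>(row.data()), {feature_dim}, torch::kFloat);
    }
  }
  return feats;
}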
// wenet/runtime/core/frontend/feature_pipeline.h
class FeaturePipeline {
 public:
  explicit FeaturePipeline(const FeaturePipelineConfig& config);

  // The feature extraction is done in AcceptWaveform().
  void AcceptWaveform(const float* pcm, const int size);
  void AcceptWaveform(const int16_t* pcm, const int size);
  ...

  // The caller should call this method when speech input is end.
  // Never call AcceptWaveform() after calling set_input_finished() !
  void set_input_finished();
  bool input_finished() const { return input_finished_; }
  ...

  // Read #num_frames frame features.
  // Return False if less then #num_frames features are read and the
  // input is finished.
  // Return True if #num_frames features are read.
  // This function is a blocking method when there is no feature
  // in feature_queue_ and the input is not finished.
  bool Read(int num_frames, std::vector<std::vector<float>>* feats);
  ...
};
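A hypothetical usage sketch (DecodeLoop and Recognize are invented names) shows the intended threading model: one thread feeds PCM through AcceptWaveform() and finally calls set_input_finished(), while the decoding thread blocks in Read() until enough features arrive or the input ends.

#include <cstdint>
#include <thread>
#include <vector>

#include "frontend/feature_pipeline.h"

void DecodeLoop(FeaturePipeline* pipeline) {
  std::vector<std::vector<float>> chunk_feats;
  // 67 frames per chunk is just an example value; Read() blocks until enough
  // frames are available or the input is finished.
  while (pipeline->Read(67, &chunk_feats)) {
    // ... forward `chunk_feats` through the encoder ...
  }
  // Read() returned false: the input has finished and fewer frames than
  // requested were left; handle the final (possibly short) chunk here.
}

void Recognize(const std::vector<int16_t>& pcm, FeaturePipeline* pipeline) {
  std::thread decode_thread(DecodeLoop, pipeline);
  // May be called repeatedly as audio arrives.
  pipeline->AcceptWaveform(pcm.data(), static_cast<int>(pcm.size()));
  // Never call AcceptWaveform() after this.
  pipeline->set_input_finished();
  decode_thread.join();
}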
// wenet/runtime/core/frontend/feature_pipeline.cc
bool FeaturePipeline::ReadOne(std::vector<float>* feat) {
  if (!feature_queue_.Empty()) {
    // The most common case: the queue is not empty, so pop one frame of
    // acoustic features and return it directly
    *feat = std::move(feature_queue_.Pop());
    return true;
  } else {
    // The queue is empty: take the lock and suspend this thread until data
    // is pushed into the queue
    std::unique_lock<std::mutex> lock(mutex_);
    while (!input_finished_) {
      // The input has not finished yet: release the mutex and wait until the
      // condition_variable notifies and wakes this thread
      // This will release the lock and wait for notify_one()
      // from AcceptWaveform() or set_input_finished()
      finish_condition_.wait(lock);
      // This thread has been woken up; check whether the queue is still empty
      if (!feature_queue_.Empty()) {
        // If it is not empty, pop one frame of acoustic features and return
        *feat = std::move(feature_queue_.Pop());
        return true;
      }
    }
    CHECK(input_finished_);
    // The original code here was:
    //   CHECK(feature_queue_.Empty());
    //   return false;
    // That code breaks under the following interleaving:
    // 1. The reader thread sees feature_queue_.Empty() == true, takes the
    //    else branch, and is then suspended
    // 2. The writer thread runs AcceptWaveform() and sets input_finished_
    //    to true
    // 3. The reader thread resumes in the else branch; input_finished_ is
    //    already true, so the while loop body is never entered and
    //    CHECK(feature_queue_.Empty()) fails
    // In other words, the writer pushes the last frames and sets
    // input_finished_ to true while the queue still holds data, but the
    // reader, judging only by input_finished_, concludes that everything is
    // over and never enters the while loop. So we must not
    // CHECK(feature_queue_.Empty()); we have to double-check whether the
    // queue is empty instead.
    // Double check queue.empty, see issue#893 for detailed discussions.
    // https://github.com/wenet-e2e/wenet/issues/893
    if (!feature_queue_.Empty()) {
      *feat = std::move(feature_queue_.Pop());
      return true;
    } else {
      return false;
    }
  }
}
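For the other side of this handshake, here is a simplified sketch of the producer, not the verbatim wenet code (it assumes feature_queue_ exposes a thread-safe Push()): both AcceptWaveform() and set_input_finished() notify finish_condition_, which is what wakes a reader blocked in ReadOne().

void FeaturePipeline::AcceptWaveform(const float* pcm, const int size) {
  std::vector<std::vector<float>> feats;
  // ... extract fbank features from the buffered samples into `feats` ...
  for (auto& f : feats) {
    feature_queue_.Push(std::move(f));  // the queue does its own locking
  }
  finish_condition_.notify_one();  // wake a reader blocked in ReadOne()
}

void FeaturePipeline::set_input_finished() {
  std::lock_guard<std::mutex> lock(mutex_);
  input_finished_ = true;
  finish_condition_.notify_one();  // wake the reader so it can drain and exit
}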
// wenet/runtime/core/decoder/asr_decoder.h
class AsrDecoder {
  ...
 private:
  // output symbol table
  std::shared_ptr<fst::SymbolTable> symbol_table_;
  // e2e unit symbol table
  std::shared_ptr<fst::SymbolTable> unit_table_ = nullptr;
  ...
};
Context biasing in wenet
In real-world ASR applications, common words are recognized well, but for some special words the recognition accuracy may drop. Context biasing refers to injecting prior knowledge into the ASR system during inference, for example the user's favorite songs, contacts, applications, or locations. Conventional ASR performs context biasing by building an n-gram finite state transducer (FST) from a list of biasing phrases; this FST is composed on the fly with the decoding graph during decoding, which helps bias the recognition result toward the n-grams contained in the contextual FST and thus improves recognition accuracy in specific scenarios.
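As a hedged illustration (not wenet's actual ContextGraph code), the sketch below builds such a biasing FST with OpenFst: each phrase, given as a sequence of modeling-unit ids, becomes a path whose arcs carry a negative cost, i.e. a bonus in the tropical semiring. BuildBiasingFst and context_score are hypothetical names, and a production biasing graph additionally needs failure or back-off arcs so that the bonus of a partially matched phrase is taken back when the match fails.

#include <vector>

#include "fst/fstlib.h"

// `phrases` holds each biasing phrase as a sequence of modeling-unit ids;
// `context_score` is the bonus added per matched unit (a hypothetical knob).
fst::StdVectorFst BuildBiasingFst(const std::vector<std::vector<int>>& phrases,
                                  float context_score) {
  fst::StdVectorFst graph;
  const int start = graph.AddState();
  graph.SetStart(start);
  graph.SetFinal(start, fst::TropicalWeight::One());
  for (const auto& phrase : phrases) {
    int cur = start;
    for (const int unit : phrase) {
      const int next = graph.AddState();
      // A negative cost is a reward in the tropical semiring (lower is better).
      graph.AddArc(cur, fst::StdArc(unit, unit, -context_score, next));
      cur = next;
    }
    // Epsilon arc back to the start so that further phrases can be matched.
    graph.AddArc(cur, fst::StdArc(0, 0, fst::TropicalWeight::One(), start));
  }
  return graph;
}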
// wenet/runtime/core/decoder/asr_decoder.h
const int time_stamp_gap_ = 100;  // timestamp gap between words in a sentence

// wenet/runtime/core/decoder/asr_decoder.cc
// Timestamps are generated only after the input has finished, when the final
// result is about to be emitted.
// We use the timestamps produced during acoustic-model (e2e) decoding, so the
// symbol table of the e2e model is also needed here.
// TimeStamp is only supported in final result
// TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to
// various FST operations when building the decoding graph. So here we use
// time stamp of the input(e2e model unit), which is more accurate, and it
// requires the symbol table of the e2e model used in training.
if (unit_table_ != nullptr && finish) {
  const std::vector<int>& input = inputs[i];
  const std::vector<int>& time_stamp = times[i];
  CHECK_EQ(input.size(), time_stamp.size());
  for (size_t j = 0; j < input.size(); j++) {
    std::string word = unit_table_->Find(input[j]);
    int start = time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_ > 0
                    ? time_stamp[j] * frame_shift_in_ms() - time_stamp_gap_
                    : 0;
    if (j > 0) {
      // If the gap between this word and the previous word is smaller than
      // the "minimum word gap" (time_stamp_gap_), the start time of this word
      // is taken as the midpoint between the previous word and this word
      start = (time_stamp[j] - time_stamp[j - 1]) * frame_shift_in_ms() <
                      time_stamp_gap_
                  ? (time_stamp[j - 1] + time_stamp[j]) / 2 * frame_shift_in_ms()
                  : start;
    }
    int end = time_stamp[j] * frame_shift_in_ms();
    if (j < input.size() - 1) {
      // If the gap between this word and the next word is smaller than the
      // "minimum word gap" (time_stamp_gap_), the end time of this word is
      // taken as the midpoint between the next word and this word
      end = (time_stamp[j + 1] - time_stamp[j]) * frame_shift_in_ms() <
                    time_stamp_gap_
                ? (time_stamp[j + 1] + time_stamp[j]) / 2 * frame_shift_in_ms()
                : end;
    }
    WordPiece word_piece(word, offset + start, offset + end);
    path.word_pieces.emplace_back(word_piece);
  }
}
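To make the rule concrete, here is a standalone sketch (ToTimeStamps is a hypothetical helper that mirrors, rather than quotes, the loop above). It assumes frame_shift_in_ms() returns the encoder frame shift, e.g. 40 ms for a typical 10 ms feature shift with 4x subsampling, and time_stamp_gap_ = 100 ms as in asr_decoder.h.

#include <algorithm>
#include <vector>

struct Piece { int start_ms; int end_ms; };

std::vector<Piece> ToTimeStamps(const std::vector<int>& time_stamp,
                                int frame_shift_ms, int time_stamp_gap) {
  std::vector<Piece> pieces;
  for (size_t j = 0; j < time_stamp.size(); ++j) {
    int center = time_stamp[j] * frame_shift_ms;
    // By default a word is assumed to span time_stamp_gap ms, ending at its frame.
    int start = std::max(center - time_stamp_gap, 0);
    if (j > 0 &&
        (time_stamp[j] - time_stamp[j - 1]) * frame_shift_ms < time_stamp_gap) {
      // Too close to the previous word: start at the midpoint of the two frames.
      start = (time_stamp[j - 1] + time_stamp[j]) / 2 * frame_shift_ms;
    }
    int end = center;
    if (j + 1 < time_stamp.size() &&
        (time_stamp[j + 1] - time_stamp[j]) * frame_shift_ms < time_stamp_gap) {
      // Too close to the next word: end at the midpoint of the two frames.
      end = (time_stamp[j] + time_stamp[j + 1]) / 2 * frame_shift_ms;
    }
    pieces.push_back({start, end});
  }
  return pieces;
}

For instance, with frame_shift_ms = 40, the time stamps {5, 7, 30} produce the word intervals (100, 240), (240, 280), and (1100, 1200) ms.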
// wenet/runtime/core/decoder/asr_decoder.cc
for (size_t j = 0; j < hypothesis.size(); j++) {
  std::string word = symbol_table_->Find(hypothesis[j]);
  // A detailed explanation of this if-else branch can be found in
  // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058
  if (searcher_->Type() == kWfstBeamSearch) {
    path.sentence += (' ' + word);
  } else {
    path.sentence += (word);
  }
}
...
// Post-processing; currently its main job is to remove unwanted spaces.
// example1: "我 爱 你" ==> "我爱你"
// example2: " i love wenet" ==> "i love wenet"
// example3: "我 爱 wenet very much" ==> "我爱wenet very much"
// example4: "aa ää xx yy" ==> "aa ää xx yy"
if (post_processor_ != nullptr) {
  path.sentence = post_processor_->Process(path.sentence, finish);
}
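The rule behind the four examples can be sketched as follows. This is a hedged approximation of the space-removal behavior, not wenet's actual PostProcessor implementation (RemoveExtraSpaces, FirstCodePoint, LastCodePoint, and IsCjk are made-up helpers, and the CJK test only covers the basic CJK Unified Ideographs block): a space between two tokens is kept only when neither neighboring character is a CJK ideograph.

#include <cstdint>
#include <sstream>
#include <string>

// Decode the UTF-8 code point of `s` starting at byte `pos` (assumes valid UTF-8).
static uint32_t FirstCodePoint(const std::string& s, size_t pos = 0) {
  unsigned char c = s[pos];
  if (c < 0x80) return c;
  if ((c & 0xE0) == 0xC0) return ((c & 0x1F) << 6) | (s[pos + 1] & 0x3F);
  if ((c & 0xF0) == 0xE0)
    return ((c & 0x0F) << 12) | ((s[pos + 1] & 0x3F) << 6) | (s[pos + 2] & 0x3F);
  return ((c & 0x07) << 18) | ((s[pos + 1] & 0x3F) << 12) |
         ((s[pos + 2] & 0x3F) << 6) | (s[pos + 3] & 0x3F);
}

// Decode the last UTF-8 code point of `s`.
static uint32_t LastCodePoint(const std::string& s) {
  size_t pos = s.size() - 1;
  while (pos > 0 && (static_cast<unsigned char>(s[pos]) & 0xC0) == 0x80) --pos;
  return FirstCodePoint(s, pos);
}

// Rough CJK test: CJK Unified Ideographs only.
static bool IsCjk(uint32_t cp) { return cp >= 0x4E00 && cp <= 0x9FFF; }

// Join whitespace-separated tokens, keeping a space only when neither
// neighboring character is a CJK ideograph.
std::string RemoveExtraSpaces(const std::string& sentence) {
  std::istringstream iss(sentence);
  std::string tok, result;
  while (iss >> tok) {  // also drops leading/trailing spaces, as in example2
    if (!result.empty() && !IsCjk(LastCodePoint(result)) &&
        !IsCjk(FirstCodePoint(tok))) {
      result += ' ';
    }
    result += tok;
  }
  return result;
}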