Rui Tao's Portfolio

Implementing a Scalable Speech Recognition Service

Published on — 4 mins read

---

Service Interface

First, define the core interfaces:

/**
 * One transcription result emitted by a recognition backend.
 */
interface RecognitionResult {
  // Transcribed text for the processed audio segment.
  text: string;
  // True when the backend marks this result as final (no further revisions).
  isFinal: boolean;
  // Backend-reported confidence score for this transcription.
  confidence: number;
}
 
/**
 * Audio/language settings shared by all recognition backends.
 */
interface IRecognitionConfig {
  // Language tag, e.g. 'en-US' or 'zh-CN'.
  languageCode: string;
  // Input sample rate in hertz, e.g. 16000.
  sampleRate: number;
  // Audio encoding of the incoming buffers.
  encoding: 'LINEAR16' | 'FLAC' | 'MULAW';
}
 
/**
 * Contract implemented by every speech-recognition backend.
 */
interface IRecognitionService {
  // Begin a recognition session; config overrides the current settings.
  startRecognition(config: IRecognitionConfig): Promise<void>;
  // Tear down the current session and release its resources.
  stopRecognition(): void;
  // Register the callback invoked for each recognition result.
  onRecognitionResult(callback: (result: RecognitionResult) => void): void;
  // Register the callback invoked when an unrecoverable error occurs.
  onError(callback: (error: Error) => void): void;
  // Feed one chunk of raw audio into the active session.
  processAudioData(data: Buffer): Promise<void>;
}

Base Recognition Service

Create a base class for common functionality:

/**
 * Shared plumbing for speech-recognition backends: configuration
 * defaults, callback registration, and error handling with bounded
 * automatic reconnection.
 */
abstract class BaseRecognitionService implements IRecognitionService {
  protected config: IRecognitionConfig;
  // Declared optional ('?') because callers may never register a
  // callback — also required under strictPropertyInitialization,
  // since neither field is assigned in the constructor.
  protected resultCallback?: (result: RecognitionResult) => void;
  protected errorCallback?: (error: Error) => void;
  protected retryCount = 0;
  protected readonly MAX_RETRIES = 3;
  protected isProcessing = false;
 
  /**
   * @param config Optional overrides merged over the defaults
   *               (en-US, 16 kHz, LINEAR16). Defaults to {} so
   *               subclasses may pass an undefined sub-config.
   */
  constructor(config: Partial<IRecognitionConfig> = {}) {
    this.config = {
      languageCode: 'en-US',
      sampleRate: 16000,
      encoding: 'LINEAR16',
      ...config
    };
  }
 
  abstract startRecognition(config?: IRecognitionConfig): Promise<void>;
  abstract stopRecognition(): void;
  abstract processAudioData(data: Buffer): Promise<void>;
 
  onRecognitionResult(callback: (result: RecognitionResult) => void): void {
    this.resultCallback = callback;
  }
 
  onError(callback: (error: Error) => void): void {
    this.errorCallback = callback;
  }
 
  /**
   * Log the error and reconnect up to MAX_RETRIES times before
   * surfacing it to the registered error callback.
   */
  protected async handleError(error: Error): Promise<void> {
    console.error('Recognition error:', error);
    
    if (this.retryCount < this.MAX_RETRIES) {
      this.retryCount++;
      await this.reconnect();
    } else {
      this.errorCallback?.(error);
    }
  }
 
  /**
   * Tear down and restart the session with the current config; a
   * successful restart resets the retry budget.
   */
  protected async reconnect(): Promise<void> {
    try {
      // stopRecognition is synchronous (returns void) — no await.
      this.stopRecognition();
      await this.startRecognition(this.config);
      this.retryCount = 0;
    } catch (error) {
      // Await so the failure is not a floating promise.
      await this.handleError(error as Error);
    }
  }
}

Google Speech Recognition Implementation

Implement Google Cloud Speech-to-Text service:

import { SpeechClient } from '@google-cloud/speech';
 
/**
 * Google Cloud Speech-to-Text backend. Streams audio over a
 * streaming-recognize session and forwards interim and final
 * transcripts to the registered result callback.
 */
export class GoogleSpeechService extends BaseRecognitionService {
  private client: SpeechClient;
  // 'any' because the SDK's duplex-stream type is not imported here.
  // NOTE(review): replace with the SDK's stream type when available.
  private recognizeStream: any;
 
  constructor(config: Partial<IRecognitionConfig>) {
    super(config);
    this.client = new SpeechClient();
  }
 
  /**
   * Open a streaming-recognize session. Any config passed here is
   * merged over the service's current configuration.
   */
  async startRecognition(config?: IRecognitionConfig): Promise<void> {
    if (config) {
      this.config = { ...this.config, ...config };
    }
 
    const recognizeConfig = {
      encoding: this.config.encoding,
      sampleRateHertz: this.config.sampleRate,
      languageCode: this.config.languageCode,
      enableAutomaticPunctuation: true,
      model: 'latest_long',
      useEnhanced: true,
    };
 
    this.recognizeStream = this.client
      .streamingRecognize(recognizeConfig)
      .on('error', (error: Error) => this.handleError(error))
      .on('data', (data: any) => {
        // Guard every level: a data event may carry no results or no
        // alternatives, and confidence is typically only populated on
        // final results — an unguarded chain would throw here.
        const topResult = data?.results?.[0];
        const alternative = topResult?.alternatives?.[0];
        if (topResult && alternative) {
          this.resultCallback?.({
            text: alternative.transcript,
            isFinal: topResult.isFinal,
            confidence: alternative.confidence ?? 0
          });
        }
      });
 
    this.isProcessing = true;
  }
 
  /** End the stream and mark the session inactive. */
  stopRecognition(): void {
    if (this.recognizeStream) {
      this.recognizeStream.end();
      this.recognizeStream = null;
    }
    this.isProcessing = false;
  }
 
  /**
   * Write one chunk of audio to the stream, lazily starting a
   * session if none is active.
   */
  async processAudioData(data: Buffer): Promise<void> {
    if (!this.isProcessing || !this.recognizeStream) {
      await this.startRecognition();
    }
 
    try {
      this.recognizeStream.write(data);
    } catch (error) {
      await this.handleError(error as Error);
    }
  }
}

Baidu Speech Recognition Implementation

Implement Baidu Speech Recognition service:

import { BaiduAipSpeech } from 'baidu-aip-sdk';
 
/**
 * Baidu Speech Recognition backend. Incoming audio chunks are
 * buffered and flushed to Baidu's recognize API on a fixed interval.
 */
export class BaiduSpeechService extends BaseRecognitionService {
  private client: BaiduAipSpeech;
  private audioBuffer: Buffer[] = [];
  private processingInterval: NodeJS.Timeout | null = null;
  // Prevents overlapping recognize requests when one request takes
  // longer than the flush interval.
  private flushInFlight = false;
 
  /**
   * @param config Baidu credentials plus optional recognition settings.
   */
  constructor(config: {
    appId: string;
    apiKey: string;
    secretKey: string;
    recognitionConfig?: Partial<IRecognitionConfig>;
  }) {
    super(config.recognitionConfig);
    this.client = new BaiduAipSpeech(config.appId, config.apiKey, config.secretKey);
  }
 
  async startRecognition(config?: IRecognitionConfig): Promise<void> {
    if (config) {
      this.config = { ...this.config, ...config };
    }
 
    // Clear any existing timer first so repeated calls do not leak
    // a second interval that can never be stopped.
    if (this.processingInterval) {
      clearInterval(this.processingInterval);
    }
 
    this.processingInterval = setInterval(() => {
      // Fire-and-forget: failures are routed to handleError inside.
      void this.processBufferedAudio();
    }, 160); // Flush buffered audio every 160ms
 
    this.isProcessing = true;
  }
 
  stopRecognition(): void {
    if (this.processingInterval) {
      clearInterval(this.processingInterval);
      this.processingInterval = null;
    }
    this.audioBuffer = [];
    this.isProcessing = false;
  }
 
  async processAudioData(data: Buffer): Promise<void> {
    if (!this.isProcessing) {
      await this.startRecognition();
    }
 
    this.audioBuffer.push(data);
  }
 
  /** Concatenate the buffered chunks and send them to Baidu's API. */
  private async processBufferedAudio(): Promise<void> {
    if (this.flushInFlight || this.audioBuffer.length === 0) return;
    this.flushInFlight = true;
 
    const audioData = Buffer.concat(this.audioBuffer);
    this.audioBuffer = [];
 
    try {
      const result = await this.client.recognize(audioData.toString('base64'), 'pcm', this.config.sampleRate, {
        dev_pid: this.getDevPid(this.config.languageCode)
      });
 
      if (result.err_no === 0 && result.result) {
        this.resultCallback?.({
          text: result.result[0],
          isFinal: true, // each flush is reported as a final segment
          confidence: 1.0 // hard-coded; response carries no confidence — TODO confirm
        });
      } else {
        throw new Error(`Recognition failed: ${result.err_msg}`);
      }
    } catch (error) {
      await this.handleError(error as Error);
    } finally {
      this.flushInFlight = false;
    }
  }
 
  /** Map a language code to Baidu's dev_pid model identifier. */
  private getDevPid(languageCode: string): number {
    const pidMap: Record<string, number> = {
      'zh-CN': 1537, // Mandarin Chinese
      'en-US': 1737, // English
      'zh-CN-medical': 1537, // Medical domain (Mandarin)
    };
    return pidMap[languageCode] || 1537;
  }
}

Usage Example

Here's how to use these services:

// Create and configure the service
// Create and configure the service
const googleService = new GoogleSpeechService({
  languageCode: 'en-US',
  sampleRate: 16000
});
 
// Register handlers BEFORE starting recognition, so results and
// errors emitted immediately after startup are not silently dropped.
googleService.onRecognitionResult((result) => {
  console.log(`Recognized: ${result.text}`);
  console.log(`Confidence: ${result.confidence}`);
  console.log(`Is Final: ${result.isFinal}`);
});
 
googleService.onError((error) => {
  console.error('Recognition error:', error);
});
 
// Start recognition
await googleService.startRecognition();
 
// Process audio data
const audioData: Buffer = getAudioData(); // Your audio data source
await googleService.processAudioData(audioData);
 
// Stop recognition when done
googleService.stopRecognition();

Notes

  • Remember to handle API credentials securely
  • Implement proper error handling and retry logic
  • Monitor service performance and resource usage
  • Consider implementing rate limiting
  • Add logging for debugging and monitoring
  • Test with different audio formats and quality
  • Handle network interruptions gracefully