Implementing a Scalable Speech Recognition Service
Service Interface
First, define the core interfaces:
interface RecognitionResult {
  text: string;
  isFinal: boolean;
  confidence: number;
}

interface IRecognitionConfig {
  languageCode: string;
  sampleRate: number;
  encoding: 'LINEAR16' | 'FLAC' | 'MULAW';
}

interface IRecognitionService {
  startRecognition(config: IRecognitionConfig): Promise<void>;
  stopRecognition(): void;
  onRecognitionResult(callback: (result: RecognitionResult) => void): void;
  onError(callback: (error: Error) => void): void;
  processAudioData(data: Buffer): Promise<void>;
}
Base Recognition Service
Create a base class for common functionality:
abstract class BaseRecognitionService implements IRecognitionService {
  protected config: IRecognitionConfig;
  protected resultCallback?: (result: RecognitionResult) => void;
  protected errorCallback?: (error: Error) => void;
  protected retryCount = 0;
  protected readonly MAX_RETRIES = 3;
  protected isProcessing = false;

  constructor(config: Partial<IRecognitionConfig> = {}) {
    // Fill in sensible defaults; callers only override what differs
    this.config = {
      languageCode: 'en-US',
      sampleRate: 16000,
      encoding: 'LINEAR16',
      ...config
    };
  }

  abstract startRecognition(config?: IRecognitionConfig): Promise<void>;
  abstract stopRecognition(): void;
  abstract processAudioData(data: Buffer): Promise<void>;

  onRecognitionResult(callback: (result: RecognitionResult) => void): void {
    this.resultCallback = callback;
  }

  onError(callback: (error: Error) => void): void {
    this.errorCallback = callback;
  }

  protected async handleError(error: Error): Promise<void> {
    console.error('Recognition error:', error);
    if (this.retryCount < this.MAX_RETRIES) {
      this.retryCount++;
      await this.reconnect();
    } else {
      // Out of retries: surface the error to the caller
      this.errorCallback?.(error);
    }
  }

  protected async reconnect(): Promise<void> {
    try {
      this.stopRecognition();
      await this.startRecognition(this.config);
      this.retryCount = 0;
    } catch (error) {
      await this.handleError(error as Error);
    }
  }
}
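The reconnect path above retries immediately and gives up after MAX_RETRIES attempts. If you prefer to space the retries out, a minimal sketch of a drop-in replacement for the reconnect method using exponential backoff (the delay values are illustrative, not part of the services below):

// Illustrative variant of reconnect: wait roughly 1 s, 2 s, 4 s between attempts
protected async reconnect(): Promise<void> {
  const delayMs = 1000 * 2 ** Math.max(this.retryCount - 1, 0);
  await new Promise((resolve) => setTimeout(resolve, delayMs));
  try {
    this.stopRecognition();
    await this.startRecognition(this.config);
    this.retryCount = 0;
  } catch (error) {
    await this.handleError(error as Error);
  }
}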
Google Speech Recognition Implementation
Implement the Google Cloud Speech-to-Text service:
import { SpeechClient } from '@google-cloud/speech';

export class GoogleSpeechService extends BaseRecognitionService {
  private client: SpeechClient;
  private recognizeStream: any = null;

  constructor(config: Partial<IRecognitionConfig> = {}) {
    super(config);
    this.client = new SpeechClient();
  }

  async startRecognition(config?: IRecognitionConfig): Promise<void> {
    if (config) {
      this.config = { ...this.config, ...config };
    }

    const recognizeConfig = {
      encoding: this.config.encoding,
      sampleRateHertz: this.config.sampleRate,
      languageCode: this.config.languageCode,
      enableAutomaticPunctuation: true,
      model: 'latest_long',
      useEnhanced: true,
    };

    // streamingRecognize takes a StreamingRecognitionConfig that wraps the
    // recognition config; interimResults delivers partial transcripts too
    this.recognizeStream = this.client
      .streamingRecognize({ config: recognizeConfig, interimResults: true })
      .on('error', (error: Error) => this.handleError(error))
      .on('data', (data: any) => {
        const result = data.results?.[0];
        const alternative = result?.alternatives?.[0];
        if (result && alternative) {
          this.resultCallback?.({
            text: alternative.transcript,
            isFinal: result.isFinal,
            confidence: alternative.confidence
          });
        }
      });

    this.isProcessing = true;
  }

  stopRecognition(): void {
    if (this.recognizeStream) {
      this.recognizeStream.end();
      this.recognizeStream = null;
    }
    this.isProcessing = false;
  }

  async processAudioData(data: Buffer): Promise<void> {
    // Lazily (re)open the stream if it is not running yet
    if (!this.isProcessing || !this.recognizeStream) {
      await this.startRecognition();
    }
    try {
      this.recognizeStream.write(data);
    } catch (error) {
      await this.handleError(error as Error);
    }
  }
}
Baidu Speech Recognition Implementation
Implement the Baidu speech recognition service:
import { BaiduAipSpeech } from 'baidu-aip-sdk';

export class BaiduSpeechService extends BaseRecognitionService {
  private client: BaiduAipSpeech;
  private audioBuffer: Buffer[] = [];
  private processingInterval: NodeJS.Timeout | null = null;

  constructor(config: {
    appId: string;
    apiKey: string;
    secretKey: string;
    recognitionConfig?: Partial<IRecognitionConfig>;
  }) {
    super(config.recognitionConfig);
    this.client = new BaiduAipSpeech(config.appId, config.apiKey, config.secretKey);
  }

  async startRecognition(config?: IRecognitionConfig): Promise<void> {
    if (config) {
      this.config = { ...this.config, ...config };
    }

    // The Baidu API is request/response rather than streaming, so batch the
    // incoming audio and flush it at a fixed interval
    this.processingInterval = setInterval(() => {
      this.processBufferedAudio();
    }, 160); // Process every 160 ms

    this.isProcessing = true;
  }

  stopRecognition(): void {
    if (this.processingInterval) {
      clearInterval(this.processingInterval);
      this.processingInterval = null;
    }
    this.audioBuffer = [];
    this.isProcessing = false;
  }

  async processAudioData(data: Buffer): Promise<void> {
    if (!this.isProcessing) {
      await this.startRecognition();
    }
    this.audioBuffer.push(data);
  }

  private async processBufferedAudio(): Promise<void> {
    if (this.audioBuffer.length === 0) return;

    const audioData = Buffer.concat(this.audioBuffer);
    this.audioBuffer = [];

    try {
      const result = await this.client.recognize(audioData.toString('base64'), 'pcm', this.config.sampleRate, {
        dev_pid: this.getDevPid(this.config.languageCode)
      });

      if (result.err_no === 0 && result.result) {
        this.resultCallback?.({
          text: result.result[0],
          isFinal: true,
          confidence: 1.0 // no per-result confidence is available here
        });
      } else {
        throw new Error(`Recognition failed: ${result.err_msg}`);
      }
    } catch (error) {
      await this.handleError(error as Error);
    }
  }

  private getDevPid(languageCode: string): number {
    // Map language codes to Baidu dev_pid values
    const pidMap: Record<string, number> = {
      'zh-CN': 1537, // Mandarin Chinese
      'en-US': 1737, // English
      'zh-CN-medical': 1537, // Medical domain
    };
    return pidMap[languageCode] || 1537;
  }
}
Usage Example
Here's how to use these services:
// Create and configure the service
const googleService = new GoogleSpeechService({
  languageCode: 'en-US',
  sampleRate: 16000
});

// Register result and error handlers before starting
googleService.onRecognitionResult((result) => {
  console.log(`Recognized: ${result.text}`);
  console.log(`Confidence: ${result.confidence}`);
  console.log(`Is Final: ${result.isFinal}`);
});

googleService.onError((error) => {
  console.error('Recognition error:', error);
});

// Start recognition
await googleService.startRecognition();

// Process audio data
const audioData: Buffer = getAudioData(); // Your audio data source
await googleService.processAudioData(audioData);

// Stop recognition when done
googleService.stopRecognition();
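The Baidu service follows the same flow; only construction differs, since it needs the Baidu credentials. A minimal sketch, assuming the credentials come from environment variables and audio is read as raw PCM chunks from a file (the variable names, file path, and chunk size are illustrative):

import { createReadStream } from 'fs';

// Credentials are assumed to live in environment variables
const baiduService = new BaiduSpeechService({
  appId: process.env.BAIDU_APP_ID!,
  apiKey: process.env.BAIDU_API_KEY!,
  secretKey: process.env.BAIDU_SECRET_KEY!,
  recognitionConfig: { languageCode: 'zh-CN', sampleRate: 16000 }
});

baiduService.onRecognitionResult((result) => console.log(result.text));
baiduService.onError((error) => console.error(error));

await baiduService.startRecognition();

// Feed raw PCM audio in small chunks; the service buffers and flushes every 160 ms
const audioStream = createReadStream('./sample.pcm', { highWaterMark: 3200 });
for await (const chunk of audioStream) {
  await baiduService.processAudioData(chunk as Buffer);
}

baiduService.stopRecognition();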
Notes
- Remember to handle API credentials securely
- Implement proper error handling and retry logic
- Monitor service performance and resource usage
- Consider implementing rate limiting (a minimal sketch follows below)
- Add logging for debugging and monitoring
- Test with different audio formats and quality
- Handle network interruptions gracefully
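As one way to act on the rate-limiting note, a minimal sketch of a wrapper that spaces out calls to processAudioData; the class name and interval are illustrative and not part of the services above:

// Illustrative throttle: delays writes so at most one reaches the
// underlying service every minIntervalMs milliseconds
class ThrottledAudioSender {
  private nextSlot = 0;

  constructor(
    private inner: IRecognitionService,
    private minIntervalMs = 100
  ) {}

  async processAudioData(data: Buffer): Promise<void> {
    const now = Date.now();
    // Reserve the next free slot so concurrent callers stay spaced apart
    const sendAt = Math.max(now, this.nextSlot);
    this.nextSlot = sendAt + this.minIntervalMs;
    if (sendAt > now) {
      await new Promise((resolve) => setTimeout(resolve, sendAt - now));
    }
    await this.inner.processAudioData(data);
  }
}

// Usage: wrap any IRecognitionService implementation
const throttled = new ThrottledAudioSender(googleService, 100);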