Speech-to-text classes

stt.stt_deepgram.Ear_deepgram

Ear_deepgram(silence_seconds=2, api_key='', listener=None, logger=None)

Bases: BaseEar

Methods:

Name               Description
transcribe_stream  Streams queued audio to Deepgram's realtime websocket API and puts transcripts on a queue.

Attributes:

Name     Type  Description
api_key  str   Deepgram API key used to authorize the websocket connection.
Source code in openvoicechat/stt/stt_deepgram.py
def __init__(self, silence_seconds=2, api_key="", listener=None, logger=None):
    # stream=True selects BaseEar's streaming path, which drives transcribe_stream
    super().__init__(silence_seconds, stream=True, listener=listener, logger=logger)
    self.api_key = api_key

api_key instance-attribute

api_key = api_key

transcribe_stream

transcribe_stream(audio_queue, transcription_queue)

Reads raw audio chunks from audio_queue, relays them to Deepgram over a websocket, and puts each non-empty transcript on transcription_queue. A None on audio_queue closes the stream; a None on transcription_queue signals that transcription has ended.
Source code in openvoicechat/stt/stt_deepgram.py
def transcribe_stream(self, audio_queue, transcription_queue):
    # Deepgram authorizes the websocket via a token header
    extra_headers = {"Authorization": "token " + self.api_key}

    async def f():
        # the query string must match the audio pushed on audio_queue:
        # 16 kHz, mono, 16-bit linear PCM, transcribed with the nova-2 model
        async with websockets.connect(
            "wss://api.deepgram.com/v1/listen?encoding=linear16&sample_rate=16000"
            "&channels=1&model=nova-2",
            extra_headers=extra_headers,
        ) as ws:

            async def sender(ws):  # sends audio to the websocket
                try:
                    while True:
                        data = audio_queue.get()
                        if data is None:  # sentinel: ask Deepgram to close the stream
                            await ws.send(json.dumps({"type": "CloseStream"}))
                            break
                        await ws.send(data)
                except Exception as e:
                    print("Error while sending: ", str(e))
                    raise

            async def receiver(ws):  # turns Deepgram responses into transcripts
                async for msg in ws:
                    msg = json.loads(msg)
                    if "channel" not in msg:
                        # non-result message (e.g. closing metadata): end of stream
                        transcription_queue.put(None)
                        break
                    transcript = msg["channel"]["alternatives"][0]["transcript"]

                    if transcript:  # skip empty results
                        transcription_queue.put(transcript)

            await asyncio.gather(sender(ws), receiver(ws))

    asyncio.run(f())
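
A minimal usage sketch for the queue protocol above, driving transcribe_stream by hand in a background thread with standard queue.Queue objects. The file name is hypothetical, and in normal use BaseEar wires these queues itself:

import os
import wave
from queue import Queue
from threading import Thread

from openvoicechat.stt.stt_deepgram import Ear_deepgram

ear = Ear_deepgram(api_key=os.environ["DEEPGRAM_API_KEY"])
audio_queue, transcription_queue = Queue(), Queue()

# transcribe_stream blocks (it calls asyncio.run), so run it in a worker thread
Thread(
    target=ear.transcribe_stream,
    args=(audio_queue, transcription_queue),
    daemon=True,
).start()

# hypothetical clip; must match the websocket URL: 16 kHz, mono, linear16 PCM
with wave.open("clip_16k_mono.wav", "rb") as w:
    audio_queue.put(w.readframes(w.getnframes()))
audio_queue.put(None)  # sentinel: sender forwards CloseStream to Deepgram

while (text := transcription_queue.get()) is not None:
    print(text)  # receiver puts None once the stream has ended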

stt.stt_hf.Ear_hf

Ear_hf(model_id='openai/whisper-base.en', device='cpu', silence_seconds=2, generate_kwargs=None, listener=None, listen_interruptions=True, logger=None)

Bases: BaseEar

Methods:

Name        Description
transcribe  Runs an audio clip through the ASR pipeline and returns the stripped transcript.

Attributes:

Name             Type  Description
pipe                   Hugging Face automatic-speech-recognition pipeline.
device                 Device the pipeline runs on (e.g. 'cpu' or 'cuda').
generate_kwargs        Keyword arguments forwarded to the pipeline call in transcribe.
Source code in openvoicechat/stt/stt_hf.py
def __init__(
    self,
    model_id="openai/whisper-base.en",
    device="cpu",
    silence_seconds=2,
    generate_kwargs=None,
    listener=None,
    listen_interruptions=True,
    logger=None,
):
    super().__init__(
        silence_seconds,
        listener=listener,
        listen_interruptions=listen_interruptions,
        logger=logger,
    )
    # imported lazily so transformers is only required when Ear_hf is used
    from transformers import pipeline

    self.pipe = pipeline(
        "automatic-speech-recognition", model=model_id, device=device
    )
    self.device = device
    self.generate_kwargs = generate_kwargs

pipe instance-attribute

pipe = pipeline('automatic-speech-recognition', model=model_id, device=device)

device instance-attribute

device = device

generate_kwargs instance-attribute

generate_kwargs = generate_kwargs
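
generate_kwargs is forwarded unchanged to the pipeline call in transcribe below. A hedged construction example, assuming a Whisper-style model that accepts standard generation options such as max_new_tokens:

from openvoicechat.stt.stt_hf import Ear_hf

ear = Ear_hf(
    model_id="openai/whisper-base.en",
    device="cpu",
    generate_kwargs={"max_new_tokens": 128},  # illustrative cap on decoded tokens
)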

transcribe

transcribe(audio)

Runs the audio through the ASR pipeline under torch.no_grad() and returns the transcript with surrounding whitespace stripped.
Source code in openvoicechat/stt/stt_hf.py
def transcribe(self, audio):
    # imported lazily, mirroring the lazy transformers import in __init__
    from torch import no_grad

    with no_grad():  # inference only: skip gradient tracking
        transcription = self.pipe(audio, generate_kwargs=self.generate_kwargs)
    return transcription["text"].strip()
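
A short sketch of calling transcribe directly, reusing the Ear_hf instance constructed above and assuming (as the Hugging Face ASR pipeline expects) that audio is a mono float32 NumPy array at the model's sampling rate; a dict of the form {'sampling_rate': 16000, 'raw': audio} is also accepted by the pipeline:

import numpy as np

# one second of silence at 16 kHz, used as a placeholder signal
audio = np.zeros(16000, dtype=np.float32)
print(ear.transcribe(audio))  # transcript text, likely empty for silence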