Teju
#!/bin/bash
setup () {
# Accept the Xcode license and point xcode-select at the full Xcode toolchain (Xcode itself must already be installed)
sudo xcodebuild -license
sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
mise use -g python@3.11
pip install -e .
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp
cmake -B build -DWHISPER_COREML=1
# TODO: figure out openmp
cmake --build build -j --config Release
make base.en
make small.en
./models/generate-coreml-model.sh base.en
}
setup_before_run () {
: an ollama server must be running and the target model pulled before launching the assistant
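# e.g. start the server with `ollama serve` (or the Ollama desktop app) and pull the
# target model with `ollama pull llama3.2` / `ollama pull phi4`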
}
setup_priya () {
: git requires the developer tools; download and install Xcode
: download whisper.cpp and build it
: install Homebrew
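# e.g. Homebrew can be installed with the official installer script:
# /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"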
}
phi () {
LOG_LEVEL=info python -m teju -ind 2 -l phi4 # 2>/tmp/tmp.log
}
llama32 () {
LOG_LEVEL=info python -m teju -ind 2 -l llama3.2 # 2>/tmp/tmp.log
}
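# dispatch: run the function named by the script's first positional argument, defaulting to "phi"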
${1-phi}
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "Teju"
version = "0.1.0"
description = "Teju Assistant"
authors = [
{name = "Your Name", email = "your.email@example.com"},
]
dependencies = [
"ane_transformers",
"coremltools",
"ollama",
"openai-whisper",
"pyobjc>=7.0.1",
"pywhispercpp @ git+https://github.com/absadiki/pywhispercpp.git",
"scikit-learn==1.5.1",
"sounddevice",
"torch==2.5.0",
"webrtcvad",
]
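Note: the launcher functions above run `python -m teju`, so the assistant module below presumably lives as `teju.py` (or as `teju/__main__.py` inside a `teju` package) next to this `pyproject.toml`; the gist does not name the file.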
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A simple example showcasing the use of `pywhispercpp` as an assistant.
The idea is to use a `VAD` to detect speech (in this example we used webrtcvad), and when speech is detected
we run the inference.
"""
import os
import argparse
import importlib.metadata
import queue
import time
from typing import Callable
import numpy as np
import sounddevice as sd
import pywhispercpp.constants as constants
import webrtcvad
import logging
import json
from ollama import chat
from ollama import ChatResponse
from pywhispercpp.model import Model
__version__ = importlib.metadata.version('pywhispercpp')
__header__ = f"""
=====================================
PyWhisperCpp
A simple assistant using Whisper.cpp
Version: {__version__}
=====================================
"""
SYSTEM_PROMPT = '''Your goal is to assist little children who cannot speak by predicting up to four likely responses to questions.
# Behaviors and Rules:
1) Question Analysis:
a) Carefully analyze the question asked to the child.
b) Identify the key information being sought by the questioner.
c) Consider the child's likely emotional state and needs based on the context of the question.
2) Response Generation:
a) Generate up to four likely responses that address the question.
b) Ensure the responses are concise, and written from the child's point of view.
c) Prioritize responses that address the child's basic needs (e.g., hunger, pain, comfort).
d) Use simple sentences and vocabulary.
3) Response Presentation:
a) Present the generated responses in a JSON list format.
b) Each response should be a string within the list.
c) The list should contain no more than four responses.
d) Do not include any additional text or explanations in the response.
'''
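# Illustrative (hypothetical) exchange given the prompt above: for the question
# "Do you want something to eat?", the model is expected to reply with a bare JSON
# list such as ["Yes, I am hungry", "No, thank you", "I want water", "My tummy hurts"],
# which OptionDisplay below parses and prints as a numbered menu.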
class Assistant:
"""
Assistant class
Example usage
```python
from pywhispercpp.examples.assistant import Assistant
my_assistant = Assistant(commands_callback=print, n_threads=8)
my_assistant.start()
```
"""
def __init__(self, *,
model='tiny',
input_device: int = None,
silence_threshold: int = 8,
q_threshold: int = 16,
block_duration: int = 30,
commands_callback: Callable[[str], None] = None,
**model_params):
"""
:param model: whisper.cpp model name or a direct path to a `ggml` model
:param input_device: The input device (aka microphone), keep it None to use the default
:param silence_threshold: The number of consecutive silent blocks after which inference is run
:param q_threshold: Inference won't run until the data queue holds at least `q_threshold` elements
:param block_duration: duration of each audio block in ms
:param commands_callback: The callback to run when a command is received
:param model_params: any other parameter to pass to the whisper.cpp model, see ::: pywhispercpp.constants.PARAMS_SCHEMA
"""
self.input_device = input_device
self.sample_rate = constants.WHISPER_SAMPLE_RATE # same as whisper.cpp
self.channels = 1 # same as whisper.cpp
self.block_duration = block_duration
self.block_size = int(self.sample_rate * self.block_duration / 1000)
self.q = queue.Queue()
self.vad = webrtcvad.Vad()
self.silence_threshold = silence_threshold
self.q_threshold = q_threshold
self._silence_counter = 0
self.pwccp_model = Model(model,
print_realtime=False,
print_progress=False,
print_timestamps=False,
single_segment=True,
no_context=True,
**model_params)
self.commands_callback = commands_callback
def _audio_callback(self, indata, frames, time, status):
"""
This is called (from a separate thread) for each audio block.
"""
if status:
logging.warning(f"underlying audio stack warning: {status}")
assert frames == self.block_size
audio_data = map(lambda x: (x + 1) / 2, indata) # normalize from [-1,+1] to [0,1]
audio_data = np.fromiter(audio_data, np.float16)
audio_data = audio_data.tobytes()
detection = self.vad.is_speech(audio_data, self.sample_rate)
if detection:
self.q.put(indata.copy())
self._silence_counter = 0
else:
if self._silence_counter >= self.silence_threshold:
if self.q.qsize() > self.q_threshold:
self._transcribe_speech()
self._silence_counter = 0
else:
self._silence_counter += 1
def _transcribe_speech(self):
logging.info(f"Speech detected ...")
audio_data = np.array([])
while self.q.qsize() > 0:
# get all the data from the q
audio_data = np.append(audio_data, self.q.get())
# Appending zeros to the audio data as a workaround for small audio packets (small commands)
audio_data = np.concatenate([audio_data, np.zeros((int(self.sample_rate) + 10))])
# running the inference
self.pwccp_model.transcribe(audio_data,
new_segment_callback=self._new_segment_callback)
def _new_segment_callback(self, seg):
if self.commands_callback:
self.commands_callback(seg.text)
def start(self) -> None:
"""
Use this function to start the assistant
:return: None
"""
logging.info(f"Starting Assistant ...")
with sd.InputStream(
device=self.input_device, # the default input device
channels=self.channels,
samplerate=constants.WHISPER_SAMPLE_RATE,
blocksize=self.block_size,
callback=self._audio_callback):
try:
logging.info(f"Assistant is listening ... (CTRL+C to stop)")
while True:
time.sleep(0.1)
except KeyboardInterrupt:
logging.info("Assistant stopped")
@staticmethod
def available_devices():
qd = sd.query_devices()
return qd
class OptionGenerator:
def __init__(self, *, model, callback):
self.model = model
self.callback = callback
def __call__(self, x: str) -> None:
response: ChatResponse = chat(model=self.model, messages=[
{'role': 'system', 'content': SYSTEM_PROMPT},
{'role': 'user', 'content': x},
])
y = response.message.content
self.callback(x, y)
class OptionDisplay:
def reset(self):
os.system('clear')
def __call__(self, x: str, y: str):
self.reset()
print(x)
try:
y = json.loads(y.strip().strip('`').removeprefix('json').strip())
y.append('None of the above')
for i, e in enumerate(y, 1):
print(f'{i}. {e}')
except json.JSONDecodeError:
print(y)
def _main():
parser = argparse.ArgumentParser(description="", allow_abbrev=True)
parser.add_argument('-m', '--model', default='tiny.en', type=str, help="Whisper.cpp model, defaults to %(default)s")
parser.add_argument('-l', '--llm_model', default='llama3.2', type=str, help="Ollama model to use (e.g. llama3.2, phi4, llama3.3), defaults to %(default)s")
parser.add_argument('-ind', '--input_device', type=int, default=3,
help=(f'Id of the input device (aka microphone)\n'
f'available devices {Assistant.available_devices()}').replace('\n', ':::'))
parser.add_argument('-st', '--silence_threshold', default=16, type=int,
help="The number of consecutive silent blocks after which inference is run, defaults to %(default)s")
parser.add_argument('-bd', '--block_duration', default=30, type=int,
help="duration of each audio block in ms, defaults to %(default)s")
args = parser.parse_args()
display_callback = OptionDisplay()
assistant_callback = OptionGenerator(model=args.llm_model, callback=display_callback)
my_assistant = Assistant(model=args.model,
input_device=args.input_device,
silence_threshold=args.silence_threshold,
block_duration=args.block_duration,
commands_callback=assistant_callback)
my_assistant.start()
if __name__ == '__main__':
    _main()