Teju
#!/bin/bash
setup () {
    # Xcode must already be installed: accept the license and select its toolchain
    sudo xcodebuild -license
    sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
    mise use -g python@3.11
    pip install -e .
    git clone https://github.com/ggerganov/whisper.cpp.git
    cd whisper.cpp
    cmake -B build -DWHISPER_COREML=1
    # TODO: figure out openmp
    cmake --build build -j --config Release
    make base.en
    make small.en
    ./models/generate-coreml-model.sh base.en
}
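# Note (from the whisper.cpp docs, to the best of my recollection, so treat as an assumption):
# generate-coreml-model.sh emits models/ggml-base.en-encoder.mlmodelc, which whisper.cpp loads
# automatically at runtime when built with -DWHISPER_COREML=1; the first run compiles the
# Core ML model and is noticeably slower.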
setup_before_run () {
    : ollama needs to be running before starting the assistant
}
setup_priya () {
    : git requires developer tools, so download and install Xcode
    : download whisper.cpp and build it
    : install Homebrew
}
phi () {
    LOG_LEVEL=info python -m teju -ind 2 -l phi4  # 2>/tmp/tmp.log
}
llama32 () {
    LOG_LEVEL=info python -m teju -ind 2 -l llama3.2  # 2>/tmp/tmp.log
}
${1-phi}
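# Usage sketch (the script filename is not given in the gist; "teju.sh" is only an assumption):
#   bash teju.sh setup     # one-time: build whisper.cpp with Core ML and install the package
#   bash teju.sh           # run the assistant with the default phi4 backend
#   bash teju.sh llama32   # run the assistant with llama3.2
#
# The TOML below is the project's pyproject.toml, referenced by the `pip install -e .` step above.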
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "Teju"
version = "0.1.0"
description = "Teju Assistant"
authors = [
    {name = "Your Name", email = "your.email@example.com"},
]
dependencies = [
    "ane_transformers",
    "coremltools",
    "ollama",
    "openai-whisper",
    "pyobjc>=7.0.1",
    "pywhispercpp @ git+https://github.com/absadiki/pywhispercpp.git",
    "scikit-learn==1.5.1",
    "sounddevice",
    "torch==2.5.0",
    "webrtcvad",
]
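# Note: pywhispercpp is pulled straight from GitHub rather than PyPI; everything above is
# resolved by the `pip install -e .` step in the setup function.
#
# The Python module below is the assistant itself. It is invoked as `python -m teju`, so it
# presumably lives at teju/__main__.py (the exact path is an assumption, not shown in the gist).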
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A simple example showcasing the use of `pywhispercpp` as an assistant.
The idea is to use a VAD to detect speech (webrtcvad in this example); once speech is detected,
we run the inference.
"""
import os
import argparse
import importlib.metadata
import queue
import time
from typing import Callable

import numpy as np
import sounddevice as sd
import pywhispercpp.constants as constants
import webrtcvad
import logging
import json
from ollama import chat
from ollama import ChatResponse
from pywhispercpp.model import Model

__version__ = importlib.metadata.version('pywhispercpp')

__header__ = f"""
=====================================
PyWhisperCpp
A simple assistant using Whisper.cpp
Version: {__version__}
=====================================
"""
SYSTEM_PROMPT = '''Your goal is to assist little children who cannot speak by predicting up to four likely responses to questions.
# Behaviors and Rules:
1) Question Analysis:
a) Carefully analyze the question asked to the child.
b) Identify the key information being sought by the questioner.
c) Consider the child's likely emotional state and needs based on the context of the question.
2) Response Generation:
a) Generate up to four likely responses that address the question.
b) Ensure the responses are concise and written from the child's point of view.
c) Prioritize responses that address the child's basic needs (e.g., hunger, pain, comfort).
d) Use simple sentences and vocabulary.
3) Response Presentation:
a) Present the generated responses in a JSON list format.
b) Each response should be a string within the list.
c) The list should contain no more than four responses.
d) Do not include any additional text or explanations in the response.
'''
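# Illustrative example (mine, not from the gist) of the JSON list the system prompt asks the
# LLM to produce for a question like "Are you hungry?":
#   ["Yes, I am hungry", "No, not right now", "I want some water", "My tummy hurts"]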
class Assistant:
    """
    Assistant class

    Example usage:
    ```python
    from pywhispercpp.examples.assistant import Assistant
    my_assistant = Assistant(commands_callback=print, n_threads=8)
    my_assistant.start()
    ```
    """
    def __init__(self, *,
                 model='tiny',
                 input_device: int = None,
                 silence_threshold: int = 8,
                 q_threshold: int = 16,
                 block_duration: int = 30,
                 commands_callback: Callable[[str], None] = None,
                 **model_params):
        """
        :param model: whisper.cpp model name or a direct path to a `ggml` model
        :param input_device: the input device (aka microphone); keep it None to use the default
        :param silence_threshold: the number of consecutive silent blocks after which inference runs
        :param q_threshold: inference won't run until the data queue holds at least `q_threshold` elements
        :param block_duration: duration of a single audio block in ms
        :param commands_callback: the callback to run when a command (transcribed segment) is received
        :param model_params: any other parameter to pass to the whisper.cpp model, see ::: pywhispercpp.constants.PARAMS_SCHEMA
        """
        self.input_device = input_device
        self.sample_rate = constants.WHISPER_SAMPLE_RATE  # same as whisper.cpp
        self.channels = 1  # same as whisper.cpp
        self.block_duration = block_duration
        self.block_size = int(self.sample_rate * self.block_duration / 1000)
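        # With whisper.cpp's 16 kHz sample rate and the default 30 ms block duration, this works
        # out to 480 samples per block, which matches one of the frame sizes webrtcvad supports
        # (it accepts 10/20/30 ms frames).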
        self.q = queue.Queue()
        self.vad = webrtcvad.Vad()
        self.silence_threshold = silence_threshold
        self.q_threshold = q_threshold
        self._silence_counter = 0
        self.pwccp_model = Model(model,
                                 print_realtime=False,
                                 print_progress=False,
                                 print_timestamps=False,
                                 single_segment=True,
                                 no_context=True,
                                 **model_params)
        self.commands_callback = commands_callback
    def _audio_callback(self, indata, frames, time, status):
        """
        This is called (from a separate thread) for each audio block.
        """
        if status:
            logging.warning(f"underlying audio stack warning: {status}")
        assert frames == self.block_size
        audio_data = map(lambda x: (x + 1) / 2, indata)  # normalize from [-1, +1] to [0, 1]
        audio_data = np.fromiter(audio_data, np.float16)
        audio_data = audio_data.tobytes()
        detection = self.vad.is_speech(audio_data, self.sample_rate)
        if detection:
            self.q.put(indata.copy())
            self._silence_counter = 0
        else:
            if self._silence_counter >= self.silence_threshold:
                if self.q.qsize() > self.q_threshold:
                    self._transcribe_speech()
                self._silence_counter = 0
            else:
                self._silence_counter += 1
    def _transcribe_speech(self):
        logging.info("Speech detected ...")
        audio_data = np.array([])
        while self.q.qsize() > 0:
            # get all the data from the q
            audio_data = np.append(audio_data, self.q.get())
        # Appending zeros to the audio data as a workaround for small audio packets (small commands)
        audio_data = np.concatenate([audio_data, np.zeros((int(self.sample_rate) + 10))])
        # running the inference
        self.pwccp_model.transcribe(audio_data,
                                    new_segment_callback=self._new_segment_callback)

    def _new_segment_callback(self, seg):
        if self.commands_callback:
            self.commands_callback(seg.text)
    def start(self) -> None:
        """
        Use this function to start the assistant
        :return: None
        """
        logging.info("Starting Assistant ...")
        with sd.InputStream(
                device=self.input_device,  # the default input device if None
                channels=self.channels,
                samplerate=constants.WHISPER_SAMPLE_RATE,
                blocksize=self.block_size,
                callback=self._audio_callback):
            try:
                logging.info("Assistant is listening ... (CTRL+C to stop)")
                while True:
                    time.sleep(0.1)
            except KeyboardInterrupt:
                logging.info("Assistant stopped")

    @staticmethod
    def available_devices():
        qd = sd.query_devices()
        return qd
class OptionGenerator:
    def __init__(self, *, model, callback):
        self.model = model
        self.callback = callback

    def __call__(self, x: str) -> None:
        response: ChatResponse = chat(model=self.model, messages=[
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x},
        ])
        y = response.message.content
        self.callback(x, y)
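# A minimal usage sketch (not part of the gist): the transcribed question goes in, and the raw
# LLM reply is handed to the callback together with the question, e.g.
#   OptionGenerator(model='llama3.2', callback=print)("Do you want to go outside?")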
class OptionDisplay:
    def reset(self):
        os.system('clear')

    def __call__(self, x: str, y: str):
        self.reset()
        print(x)
        try:
            y = json.loads(y.strip().strip('`').removeprefix('json').strip())
            y.append('None of the above')
            for i, e in enumerate(y, 1):
                print(f'{i}. {e}')
        except json.JSONDecodeError:
            print(y)
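# Illustrative example (mine) of a raw reply the stripping above is meant to handle, where the
# model wraps its answer in a Markdown fence:
#   ```json
#   ["Yes please", "No thank you", "I am tired", "My tummy hurts"]
#   ```
# After removing the backticks and the leading "json" tag, json.loads yields a list of strings,
# and "None of the above" is appended as a final option.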
def _main():
    parser = argparse.ArgumentParser(description="", allow_abbrev=True)
    parser.add_argument('-m', '--model', default='tiny.en', type=str,
                        help="Whisper.cpp model, defaults to %(default)s")
    parser.add_argument('-l', '--llm_model', default='llama3.2', type=str,
                        help="ollama model to use (e.g. llama3.2, phi4, llama3.3), defaults to %(default)s")
    parser.add_argument('-ind', '--input_device', type=int, default=3,
                        help=(f'Id of the input device (aka microphone)\n'
                              f'available devices {Assistant.available_devices()}').replace('\n', ':::'))
    parser.add_argument('-st', '--silence_threshold', default=16, type=int,
                        help="number of consecutive silent blocks after which inference runs, defaults to %(default)s")
    parser.add_argument('-bd', '--block_duration', default=30, type=int,
                        help="duration of a single audio block in ms, defaults to %(default)s")
    args = parser.parse_args()

    display_callback = OptionDisplay()
    assistant_callback = OptionGenerator(model=args.llm_model, callback=display_callback)
    my_assistant = Assistant(model=args.model,
                             input_device=args.input_device,
                             silence_threshold=args.silence_threshold,
                             block_duration=args.block_duration,
                             commands_callback=assistant_callback)
    my_assistant.start()


if __name__ == '__main__':
    _main()