Teju
#!/bin/bash
setup () {
# Accept the Xcode license and point xcode-select at the full Xcode toolchain (Xcode itself must already be installed)
sudo xcodebuild -license
sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
mise use -g python@3.11
pip install -e .
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp
cmake -B build -DWHISPER_COREML=1
# TODO: figure out openmp
cmake --build build -j --config Release
make base.en
make small.en
./models/generate-coreml-model.sh base.en
}
setup_before_run () {
: an ollama server must be running and the target model pulled before launching the assistant
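# e.g. start the server with `ollama serve` (or the Ollama desktop app) and pull the
# target model with `ollama pull llama3.2` / `ollama pull phi4`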
}
setup_priya () {
: git requires the developer tools; download and install Xcode
: download whisper.cpp and build it
: install Homebrew
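# e.g. Homebrew can be installed with the official installer script:
# /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"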
}
phi () {
LOG_LEVEL=info python -m teju -ind 2 -l phi4 # 2>/tmp/tmp.log
}
llama32 () {
LOG_LEVEL=info python -m teju -ind 2 -l llama3.2 # 2>/tmp/tmp.log
}
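# dispatch: run the function named by the script's first positional argument, defaulting to "phi"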
${1-phi}
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "Teju"
version = "0.1.0"
description = "Teju Assistant"
authors = [
{name = "Your Name", email = "your.email@example.com"},
]
dependencies = [
"ane_transformers",
"coremltools",
"ollama",
"openai-whisper",
"pyobjc>=7.0.1",
"pywhispercpp @ git+https://github.com/absadiki/pywhispercpp.git",
"scikit-learn==1.5.1",
"sounddevice",
"torch==2.5.0",
"webrtcvad",
]
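Note: the launcher functions above run `python -m teju`, so the assistant module below presumably lives as `teju.py` (or as `teju/__main__.py` inside a `teju` package) next to this `pyproject.toml`; the gist does not name the file.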
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A simple example showcasing the use of `pywhispercpp` as an assistant.
The idea is to use a `VAD` to detect speech (in this example we used webrtcvad), and when speech is detected
we run the inference.
"""
import os
import argparse
import importlib.metadata
import queue
import time
from typing import Callable
import numpy as np
import sounddevice as sd
import pywhispercpp.constants as constants
import webrtcvad
import logging
import json
from ollama import chat
from ollama import ChatResponse
from pywhispercpp.model import Model
__version__ = importlib.metadata.version('pywhispercpp')
__header__ = f"""
=====================================
PyWhisperCpp
A simple assistant using Whisper.cpp
Version: {__version__}
=====================================
"""
SYSTEM_PROMPT = '''Your goal is to assist little children who cannot speak by predicting up to four likely responses to questions.
# Behaviors and Rules:
1) Question Analysis:
a) Carefully analyze the question asked to the child.
b) Identify the key information being sought by the questioner.
c) Consider the child's likely emotional state and needs based on the context of the question.
2) Response Generation:
a) Generate up to four likely responses that address the question.
b) Ensure the responses are concise, and written from the child's point of view.
c) Prioritize responses that address the child's basic needs (e.g., hunger, pain, comfort).
d) Use simple sentences and vocabulary.
3) Response Presentation:
a) Present the generated responses in a JSON list format.
b) Each response should be a string within the list.
c) The list should contain no more than four responses.
d) Do not include any additional text or explanations in the response.
'''
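# Illustrative (hypothetical) exchange given the prompt above: for the question
# "Do you want something to eat?", the model is expected to reply with a bare JSON
# list such as ["Yes, I am hungry", "No, thank you", "I want water", "My tummy hurts"],
# which OptionDisplay below parses and prints as a numbered menu.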
class Assistant:
"""
Assistant class
Example usage
```python
from pywhispercpp.examples.assistant import Assistant
my_assistant = Assistant(commands_callback=print, n_threads=8)
my_assistant.start()
```
"""
def __init__(self, *,
model='tiny',
input_device: int = None,
silence_threshold: int = 8,
q_threshold: int = 16,
block_duration: int = 30,
commands_callback: Callable[[str], None] = None,
**model_params):
"""
:param model: whisper.cpp model name or a direct path to a `ggml` model
:param input_device: The input device (aka microphone), keep it None to use the default
:param silence_threshold: The number of consecutive silent blocks after which inference is run
:param q_threshold: Inference won't run until the data queue holds at least `q_threshold` elements
:param block_duration: duration of each audio block in ms
:param commands_callback: The callback to run when a command is received
:param model_params: any other parameter to pass to the whisper.cpp model, see ::: pywhispercpp.constants.PARAMS_SCHEMA
"""
self.input_device = input_device
self.sample_rate = constants.WHISPER_SAMPLE_RATE # same as whisper.cpp
self.channels = 1 # same as whisper.cpp
self.block_duration = block_duration
self.block_size = int(self.sample_rate * self.block_duration / 1000)
self.q = queue.Queue()
self.vad = webrtcvad.Vad()
self.silence_threshold = silence_threshold
self.q_threshold = q_threshold
self._silence_counter = 0
self.pwccp_model = Model(model,
print_realtime=False,
print_progress=False,
print_timestamps=False,
single_segment=True,
no_context=True,
**model_params)
self.commands_callback = commands_callback
def _audio_callback(self, indata, frames, time, status):
"""
This is called (from a separate thread) for each audio block.
"""
if status:
logging.warning(f"underlying audio stack warning: {status}")
assert frames == self.block_size
audio_data = map(lambda x: (x + 1) / 2, indata) # normalize from [-1,+1] to [0,1]
audio_data = np.fromiter(audio_data, np.float16)
audio_data = audio_data.tobytes()
detection = self.vad.is_speech(audio_data, self.sample_rate)
if detection:
self.q.put(indata.copy())
self._silence_counter = 0
else:
if self._silence_counter >= self.silence_threshold:
if self.q.qsize() > self.q_threshold:
self._transcribe_speech()
self._silence_counter = 0
else:
self._silence_counter += 1
def _transcribe_speech(self):
logging.info(f"Speech detected ...")
audio_data = np.array([])
while self.q.qsize() > 0:
# get all the data from the q
audio_data = np.append(audio_data, self.q.get())
# Appending zeros to the audio data as a workaround for small audio packets (small commands)
audio_data = np.concatenate([audio_data, np.zeros((int(self.sample_rate) + 10))])
# running the inference
self.pwccp_model.transcribe(audio_data,
new_segment_callback=self._new_segment_callback)
def _new_segment_callback(self, seg):
if self.commands_callback:
self.commands_callback(seg.text)
def start(self) -> None:
"""
Use this function to start the assistant
:return: None
"""
logging.info(f"Starting Assistant ...")
with sd.InputStream(
device=self.input_device, # the default input device
channels=self.channels,
samplerate=constants.WHISPER_SAMPLE_RATE,
blocksize=self.block_size,
callback=self._audio_callback):
try:
logging.info(f"Assistant is listening ... (CTRL+C to stop)")
while True:
time.sleep(0.1)
except KeyboardInterrupt:
logging.info("Assistant stopped")
@staticmethod
def available_devices():
qd = sd.query_devices()
return qd
class OptionGenerator:
def __init__(self, *, model, callback):
self.model = model
self.callback = callback
def __call__(self, x: str) -> None:
response: ChatResponse = chat(model=self.model, messages=[
{'role': 'system', 'content': SYSTEM_PROMPT},
{'role': 'user', 'content': x},
])
y = response.message.content
self.callback(x, y)
class OptionDisplay:
def reset(self):
os.system('clear')
def __call__(self, x: str, y: str):
self.reset()
print(x)
try:
y = json.loads(y.strip().strip('`').removeprefix('json').strip())
y.append('None of the above')
for i, e in enumerate(y, 1):
print(f'{i}. {e}')
except json.JSONDecodeError:
print(y)
def _main():
parser = argparse.ArgumentParser(description="", allow_abbrev=True)
parser.add_argument('-m', '--model', default='tiny.en', type=str, help="Whisper.cpp model, defaults to %(default)s")
parser.add_argument('-l', '--llm_model', default='llama3.2', type=str, help="Ollama model to use (e.g. llama3.2, phi4, llama3.3), defaults to %(default)s")
parser.add_argument('-ind', '--input_device', type=int, default=3,
help=(f'Id of the input device (aka microphone)\n'
f'available devices {Assistant.available_devices()}').replace('\n', ':::'))
parser.add_argument('-st', '--silence_threshold', default=16, type=int,
help="The number of consecutive silent blocks after which inference is run, defaults to %(default)s")
parser.add_argument('-bd', '--block_duration', default=30, type=int,
help="duration of each audio block in ms, defaults to %(default)s")
args = parser.parse_args()
display_callback = OptionDisplay()
assistant_callback = OptionGenerator(model=args.llm_model, callback=display_callback)
my_assistant = Assistant(model=args.model,
input_device=args.input_device,
silence_threshold=args.silence_threshold,
block_duration=args.block_duration,
commands_callback=assistant_callback)
my_assistant.start()
if __name__ == '__main__':
    _main()