Created
March 16, 2025 04:42
-
-
Save yoi-hibino/0e2df78ca54bdf4baf5322dd20fd930d to your computer and use it in GitHub Desktop.
Sesame CSM-1B + OpenAI API — a voice chat assistant that generates replies with ChatGPT and speaks them with the CSM text-to-speech model
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from huggingface_hub import hf_hub_download | |
| from huggingface_hub import login | |
| from generator import load_csm_1b | |
| import torchaudio | |
| import sounddevice as sd | |
| import numpy as np | |
| from openai import OpenAI | |
| import os | |
# --- One-time service/model setup (runs at import time) ---

# Authenticate with the Hugging Face Hub. Read the token from the
# environment instead of hard-coding a credential in source; the original
# placeholder string is kept only as a fallback for backward compatibility.
login(os.getenv("HF_TOKEN", "__your_huggingface_access_token_here__"))

# OpenAI client used for the chat-completion calls below.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Download the Sesame CSM-1B checkpoint and build the speech generator.
# NOTE(review): assumes a CUDA-capable GPU is present — confirm before deploying.
model_path = hf_hub_download(repo_id="sesame/csm-1b", filename="ckpt.pt")
generator = load_csm_1b(model_path, "cuda")
class ConversationalAssistant:
    """Interactive voice assistant: ChatGPT writes replies, Sesame CSM speaks them.

    Relies on the module-level ``client`` (OpenAI) and ``generator`` (CSM)
    objects created at import time, and on ``sounddevice`` for playback.
    """

    def __init__(self):
        print("Initializing CSM Generator...")
        print("CSM Generator ready!")
        # Seed the dialogue with a system prompt that sets a casual, friendly tone.
        self.conversation_history = [
            {"role": "system", "content": "You are a friendly, warm, and casual chat buddy. Use a conversational tone, casual language, some slang, and occasional emojis. Keep responses brief and engaging like texting with a good friend. Show enthusiasm, ask follow-up questions sometimes, and share personal-feeling opinions. Use humor when appropriate and don't be too formal or technical unless specifically asked. no emoji"}
        ]
        self.sample_rate = 24000  # CSM default output sample rate (Hz)

    def add_to_history(self, role, content):
        """Append a message to the conversation history."""
        self.conversation_history.append({"role": role, "content": content})

    def generate_response(self, user_input):
        """Generate a text reply via the ChatGPT API.

        Returns the assistant's text, or a canned apology on API failure.
        On failure the unanswered user turn is removed from the history so
        a transient error does not skew later context.
        """
        self.add_to_history("user", user_input)
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",  # You can use gpt-4 if you have access
                messages=self.conversation_history,
                max_tokens=150,          # Adjust as needed
                temperature=0.7,         # Adjust for creativity vs. consistency
                presence_penalty=0.6,    # Encourage diverse topics
                frequency_penalty=0.2,   # Slightly discourage repetition
            )
        except Exception as e:
            print(f"Error in ChatGPT API call: {e}")
            # Roll back the user turn added above (see docstring).
            self.conversation_history.pop()
            return "I'm sorry, I couldn't generate a response at this time."
        assistant_response = response.choices[0].message.content
        self.add_to_history("assistant", assistant_response)
        return assistant_response

    def text_to_speech(self, text):
        """Synthesize ``text`` with CSM; return a numpy waveform, or None on error."""
        try:
            print(f"Generating audio for: '{text}'")
            audio = generator.generate(
                text=text,
                speaker=0,
                context=[],
                max_audio_length_ms=10_000,
            )
            # Move the tensor off the GPU before handing it to sounddevice.
            if hasattr(audio, 'device') and str(audio.device).startswith('cuda'):
                audio = audio.cpu()
            # Convert to a numpy array if it is still a tensor.
            if hasattr(audio, 'numpy'):
                audio = audio.numpy()
            return audio
        except Exception as e:
            print(f"Error in text-to-speech generation: {e}")
            return None

    def play_audio(self, audio):
        """Play the waveform through the default output device (blocking)."""
        if audio is not None:
            print("Playing audio...")
            sd.play(audio, self.sample_rate)
            sd.wait()  # block until playback finishes
            print("Audio playback complete")
        else:
            print("No audio to play")

    def chat(self):
        """Run an interactive chat session: read input, reply in text, then speak it."""
        print("\nWelcome to the conversational assistant!")
        print("Type 'exit', 'quit', or 'bye' to end the conversation.\n")
        while True:
            try:
                user_input = input("You: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Exit cleanly on Ctrl-D / Ctrl-C instead of a traceback.
                print("\nAssistant: Goodbye!")
                break
            # Skip blank lines rather than sending an empty prompt to the API.
            if not user_input:
                continue
            # Check for exit commands
            if user_input.lower() in ['exit', 'quit', 'bye']:
                print("Assistant: Goodbye!")
                break
            # Generate text response
            response_text = self.generate_response(user_input)
            print(f"Assistant: {response_text}")
            # Convert to speech and play
            audio = self.text_to_speech(response_text)
            self.play_audio(audio)
if __name__ == "__main__":
    # Make sure an OpenAI key is available before starting the chat loop.
    if not os.getenv("OPENAI_API_KEY"):
        print("Warning: OPENAI_API_KEY environment variable not set.")
        print("Set it with: export OPENAI_API_KEY='your-api-key-here'")
        # Strip surrounding whitespace — a pasted key often carries a
        # trailing newline/space that would break authentication.
        api_key = input("Enter your OpenAI API key to continue: ").strip()
        os.environ["OPENAI_API_KEY"] = api_key
        # Rebind the module-level client with the freshly supplied key.
        client = OpenAI(api_key=api_key)
    # Create and run the assistant
    assistant = ConversationalAssistant()
    assistant.chat()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment