Created
March 16, 2025 04:42
-
-
Save yoi-hibino/0e2df78ca54bdf4baf5322dd20fd930d to your computer and use it in GitHub Desktop.
Sesame CSM-1B + OpenAI API — a voice chat assistant that generates replies with ChatGPT and speaks them with the CSM text-to-speech model
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from huggingface_hub import hf_hub_download | |
| from huggingface_hub import login | |
| from generator import load_csm_1b | |
| import torchaudio | |
| import sounddevice as sd | |
| import numpy as np | |
| from openai import OpenAI | |
| import os | |
# --- One-time service/model setup (runs at import time) ---

# Authenticate with the Hugging Face Hub. Read the token from the
# environment instead of hard-coding a credential in source; the original
# placeholder string is kept only as a fallback for backward compatibility.
login(os.getenv("HF_TOKEN", "__your_huggingface_access_token_here__"))

# OpenAI client used for the chat-completion calls below.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Download the Sesame CSM-1B checkpoint and build the speech generator.
# NOTE(review): assumes a CUDA-capable GPU is present — confirm before deploying.
model_path = hf_hub_download(repo_id="sesame/csm-1b", filename="ckpt.pt")
generator = load_csm_1b(model_path, "cuda")
class ConversationalAssistant:
    """Interactive voice assistant: ChatGPT writes replies, Sesame CSM speaks them.

    Relies on the module-level ``client`` (OpenAI) and ``generator`` (CSM)
    objects created at import time, and on ``sounddevice`` for playback.
    """

    def __init__(self):
        print("Initializing CSM Generator...")
        print("CSM Generator ready!")
        # Seed the dialogue with a system prompt that sets a casual, friendly tone.
        self.conversation_history = [
            {"role": "system", "content": "You are a friendly, warm, and casual chat buddy. Use a conversational tone, casual language, some slang, and occasional emojis. Keep responses brief and engaging like texting with a good friend. Show enthusiasm, ask follow-up questions sometimes, and share personal-feeling opinions. Use humor when appropriate and don't be too formal or technical unless specifically asked. no emoji"}
        ]
        self.sample_rate = 24000  # CSM default output sample rate (Hz)

    def add_to_history(self, role, content):
        """Append a message to the conversation history."""
        self.conversation_history.append({"role": role, "content": content})

    def generate_response(self, user_input):
        """Generate a text reply via the ChatGPT API.

        Returns the assistant's text, or a canned apology on API failure.
        On failure the unanswered user turn is removed from the history so
        a transient error does not skew later context.
        """
        self.add_to_history("user", user_input)
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",  # You can use gpt-4 if you have access
                messages=self.conversation_history,
                max_tokens=150,          # Adjust as needed
                temperature=0.7,         # Adjust for creativity vs. consistency
                presence_penalty=0.6,    # Encourage diverse topics
                frequency_penalty=0.2,   # Slightly discourage repetition
            )
        except Exception as e:
            print(f"Error in ChatGPT API call: {e}")
            # Roll back the user turn added above (see docstring).
            self.conversation_history.pop()
            return "I'm sorry, I couldn't generate a response at this time."
        assistant_response = response.choices[0].message.content
        self.add_to_history("assistant", assistant_response)
        return assistant_response

    def text_to_speech(self, text):
        """Synthesize ``text`` with CSM; return a numpy waveform, or None on error."""
        try:
            print(f"Generating audio for: '{text}'")
            audio = generator.generate(
                text=text,
                speaker=0,
                context=[],
                max_audio_length_ms=10_000,
            )
            # Move the tensor off the GPU before handing it to sounddevice.
            if hasattr(audio, 'device') and str(audio.device).startswith('cuda'):
                audio = audio.cpu()
            # Convert to a numpy array if it is still a tensor.
            if hasattr(audio, 'numpy'):
                audio = audio.numpy()
            return audio
        except Exception as e:
            print(f"Error in text-to-speech generation: {e}")
            return None

    def play_audio(self, audio):
        """Play the waveform through the default output device (blocking)."""
        if audio is not None:
            print("Playing audio...")
            sd.play(audio, self.sample_rate)
            sd.wait()  # block until playback finishes
            print("Audio playback complete")
        else:
            print("No audio to play")

    def chat(self):
        """Run an interactive chat session: read input, reply in text, then speak it."""
        print("\nWelcome to the conversational assistant!")
        print("Type 'exit', 'quit', or 'bye' to end the conversation.\n")
        while True:
            try:
                user_input = input("You: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Exit cleanly on Ctrl-D / Ctrl-C instead of a traceback.
                print("\nAssistant: Goodbye!")
                break
            # Skip blank lines rather than sending an empty prompt to the API.
            if not user_input:
                continue
            # Check for exit commands
            if user_input.lower() in ['exit', 'quit', 'bye']:
                print("Assistant: Goodbye!")
                break
            # Generate text response
            response_text = self.generate_response(user_input)
            print(f"Assistant: {response_text}")
            # Convert to speech and play
            audio = self.text_to_speech(response_text)
            self.play_audio(audio)
if __name__ == "__main__":
    # Make sure an OpenAI key is available before starting the chat loop.
    if not os.getenv("OPENAI_API_KEY"):
        print("Warning: OPENAI_API_KEY environment variable not set.")
        print("Set it with: export OPENAI_API_KEY='your-api-key-here'")
        # Strip surrounding whitespace — a pasted key often carries a
        # trailing newline/space that would break authentication.
        api_key = input("Enter your OpenAI API key to continue: ").strip()
        os.environ["OPENAI_API_KEY"] = api_key
        # Rebind the module-level client with the freshly supplied key.
        client = OpenAI(api_key=api_key)
    # Create and run the assistant
    assistant = ConversationalAssistant()
    assistant.chat()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment