Skip to content

Instantly share code, notes, and snippets.

@guillermoscript
Created January 10, 2024 23:57
Show Gist options
  • Select an option

  • Save guillermoscript/c0682063097a7431e719f45337afa72a to your computer and use it in GitHub Desktop.

Select an option

Save guillermoscript/c0682063097a7431e719f45337afa72a to your computer and use it in GitHub Desktop.
How to transcribe audio files larger than 25 MB with Whisper AI
import logging
import mimetypes
import os

import openai
from dotenv import load_dotenv
from flask import Flask, jsonify, request
from pydub import AudioSegment
load_dotenv()
openai.api_key = os.getenv('OPEN_AI_KEY')
app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s')
@app.route('/split-audio', methods=['POST'])
def split_audio():
    """Split an uploaded audio file into <24MB chunks and transcribe each with Whisper.

    Expects a multipart POST with an ``audio`` file field. Each chunk is
    exported to /tmp, sent to the OpenAI Whisper API, and deleted. The
    running transcript is fed back as the ``prompt`` so Whisper keeps
    context (names, spelling) across chunk boundaries.

    Returns:
        (jsonify({'full_text': ...}), 200) on success, or a 400 JSON
        response {'error': ...} on any failure.
    """
    try:
        audio_file = request.files['audio']
        # Map the uploaded MIME type to a container format, e.g.
        # 'audio/mpeg' -> '.mp3' -> 'mp3'. guess_extension returns None for
        # unknown types, so fail fast instead of raising TypeError on [1:].
        guessed = mimetypes.guess_extension(audio_file.content_type or '')
        if not guessed:
            raise ValueError('Unsupported or missing audio content type')
        file_extension = guessed[1:]
        # BUG FIX: the original called guess_extension() a second time on the
        # extension string itself, which always yields None; pydub wants the
        # bare format name ('mp3', 'wav', ...).
        audio = AudioSegment.from_file(audio_file, format=file_extension)

        segment_length = 10 * 60             # seconds per chunk; start at 10 minutes
        max_segment_size = 24 * 1024 * 1024  # stay under the Whisper API's 25MB limit
        prompt = ""
        full_text = ""
        segment_filename = os.path.splitext(audio_file.filename)[0]
        duration_ms = audio.duration_seconds * 1000

        logging.info('Splitting audio into segments')
        i = 0
        segment_start = 0
        segment_end = segment_length * 1000  # pydub slices are in milliseconds
        while segment_start < duration_ms:
            logging.info('Segment %d processing', i + 1)
            # Slicing past the end is safe: pydub clamps to the audio length.
            sound_export = audio[segment_start:segment_end]
            exported_file = '/tmp/' + segment_filename + '-' + str(i + 1) + '.' + file_extension
            sound_export.export(exported_file, format=file_extension)
            # If the exported chunk still exceeds the size cap (e.g. a high
            # bitrate source), halve the segment length until it fits.
            while os.path.getsize(exported_file) > max_segment_size:
                segment_length /= 2
                segment_end = segment_start + segment_length * 1000
                sound_export = audio[segment_start:segment_end]
                sound_export.export(exported_file, format=file_extension)
            logging.info('Segment %d exported', i + 1)
            try:
                logging.info('Segment %d transcribing', i + 1)
                with open(exported_file, "rb") as f:
                    data = openai.Audio.transcribe("whisper-1", f, prompt=prompt)
            finally:
                # BUG FIX: the original leaked every exported chunk in /tmp.
                os.remove(exported_file)
            prompt += data.text
            full_text += data.text
            logging.info('Segment %d transcribed', i + 1)
            # Advance to the next chunk (segment_length may have shrunk above).
            segment_start = segment_end
            segment_end += segment_length * 1000
            i += 1
        logging.info('Audio transcribed')
        return jsonify({'full_text': full_text}), 200
    except Exception as e:
        # Request boundary: report any failure as a 400 with the message.
        logging.error('Error occurred: %s', e)
        response = jsonify({'error': str(e)})
        response.status_code = 400
        return response
@guillermoscript
Copy link
Author

Check my blog post for a more in-depth explanation of the code. If you have any comments, please let me know!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment