How to transcribe audio files bigger than 25MB with Whisper AI
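The Whisper API rejects uploads larger than 25MB, so the Flask endpoint below splits the uploaded audio into chunks under that limit with pydub, transcribes each chunk with whisper-1 (feeding the text so far back in as the prompt so context carries across chunks), and returns the stitched-together transcript. It assumes flask, openai (the pre-1.0 SDK, which exposes openai.Audio.transcribe), pydub, and python-dotenv are installed, that an OPEN_AI_KEY variable is available in the environment or a .env file, and that ffmpeg is on the PATH for pydub.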
import logging
import mimetypes
import os

import openai
from dotenv import load_dotenv
from flask import Flask, jsonify, request
from pydub import AudioSegment

load_dotenv()
openai.api_key = os.getenv('OPEN_AI_KEY')

app = Flask(__name__)
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s: %(message)s')


@app.route('/split-audio', methods=['POST'])
def split_audio():
    try:
        audio_file = request.files['audio']
        # Derive the file extension (e.g. "mp3") from the upload's MIME type and
        # use it as the pydub import/export format.
        file_extension = mimetypes.guess_extension(audio_file.content_type)[1:]
        audio = AudioSegment.from_file(audio_file, format=file_extension)

        segment_length = 10 * 60  # Start with 10-minute segments (in seconds)
        max_segment_size = 24 * 1024 * 1024  # Whisper's limit is 25MB, so stay under 24MB
        prompt = ""
        full_text = ""
        segment_filename = os.path.splitext(audio_file.filename)[0]
        duration = audio.duration_seconds

        number_of_segments = max(1, int(duration / segment_length))
        logging.info('Splitting audio into roughly %d segment(s)', number_of_segments)

        # Split the audio into segments and transcribe each one in turn
        i = 0
        segment_start = 0
        segment_end = segment_length * 1000  # pydub slices are indexed in milliseconds
        while segment_start < duration * 1000:
            logging.info('Segment %d processing', i + 1)
            sound_export = audio[segment_start:segment_end]
            exported_file = '/tmp/' + segment_filename + '-' + str(i + 1) + '.' + file_extension
            sound_export.export(exported_file, format=file_extension)

            # If the exported segment is still larger than 24MB, keep halving the
            # segment length and re-exporting until it fits.
            while os.path.getsize(exported_file) > max_segment_size:
                segment_length //= 2
                segment_end = segment_start + segment_length * 1000
                sound_export = audio[segment_start:segment_end]
                sound_export.export(exported_file, format=file_extension)
            logging.info('Segment %d exported', i + 1)

            # Transcribe the segment, passing the text so far as the prompt so
            # Whisper keeps spelling and context consistent across segments.
            logging.info('Segment %d transcribing', i + 1)
            with open(exported_file, "rb") as f:
                data = openai.Audio.transcribe("whisper-1", f, prompt=prompt)  # openai<1.0 SDK
            prompt += data.text
            full_text += data.text
            logging.info('Segment %d transcribed', i + 1)

            # Advance the window for the next iteration
            segment_start = segment_end
            segment_end += segment_length * 1000
            i += 1

        logging.info('Audio transcribed')
        return jsonify({'full_text': full_text}), 200
    except Exception as e:
        logging.error('Error occurred: %s', e)
        response = jsonify({'error': str(e)})
        response.status_code = 400  # Client error
        return response


if __name__ == '__main__':
    app.run(debug=True)
Check my blog post for a more in-depth explanation of the code. If you have any comments, please let me know!