Python OCR and MP3 files
Updated on 17 Aug 2024.
Change the voice speed and the male/female voice as required.
import gc
gc.collect()
!pip install google-cloud-texttospeech pydub
import json
from google.cloud import texttospeech
from pydub import AudioSegment
import os
# Paths to the files
credentials_path = '/content/credentials.json'
text_file_path = '/content/gita_chapter2.txt'
output_file_path = '/content/gita_chapter2_audio.mp3'

# Verify the file existence, reporting on each required input file.
missing_paths = []
for label, path in (('Credentials', credentials_path), ('Text', text_file_path)):
    if os.path.exists(path):
        print(f"{label} file found.")
    else:
        print(f"{label} file not found.")
        missing_paths.append(path)

# Both files are required below, so stop right away with one clear error
# instead of printing "not found" and crashing a few statements later.
if missing_paths:
    raise FileNotFoundError(
        f"Required input file(s) missing: {', '.join(missing_paths)}"
    )

# Load your Google Cloud project credentials.
# Parsing the key file here surfaces malformed JSON early; the client below
# is built from the same path and reads the file itself.
with open(credentials_path, encoding='utf-8') as f:
    credentials = json.load(f)

# Configure the Text-to-Speech client
client = texttospeech.TextToSpeechClient.from_service_account_json(credentials_path)

# Load Telugu text from the file (surrounding whitespace is stripped)
with open(text_file_path, 'r', encoding='utf-8') as file:
    text = file.read().strip()
# Function to split text into chunks based on byte size
def split_text(text, max_bytes=5000):
    """Split text into consecutive chunks of at most max_bytes UTF-8 bytes.

    Characters are never cut in the middle of their UTF-8 encoding: the byte
    size of each whole character is counted, and a new chunk is started as
    soon as the next character would push the current chunk over the limit.

    Args:
        text: The string to split.
        max_bytes: Upper bound on the encoded size of each chunk. The default
            presumably mirrors the per-request input limit of the
            Text-to-Speech API -- confirm against the API quota docs.

    Returns:
        A list of non-empty strings whose concatenation equals ``text``.
        Empty input yields an empty list. A single character that is larger
        than ``max_bytes`` cannot be split, so it is returned as its own
        (oversized) chunk.
    """
    chunks = []
    current_chars = []
    current_bytes = 0
    for char in text:
        char_bytes = len(char.encode('utf-8'))
        # Only flush when there is something to flush; otherwise a character
        # wider than max_bytes would emit an empty chunk first.
        if current_chars and current_bytes + char_bytes > max_bytes:
            chunks.append("".join(current_chars))
            current_chars = []
            current_bytes = 0
        # Collect characters in a list and join once per chunk rather than
        # growing a string one character at a time.
        current_chars.append(char)
        current_bytes += char_bytes
    if current_chars:
        chunks.append("".join(current_chars))
    return chunks
# Configure voice and speed
# Voice list: https://cloud.google.com/text-to-speech/docs/voices
# `name` and `ssml_gender` must describe the same voice. When switching,
# change both together:
#   female: name='te-IN-Standard-A', ssml_gender=SsmlVoiceGender.FEMALE
#   male:   name='te-IN-Standard-B', ssml_gender=SsmlVoiceGender.MALE
voice = texttospeech.VoiceSelectionParams(
    language_code='te-IN',  # Telugu (India)
    name='te-IN-Standard-A',
    # Matches the female voice named above (previously MALE, which
    # contradicted the selected voice name).
    ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
)
audio_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
    # 1.0 is the normal (default) speed; 1.25 is slightly faster.
    # Other values to try: 0.25, 0.5, 0.75, 1.0, 1.5, 1.75, 2.
    speaking_rate=1.25,
)
# Function to synthesize speech for each chunk
def synthesize_speech(text_chunk):
    """Send one text chunk to the Text-to-Speech client.

    Uses the module-level ``client``, ``voice`` and ``audio_config``.

    Args:
        text_chunk: Plain text to synthesize.

    Returns:
        The ``audio_content`` field of the synthesis response.
    """
    request_kwargs = {
        'input': texttospeech.SynthesisInput(text=text_chunk),
        'voice': voice,
        'audio_config': audio_config,
    }
    return client.synthesize_speech(**request_kwargs).audio_content
# Function to save audio content to a temporary file
def save_temp_audio(audio_content, temp_file_path):
    """Write audio_content to temp_file_path in binary mode.

    An existing file at that path is overwritten.

    Args:
        audio_content: Bytes-like audio data to store.
        temp_file_path: Destination path of the file to create.
    """
    with open(temp_file_path, mode='wb') as sink:
        sink.write(audio_content)
# Process text and combine audio files
text_chunks = split_text(text)
temp_files = []
try:
    # Synthesize each chunk into its own temporary MP3 file.
    for i, chunk in enumerate(text_chunks):
        temp_file_path = f'/content/temp_chunk_{i}.mp3'
        # Record the path before writing so that a partially written file is
        # still removed by the cleanup below.
        temp_files.append(temp_file_path)
        audio_content = synthesize_speech(chunk)
        save_temp_audio(audio_content, temp_file_path)

    # Combine audio files in chunk order
    combined = AudioSegment.empty()
    for temp_file in temp_files:
        audio_segment = AudioSegment.from_mp3(temp_file)
        combined += audio_segment

    # Export the combined audio to the final output file.
    # export() hands back the open output file object; close it explicitly
    # instead of leaving the handle to the garbage collector.
    combined.export(output_file_path, format='mp3').close()
    print(f'Audio content written to file "{output_file_path}"')
finally:
    # Clean up temporary files, even when synthesis, decoding or export failed.
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            os.remove(temp_file)
-----------------------
OCR Examples
Basic OCR with Tesseract:
- Problem: Extract text from an image.
- Solution: Use the `pytesseract` library to convert the image to text.
from PIL import Image
import pytesseract

# Open the picture, hand it to Tesseract, and show the recognized text.
picture = Image.open('example.png')
recognized_text = pytesseract.image_to_string(picture)
print(recognized_text)
OCR with Image Preprocessing:
- Problem: Improve OCR accuracy by preprocessing the image.
- Solution: Convert the image to grayscale and apply thresholding.
import cv2
import pytesseract

# Read the image and reduce it to a single grayscale channel.
source = cv2.imread('example.png')
grayscale = cv2.cvtColor(source, cv2.COLOR_BGR2GRAY)
# Fixed global threshold at 150 with a maximum value of 255; threshold()
# returns (used_threshold, image), and only the image is needed.
binary = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY)[1]
print(pytesseract.image_to_string(binary))
OCR with Different Languages:
- Problem: Extract text in a specific language from an image.
- Solution: Specify the language parameter in `pytesseract`.
from PIL import Image
import pytesseract

# lang='spa' asks Tesseract to recognize the text as Spanish.
spanish_text = pytesseract.image_to_string(Image.open('example.png'), lang='spa')
print(spanish_text)
OCR on Multiple Images:
- Problem: Perform OCR on a set of images in a directory.
- Solution: Loop through images and extract text from each.
from PIL import Image
import pytesseract
import os

# OCR every .png file in the 'images' directory and print each file's text.
# sorted() gives a stable processing order; os.listdir() returns entries in
# arbitrary order.
for filename in sorted(os.listdir('images')):
    if not filename.endswith('.png'):
        continue
    # The context manager closes each image file before the next one is
    # opened, so handles do not pile up on large directories.
    with Image.open(os.path.join('images', filename)) as image:
        text = pytesseract.image_to_string(image)
    print(f"{filename}:\n{text}\n")
Bounding Boxes for Detected Text:
- Problem: Get bounding boxes for text detected in an image.
- Solution: Use `pytesseract` to extract bounding box coordinates.
import cv2
import pytesseract

image = cv2.imread('example.png')
image_height = image.shape[0]

# Each output line describes one box; fields 1-4 are used as the two corner
# coordinates (x1, y1, x2, y2) of the rectangle.
for line in pytesseract.image_to_boxes(image).splitlines():
    fields = line.split(' ')
    x1, y1, x2, y2 = (int(fields[k]) for k in (1, 2, 3, 4))
    # y values are subtracted from the image height before drawing
    # (presumably because the boxes use a bottom-left origin -- confirm).
    cv2.rectangle(
        image,
        (x1, image_height - y1),
        (x2, image_height - y2),
        (0, 255, 0),
        2,
    )

# Show the annotated image and wait for a key press.
cv2.imshow('Image', image)
cv2.waitKey(0)
Detecting Text Orientation and Script:
- Problem: Detect the orientation and script of the text in an image.
- Solution: Use `pytesseract` to obtain orientation and script information.
from PIL import Image
import pytesseract

# Print the orientation-and-script-detection (OSD) report for the image.
print(pytesseract.image_to_osd(Image.open('example.png')))
OCR on PDF Files:
- Problem: Extract text from a PDF document.
- Solution: Convert PDF to images and then apply OCR.
from pdf2image import convert_from_path
import pytesseract

# Convert the PDF into page images, then OCR and print each page in turn.
for page_image in convert_from_path('example.pdf'):
    print(pytesseract.image_to_string(page_image))
OCR on Scanned Documents:
- Problem: Perform OCR on scanned documents.
- Solution: Use `pytesseract` to extract text from scanned images.
from PIL import Image
import pytesseract

# Load the scanned page and print the text Tesseract recognizes in it.
scanned_page = Image.open('scanned_doc.png')
page_text = pytesseract.image_to_string(scanned_page)
print(page_text)
OCR with OpenCV Preprocessing:
- Problem: Improve OCR results using OpenCV preprocessing.
- Solution: Apply blurring and adaptive thresholding.
import cv2
import pytesseract

# Grayscale copy of the input image.
grayscale = cv2.cvtColor(cv2.imread('example.png'), cv2.COLOR_BGR2GRAY)
# Median blur with aperture size 3.
denoised = cv2.medianBlur(grayscale, 3)
# Gaussian adaptive threshold: max value 255, block size 11, constant 2.
binary = cv2.adaptiveThreshold(
    denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
)
print(pytesseract.image_to_string(binary))
OCR with Rotated Images:
- Problem: Extract text from rotated images.
- Solution: Rotate the image to correct orientation before applying OCR.
import cv2
import pytesseract

# Rotate the image 90 degrees clockwise, then run OCR on the result.
upright = cv2.rotate(cv2.imread('rotated_image.png'), cv2.ROTATE_90_CLOCKWISE)
print(pytesseract.image_to_string(upright))
Sound File Examples
Play Sound with Pygame:
- Problem: Play an audio file in your Python application.
- Solution: Use the `pygame` library to load and play the sound.
import time

import pygame

pygame.mixer.init()
pygame.mixer.music.load('example.mp3')
pygame.mixer.music.play()

# Keep the script alive while the music is still playing. Sleep between
# polls so the wait loop does not spin a CPU core at 100%.
while pygame.mixer.music.get_busy():
    time.sleep(0.1)
Convert Text to Speech with gTTS:
- Problem: Convert a given text to speech.
- Solution: Use `gTTS` to convert text to speech and save it as an audio file.
from gtts import gTTS

# Build the English speech request and save the result as hello.mp3.
gTTS(text='Hello, world!', lang='en').save('hello.mp3')
Speech Recognition from Audio File:
- Problem: Convert speech from an audio file to text.
- Solution: Use `SpeechRecognition` to transcribe the audio file.
import speech_recognition as sr

recognizer = sr.Recognizer()

# Read the audio from the WAV file into memory.
with sr.AudioFile('example.wav') as wav_source:
    recording = recognizer.record(wav_source)

# Transcribe the recording with recognize_google and print the text.
print(recognizer.recognize_google(recording))
Recording Audio with PyAudio:
- Problem: Record audio using the microphone.
- Solution: Use `PyAudio` to capture audio from the microphone and save it to a file.
import pyaudio
import wave

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 5
OUTPUT_FILENAME = 'output.wav'

audio = pyaudio.PyAudio()
try:
    # Query the sample width while the PyAudio instance is still alive; it is
    # needed for the WAV header after the instance has been terminated.
    sample_width = audio.get_sample_size(FORMAT)
    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                        input=True, frames_per_buffer=CHUNK)
    try:
        # Read RATE / CHUNK buffers per second for RECORD_SECONDS seconds.
        frames = [
            stream.read(CHUNK)
            for _ in range(int(RATE / CHUNK * RECORD_SECONDS))
        ]
    finally:
        # Release the input stream even if a read fails.
        stream.stop_stream()
        stream.close()
finally:
    audio.terminate()

# The context manager closes the WAV file even if writing fails.
with wave.open(OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
Change Audio Speed with PyDub:
- Problem: Increase or decrease the speed of an audio file.
- Solution: Use `pydub` to manipulate the playback speed of the audio.
from pydub import AudioSegment

# Load the clip, speed it up by a factor of 1.5, and export it as WAV.
original_clip = AudioSegment.from_file('example.wav')
original_clip.speedup(playback_speed=1.5).export('faster_example.wav', format='wav')
Merge Two Audio Files with PyDub:
- Problem: Merge two audio files into one.
- Solution: Use `pydub` to concatenate the audio files.
from pydub import AudioSegment

# Load both clips in order, join them with "+", and export the result as WAV.
clips = [AudioSegment.from_file(name) for name in ('example1.wav', 'example2.wav')]
(clips[0] + clips[1]).export('combined_example.wav', format='wav')
Extract Audio from Video with MoviePy:
- Problem: Extract the audio track from a video file.
- Solution: Use `moviepy` to extract and save the audio from the video.
from moviepy.editor import VideoFileClip

# Use the clip as a context manager so it is closed when the block exits,
# including when writing the audio fails.
with VideoFileClip('example.mp4') as video:
    audio = video.audio
    # Guard against a missing audio track so the failure is explicit rather
    # than an AttributeError on None.
    if audio is None:
        raise ValueError("'example.mp4' has no audio track to extract")
    audio.write_audiofile('extracted_audio.wav')