Python OCR and MP3 files

- August 02, 2024

Updated on 17 Aug 2024.

Change the voice speed and the male/female voice as required.

import gc
gc.collect()
!pip install google-cloud-texttospeech pydub

import json
from google.cloud import texttospeech
from pydub import AudioSegment
import os

# Input and output locations used by the rest of the script
credentials_path = '/content/credentials.json'
text_file_path = '/content/gita_chapter2.txt'
output_file_path = '/content/gita_chapter2_audio.mp3'

# Report whether each input file is present. This is informational only:
# the script carries on either way and the open() calls below will raise
# if a file is really missing.
print(
    "Credentials file found."
    if os.path.exists(credentials_path)
    else "Credentials file not found."
)
print(
    "Text file found."
    if os.path.exists(text_file_path)
    else "Text file not found."
)

# Parse the credentials file as JSON and keep the result in `credentials`
with open(credentials_path) as f:
    credentials = json.load(f)

# Build the Text-to-Speech client directly from the same credentials file
client = texttospeech.TextToSpeechClient.from_service_account_json(
    credentials_path
)

# Read the whole input text as UTF-8 and trim surrounding whitespace
with open(text_file_path, 'r', encoding='utf-8') as file:
    text = file.read().strip()

# Function to split text into chunks based on byte size
def split_text(text, max_bytes=5000):
    """Split text into consecutive chunks of at most max_bytes UTF-8 bytes.

    Characters are never split, so every chunk is valid text and joining
    the chunks reproduces ``text`` exactly.

    Args:
        text: The string to split.
        max_bytes: Upper bound on the UTF-8 encoded size of each chunk.

    Returns:
        A list of non-empty strings. Empty input gives an empty list. If a
        single character is larger than ``max_bytes`` it is emitted as its
        own (oversized) chunk rather than being dropped.
    """
    chunks = []
    current_chars = []  # characters of the chunk being built
    current_chunk_bytes = 0

    for char in text:
        char_bytes = len(char.encode('utf-8'))
        # Only flush when there is something to flush; otherwise an
        # oversized first character would produce an empty leading chunk.
        if current_chars and current_chunk_bytes + char_bytes > max_bytes:
            chunks.append("".join(current_chars))
            current_chars = []
            current_chunk_bytes = 0
        current_chars.append(char)
        current_chunk_bytes += char_bytes

    if current_chars:
        chunks.append("".join(current_chars))

    return chunks

# Configure voice and speed
# Voice list: https://cloud.google.com/text-to-speech/docs/voices
#   te-IN-Standard-A -> female voice
#   te-IN-Standard-B -> male voice
# Keep `name` and `ssml_gender` consistent when switching voices.
voice = texttospeech.VoiceSelectionParams(
    language_code='te-IN',  # Telugu (India)
    name='te-IN-Standard-A',
    ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
)
audio_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
    # 1.0 is normal speed; typical values: 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0
    speaking_rate=1.25
)

# Function to synthesize speech for each chunk
def synthesize_speech(text_chunk):
    """Synthesize one chunk of text and return the response's audio content.

    Uses the module-level ``client``, ``voice`` and ``audio_config``.
    """
    return client.synthesize_speech(
        input=texttospeech.SynthesisInput(text=text_chunk),
        voice=voice,
        audio_config=audio_config,
    ).audio_content

# Function to save audio content to a temporary file
def save_temp_audio(audio_content, temp_file_path):
    """Write ``audio_content`` (bytes) to ``temp_file_path`` in binary mode.

    Any existing file at that path is overwritten.
    """
    audio_file = open(temp_file_path, 'wb')
    with audio_file:
        audio_file.write(audio_content)
# Process text and combine audio files
text_chunks = split_text(text)
temp_files = []

try:
    # Synthesize each chunk into its own temporary MP3 file
    for i, chunk in enumerate(text_chunks):
        temp_file_path = f'/content/temp_chunk_{i}.mp3'
        # Register the path before writing so a partially written file is
        # still removed by the cleanup below.
        temp_files.append(temp_file_path)
        audio_content = synthesize_speech(chunk)
        save_temp_audio(audio_content, temp_file_path)

    # Combine audio files in chunk order
    combined = AudioSegment.empty()
    for temp_file in temp_files:
        combined += AudioSegment.from_mp3(temp_file)

    # Export the combined audio to the final output file
    combined.export(output_file_path, format='mp3')

    print(f'Audio content written to file "{output_file_path}"')
finally:
    # Clean up temporary files even if synthesis, decoding or export failed
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            os.remove(temp_file)

-----------------------

OCR Examples

Basic OCR with Tesseract:

Problem: Extract text from an image.
Solution: Use the pytesseract library to convert the image to text.

python
from PIL import Image
import pytesseract

# Open the image in a context manager so its file handle is closed
# as soon as OCR has finished.
with Image.open('example.png') as image:
    text = pytesseract.image_to_string(image)
print(text)

OCR with Image Preprocessing:

Problem: Improve OCR accuracy by preprocessing the image.
Solution: Convert the image to grayscale and apply thresholding.

python
import cv2
import pytesseract

image = cv2.imread('example.png')
# cv2.imread reports failure by returning None rather than raising,
# so fail here with a clear message instead of inside cvtColor.
if image is None:
    raise FileNotFoundError("Could not read image 'example.png'")

# Grayscale, then a fixed binary threshold at 150 to separate text from background
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
text = pytesseract.image_to_string(thresh)
print(text)

OCR with Different Languages:

Problem: Extract text in a specific language from an image.
Solution: Specify the language parameter in pytesseract.

python
from PIL import Image
import pytesseract

# Open the image in a context manager so its file handle is closed
# as soon as OCR has finished.
with Image.open('example.png') as image:
    text = pytesseract.image_to_string(image, lang='spa')  # Spanish
print(text)

OCR on Multiple Images:

Problem: Perform OCR on a set of images in a directory.
Solution: Loop through images and extract text from each.

python
from PIL import Image
import pytesseract
import os

# os.listdir returns entries in arbitrary order; sort for reproducible output.
for filename in sorted(os.listdir('images')):
    if filename.endswith('.png'):
        # Close each image after OCR so handles do not pile up across the loop.
        with Image.open(os.path.join('images', filename)) as image:
            text = pytesseract.image_to_string(image)
        print(f"{filename}:\n{text}\n")

Bounding Boxes for Detected Text:

Problem: Get bounding boxes for text detected in an image.
Solution: Use pytesseract to extract bounding box coordinates.

python
import cv2
import pytesseract

image = cv2.imread('example.png')
# cv2.imread reports failure by returning None rather than raising.
if image is None:
    raise FileNotFoundError("Could not read image 'example.png'")

image_height = image.shape[0]
boxes = pytesseract.image_to_boxes(image)
for box in boxes.splitlines():
    b = box.split(' ')
    # Fields 1-4 are the two opposite corners of the character box
    # (left, bottom, right, top), not x/y/width/height.
    left, bottom, right, top = int(b[1]), int(b[2]), int(b[3]), int(b[4])
    # Flip the y coordinates against the image height before drawing.
    cv2.rectangle(
        image,
        (left, image_height - bottom),
        (right, image_height - top),
        (0, 255, 0),
        2,
    )
cv2.imshow('Image', image)
cv2.waitKey(0)
# Close the preview window once a key has been pressed.
cv2.destroyAllWindows()

Detecting Text Orientation and Script:
- Problem: Detect the orientation and script of the text in an image.
- Solution: Use pytesseract to obtain orientation and script information.
```python
from PIL import Image
import pytesseract

# Open the image in a context manager so its file handle is closed
# as soon as detection has finished.
with Image.open('example.png') as image:
    osd = pytesseract.image_to_osd(image)
print(osd)
```

OCR on PDF Files:

Problem: Extract text from a PDF document.
Solution: Convert PDF to images and then apply OCR.

python
from pdf2image import convert_from_path
import pytesseract

# Convert the PDF to page images, then OCR and print each page in order.
for page_image in convert_from_path('example.pdf'):
    print(pytesseract.image_to_string(page_image))

OCR on Scanned Documents:

Problem: Perform OCR on scanned documents.
Solution: Use pytesseract to extract text from scanned images.

python
from PIL import Image
import pytesseract

# Open the scan in a context manager so its file handle is closed
# as soon as OCR has finished.
with Image.open('scanned_doc.png') as image:
    text = pytesseract.image_to_string(image)
print(text)

OCR with OpenCV Preprocessing:

Problem: Improve OCR results using OpenCV preprocessing.
Solution: Apply blurring and adaptive thresholding.

python
import cv2
import pytesseract

image = cv2.imread('example.png')
# cv2.imread reports failure by returning None rather than raising,
# so fail here with a clear message instead of inside cvtColor.
if image is None:
    raise FileNotFoundError("Could not read image 'example.png'")

gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Median blur (kernel size 3) before thresholding
blur = cv2.medianBlur(gray, 3)
# Gaussian adaptive threshold: block size 11, constant 2
thresh = cv2.adaptiveThreshold(
    blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
)
text = pytesseract.image_to_string(thresh)
print(text)

OCR with Rotated Images:

Problem: Extract text from rotated images.
Solution: Rotate the image to correct orientation before applying OCR.

python
import cv2
import pytesseract

image = cv2.imread('rotated_image.png')
# cv2.imread reports failure by returning None rather than raising.
if image is None:
    raise FileNotFoundError("Could not read image 'rotated_image.png'")

# Rotate 90 degrees clockwise before OCR; change the rotate code if the
# input is turned the other way.
rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
text = pytesseract.image_to_string(rotated)
print(text)

Sound File Examples

Play Sound with Pygame:

Problem: Play an audio file in your Python application.
Solution: Use the pygame library to load and play the sound.

python
import pygame

pygame.mixer.init()
pygame.mixer.music.load('example.mp3')
pygame.mixer.music.play()
# Keep the script alive until playback ends. Sleep between polls instead
# of spinning in an empty loop, which would peg a CPU core.
while pygame.mixer.music.get_busy():
    pygame.time.wait(100)

Convert Text to Speech with gTTS:
- Problem: Convert a given text to speech.
- Solution: Use gTTS to convert text to speech and save it as an audio file.
```python
from gtts import gTTS

# Build the speech object for the English text and save it as an MP3 file.
gTTS(text='Hello, world!', lang='en').save('hello.mp3')
```

Speech Recognition from Audio File:

Problem: Convert speech from an audio file to text.
Solution: Use SpeechRecognition to transcribe the audio file.

python
import speech_recognition as sr

recognizer = sr.Recognizer()
# Read the whole audio file into memory
with sr.AudioFile('example.wav') as source:
    audio = recognizer.record(source)

try:
    text = recognizer.recognize_google(audio)
except sr.UnknownValueError:
    # The service returned no usable transcription for this audio
    print("Speech could not be understood.")
except sr.RequestError as error:
    # The recognition request itself failed
    print(f"Speech recognition request failed: {error}")
else:
    print(text)

Recording Audio with PyAudio:

Problem: Record audio using the microphone.
Solution: Use PyAudio to capture audio from the microphone and save it to a file.

python
import pyaudio
import wave

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024  # frames read per call
RECORD_SECONDS = 5
OUTPUT_FILENAME = 'output.wav'

audio = pyaudio.PyAudio()
try:
    # Look up the sample width before the PyAudio instance is terminated
    sample_width = audio.get_sample_size(FORMAT)
    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
    try:
        # Number of CHUNK-sized reads needed to cover RECORD_SECONDS
        frames = [
            stream.read(CHUNK)
            for _ in range(int(RATE / CHUNK * RECORD_SECONDS))
        ]
    finally:
        # Always close the stream, even if a read fails
        stream.stop_stream()
        stream.close()
finally:
    audio.terminate()

# The context manager closes the WAV file even if a write fails
with wave.open(OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))

Change Audio Speed with PyDub:

Problem: Increase or decrease the speed of an audio file.
Solution: Use pydub to manipulate the playback speed of the audio.

python
from pydub import AudioSegment

# Load the clip, speed it up by a factor of 1.5, and write the result as WAV.
original_clip = AudioSegment.from_file('example.wav')
original_clip.speedup(playback_speed=1.5).export(
    'faster_example.wav', format='wav'
)

Merge Two Audio Files with PyDub:

Problem: Merge two audio files into one.
Solution: Use pydub to concatenate the audio files.

python
from pydub import AudioSegment

# Load both clips in order; `+` on AudioSegment places the second after the first.
first_clip, second_clip = (
    AudioSegment.from_file(path) for path in ('example1.wav', 'example2.wav')
)
combined = first_clip + second_clip
combined.export('combined_example.wav', format='wav')

Extract Audio from Video with MoviePy:

Problem: Extract the audio track from a video file.
Solution: Use moviepy to extract and save the audio from the video.

python
from moviepy.editor import VideoFileClip

# The context manager closes the clip and the resources it holds
# once the audio has been written.
with VideoFileClip('example.mp4') as video:
    audio = video.audio
    # Fail with a clear message if there is no audio to extract
    if audio is None:
        raise ValueError("'example.mp4' has no audio track")
    audio.write_audiofile('extracted_audio.wav')

Search This Blog

My important workaround in my journey