Python OCR and MP3 files

Updated on 17 Aug 2024.


CHANGE THE VOICE SPEED AND THE MALE/FEMALE VOICE AS REQUIRED.

# Force a garbage-collection pass before starting (presumably to free memory
# left over from earlier notebook cells -- confirm whether this is needed).
import gc
gc.collect()
# NOTE: the leading "!" is IPython/Colab shell-escape syntax, so this line only
# works inside a notebook cell. In a plain .py script, install the two packages
# from a terminal instead.
!pip install google-cloud-texttospeech pydub

import json
from google.cloud import texttospeech
from pydub import AudioSegment
import os

# Paths to the files
credentials_path = '/content/credentials.json'
text_file_path = '/content/gita_chapter2.txt'
output_file_path = '/content/gita_chapter2_audio.mp3'

# Verify that both input files exist and report the status of each one.
missing_paths = []
if os.path.exists(credentials_path):
    print("Credentials file found.")
else:
    print("Credentials file not found.")
    missing_paths.append(credentials_path)

if os.path.exists(text_file_path):
    print("Text file found.")
else:
    print("Text file not found.")
    missing_paths.append(text_file_path)

# Stop here with one clear message instead of continuing into a less obvious
# failure further down when a required file is missing.
if missing_paths:
    raise FileNotFoundError(
        f"Required input file(s) not found: {', '.join(missing_paths)}"
    )

# Load your Google Cloud project credentials.
# Parsing the key file here surfaces malformed JSON early; the client below
# reads the same file again by path.
with open(credentials_path, encoding='utf-8') as f:
    credentials = json.load(f)

# Configure the Text-to-Speech client
client = texttospeech.TextToSpeechClient.from_service_account_json(credentials_path)

# Load Telugu text from the file
with open(text_file_path, 'r', encoding='utf-8') as file:
    text = file.read().strip()

# Function to split text into chunks based on byte size
def split_text(text, max_bytes=5000):
    """Split text into chunks whose UTF-8 encoding is at most max_bytes bytes.

    Chunks are cut just after the last whitespace character that fits, so a
    word (and therefore a consonant plus its dependent vowel signs) is not
    torn apart at a chunk boundary. A run without any whitespace that is
    longer than max_bytes is cut between characters, never inside a
    multi-byte character.

    Args:
        text: The text to split.
        max_bytes: Maximum UTF-8 size of a chunk, in bytes.

    Returns:
        A list of non-empty strings; joining them reproduces ``text``
        exactly. A single character that is larger than max_bytes is still
        returned as its own chunk rather than being dropped.
    """
    chunks = []
    pending = []        # characters of the chunk being built
    pending_bytes = 0   # UTF-8 size of "".join(pending)
    break_at = 0        # len(pending) just after the last whitespace (0 = none)
    break_bytes = 0     # UTF-8 size of pending[:break_at]

    for char in text:
        char_bytes = len(char.encode('utf-8'))

        # Flush until the new character fits. "pending" must be non-empty so
        # that an oversized first character never produces an empty chunk.
        while pending and pending_bytes + char_bytes > max_bytes:
            if break_at:
                cut, cut_bytes = break_at, break_bytes
            else:
                # No whitespace available: fall back to a hard cut.
                cut, cut_bytes = len(pending), pending_bytes
            chunks.append("".join(pending[:cut]))
            # Carry the unfinished word over to the next chunk.
            pending = pending[cut:]
            pending_bytes -= cut_bytes
            break_at = break_bytes = 0

        pending.append(char)
        pending_bytes += char_bytes
        if char.isspace():
            break_at = len(pending)
            break_bytes = pending_bytes

    if pending:
        chunks.append("".join(pending))

    return chunks

# Configure voice and speed
# Voice list: https://cloud.google.com/text-to-speech/docs/voices
# The voice name and ssml_gender must describe the same voice, otherwise the
# request asks for contradictory things:
#   female: name='te-IN-Standard-A', ssml_gender=SsmlVoiceGender.FEMALE
#   male:   name='te-IN-Standard-B', ssml_gender=SsmlVoiceGender.MALE
voice = texttospeech.VoiceSelectionParams(
    language_code='te-IN',  # Telugu (India)
    name='te-IN-Standard-A',
    ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
)
audio_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
    speaking_rate=1.25  # 1.0 is normal speed; e.g. 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0
)

# Function to synthesize speech for each chunk
def synthesize_speech(text_chunk):
    """Synthesize one text chunk and return the response's ``audio_content``.

    The request is sent through the module-level ``client`` using the
    module-level ``voice`` and ``audio_config`` settings.
    """
    request_input = texttospeech.SynthesisInput(text=text_chunk)
    return client.synthesize_speech(
        input=request_input,
        voice=voice,
        audio_config=audio_config,
    ).audio_content

# Function to save audio content to a temporary file
def save_temp_audio(audio_content, temp_file_path):
    """Write ``audio_content`` to ``temp_file_path`` in binary mode.

    An existing file at that path is overwritten.
    """
    with open(temp_file_path, mode='wb') as audio_file:
        audio_file.write(audio_content)

# Process text and combine audio files
text_chunks = split_text(text)
temp_files = []

try:
    # Synthesize every chunk into its own temporary MP3 file.
    for i, chunk in enumerate(text_chunks):
        temp_file_path = f'/content/temp_chunk_{i}.mp3'
        # Record the path before writing so that a partially written file is
        # still cleaned up if the write fails.
        temp_files.append(temp_file_path)
        audio_content = synthesize_speech(chunk)
        save_temp_audio(audio_content, temp_file_path)

    # Combine audio files in chunk order
    combined = AudioSegment.empty()
    for temp_file in temp_files:
        combined += AudioSegment.from_mp3(temp_file)

    # Export the combined audio to the final output file
    combined.export(output_file_path, format='mp3')

    print(f'Audio content written to file "{output_file_path}"')
finally:
    # Clean up temporary files even when synthesis, decoding or export failed.
    for temp_file in temp_files:
        # A recorded path may not exist if synthesis failed before the write.
        if os.path.exists(temp_file):
            os.remove(temp_file)



-----------------------

OCR Examples

  1. Basic OCR with Tesseract:

    • Problem: Extract text from an image.
    • Solution: Use the pytesseract library to convert the image to text.
    python
    # Open an image and print the text recognized in it.
    from PIL import Image
    import pytesseract

    image = Image.open('example.png')
    text = pytesseract.image_to_string(image)
    print(text)
  2. OCR with Image Preprocessing:

    • Problem: Improve OCR accuracy by preprocessing the image.
    • Solution: Convert the image to grayscale and apply thresholding.
    python
    # Convert to grayscale and apply a fixed binary threshold before OCR.
    import cv2
    import pytesseract

    image = cv2.imread('example.png')
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Pixels brighter than 150 become 255 (white); the rest become 0 (black).
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    text = pytesseract.image_to_string(thresh)
    print(text)
  3. OCR with Different Languages:

    • Problem: Extract text in a specific language from an image.
    • Solution: Specify the language parameter in pytesseract.
    python
    # Run OCR with an explicit language instead of the default.
    from PIL import Image
    import pytesseract

    image = Image.open('example.png')
    text = pytesseract.image_to_string(image, lang='spa')  # Spanish
    print(text)
  4. OCR on Multiple Images:

    • Problem: Perform OCR on a set of images in a directory.
    • Solution: Loop through images and extract text from each.
    python
    # Run OCR on every .png file in the 'images' directory.
    from PIL import Image
    import pytesseract
    import os

    for filename in os.listdir('images'):
        if filename.endswith('.png'):
            image = Image.open(os.path.join('images', filename))
            text = pytesseract.image_to_string(image)
            print(f"{filename}:\n{text}\n")
  5. Bounding Boxes for Detected Text:

    • Problem: Get bounding boxes for text detected in an image.
    • Solution: Use pytesseract to extract bounding box coordinates.
    python
    # Draw a rectangle around every character box reported by Tesseract.
    import cv2
    import pytesseract

    image = cv2.imread('example.png')
    boxes = pytesseract.image_to_boxes(image)
    for box in boxes.splitlines():
        # Each line is split on spaces; fields 1-4 are the box coordinates.
        b = box.split(' ')
        x, y, w, h = int(b[1]), int(b[2]), int(b[3]), int(b[4])
        # The y values are subtracted from the image height because the box
        # coordinates are measured from the bottom edge of the image.
        cv2.rectangle(image, (x, image.shape[0] - y), (w, image.shape[0] - h), (0, 255, 0), 2)

    # Show the annotated image once all boxes are drawn; wait for a key press.
    cv2.imshow('Image', image)
    cv2.waitKey(0)
  6. Detecting Text Orientation and Script:

    • Problem: Detect the orientation and script of the text in an image.
    • Solution: Use pytesseract to obtain orientation and script information.
    python
    # Print Tesseract's orientation and script detection (OSD) report.
    from PIL import Image
    import pytesseract

    image = Image.open('example.png')
    osd = pytesseract.image_to_osd(image)
    print(osd)
  7. OCR on PDF Files:

    • Problem: Extract text from a PDF document.
    • Solution: Convert PDF to images and then apply OCR.
    python
    # Convert each PDF page to an image, then run OCR page by page.
    from pdf2image import convert_from_path
    import pytesseract

    pages = convert_from_path('example.pdf')
    for page in pages:
        text = pytesseract.image_to_string(page)
        print(text)
  8. OCR on Scanned Documents:

    • Problem: Perform OCR on scanned documents.
    • Solution: Use pytesseract to extract text from scanned images.
    python
    # Extract the text of a scanned document image.
    from PIL import Image
    import pytesseract

    image = Image.open('scanned_doc.png')
    text = pytesseract.image_to_string(image)
    print(text)
  9. OCR with OpenCV Preprocessing:

    • Problem: Improve OCR results using OpenCV preprocessing.
    • Solution: Apply blurring and adaptive thresholding.
    python
    # Grayscale, median blur and adaptive thresholding before OCR.
    import cv2
    import pytesseract

    image = cv2.imread('example.png')
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.medianBlur(gray, 3)
    thresh = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    text = pytesseract.image_to_string(thresh)
    print(text)
  10. OCR with Rotated Images:

    • Problem: Extract text from rotated images.
    • Solution: Rotate the image to correct orientation before applying OCR.
    python
    # Rotate the image 90 degrees clockwise, then run OCR on the result.
    import cv2
    import pytesseract

    image = cv2.imread('rotated_image.png')
    rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    text = pytesseract.image_to_string(rotated)
    print(text)

Sound File Examples

  1. Play Sound with Pygame:

    • Problem: Play an audio file in your Python application.
    • Solution: Use the pygame library to load and play the sound.
    python
    # Load an audio file and block until playback has finished.
    import pygame

    pygame.mixer.init()
    pygame.mixer.music.load('example.mp3')
    pygame.mixer.music.play()
    while pygame.mixer.music.get_busy():
        # Sleep between checks instead of spinning, which would keep one CPU
        # core fully busy for the whole duration of the track.
        pygame.time.wait(100)
  2. Convert Text to Speech with gTTS:

    • Problem: Convert a given text to speech.
    • Solution: Use gTTS to convert text to speech and save it as an audio file.
    python
    # Convert a short English sentence to speech and save it as an MP3 file.
    from gtts import gTTS

    tts = gTTS(text='Hello, world!', lang='en')
    tts.save('hello.mp3')
  3. Speech Recognition from Audio File:

    • Problem: Convert speech from an audio file to text.
    • Solution: Use SpeechRecognition to transcribe the audio file.
    python
    # Read a WAV file and transcribe it with recognize_google.
    import speech_recognition as sr

    recognizer = sr.Recognizer()
    with sr.AudioFile('example.wav') as source:
        audio = recognizer.record(source)

    text = recognizer.recognize_google(audio)
    print(text)
  4. Recording Audio with PyAudio:

    • Problem: Record audio using the microphone.
    • Solution: Use PyAudio to capture audio from the microphone and save it to a file.
    python
    # Record RECORD_SECONDS seconds from the microphone and save them as WAV.
    import pyaudio
    import wave

    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 5
    OUTPUT_FILENAME = 'output.wav'

    audio = pyaudio.PyAudio()
    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)

    # Read CHUNK frames at a time until RECORD_SECONDS of audio is captured.
    frames = []
    for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    stream.stop_stream()
    stream.close()
    audio.terminate()

    # Write the captured frames with the same format settings used to record.
    wf = wave.open(OUTPUT_FILENAME, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(audio.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
  5. Change Audio Speed with PyDub:

    • Problem: Increase or decrease the speed of an audio file.
    • Solution: Use pydub to manipulate the playback speed of the audio.
    python
    # Speed an audio file up by a factor of 1.5 and save the result.
    from pydub import AudioSegment

    sound = AudioSegment.from_file('example.wav')
    faster_sound = sound.speedup(playback_speed=1.5)
    faster_sound.export('faster_example.wav', format='wav')
  6. Merge Two Audio Files with PyDub:

    • Problem: Merge two audio files into one.
    • Solution: Use pydub to concatenate the audio files.
    python
    # Concatenate two audio files (first, then second) and save the result.
    from pydub import AudioSegment

    sound1 = AudioSegment.from_file('example1.wav')
    sound2 = AudioSegment.from_file('example2.wav')
    combined = sound1 + sound2
    combined.export('combined_example.wav', format='wav')
  7. Extract Audio from Video with MoviePy:

    • Problem: Extract the audio track from a video file.
    • Solution: Use moviepy to extract and save the audio from the video.
    python
    # Take the audio track of a video file and write it to a WAV file.
    from moviepy.editor import VideoFileClip

    video = VideoFileClip('example.mp4')
    audio = video.audio
    audio.write_audiofile('extracted_audio.wav')

Popular posts from this blog

pss book : శ్రీకృష్ణుడు దేవుడా, భగవంతుడా completed , second review needed. 26th April 2024

pss book: గురు ప్రార్థనామంజరి . completed 21st july 2024

pss book: కధల జ్ఞానము read review pending. 25th june 2024