Worked: PDF images -> text
original
# Original version of the OCR script, kept for reference only. It is wrapped
# in a triple-quoted string literal, so none of the lines inside it execute.
"""!pip install pdf2image
from PIL import Image
import pytesseract
import os
from pdf2image import convert_from_path
# Paths to the files
pdf_file_path = '/content/Pravakthalu-Yevaru.pdf'
output_text_file_path = '/content/Pravakthalu-Yevaru.txt'
# Verify the PDF file existence
if not os.path.exists(pdf_file_path):
raise FileNotFoundError(f"PDF file {pdf_file_path} not found")
# Convert PDF to images
images = convert_from_path(pdf_file_path, dpi=300)
# Configure Tesseract for Telugu and English
tesseract_config = r'--oem 3 --psm 6 -l tel+eng'
# Process each image and save the text
with open(output_text_file_path, 'w', encoding='utf-8') as output_file:
for i, image in enumerate(images):
# Extract text from the image using Tesseract
text = pytesseract.image_to_string(image, config=tesseract_config)
output_file.write(f"Page {i + 1}\n")
output_file.write(text)
output_file.write("\n\n")
print(f'OCR text written to file "{output_text_file_path}"')"""
# It worked; see the optimized code below.
!pip install Pillow
!pip install pytesseract
!apt-get install -y poppler-utils
!pip install pdf2image
!apt-get update
!apt-get install -y tesseract-ocr-tel
from PIL import Image
import pytesseract
import os
from pdf2image import convert_from_path
# Paths to the files
pdf_file_path = '/content/Images_filename123.pdf'
output_text_file_path = '/content/out.txt'
# Verify the PDF file existence
if not os.path.exists(pdf_file_path):
raise FileNotFoundError(f"PDF file {pdf_file_path} not found")
# Convert PDF to images
images = convert_from_path(pdf_file_path, dpi=300)
# Configure Tesseract for Telugu and English
tesseract_config = r'--oem 3 --psm 6 -l tel+eng'
# Process each image and save the text
with open(output_text_file_path, 'w', encoding='utf-8') as output_file:
for i, image in enumerate(images):
# Extract text from the image using Tesseract
text = pytesseract.image_to_string(image, config=tesseract_config)
output_file.write(f"Page {i + 1}\n")
output_file.write(text)
output_file.write("\n\n")
print(f'OCR text written to file "{output_text_file_path}"')
# Install required packages
!apt-get update -qq
!apt-get install -y -qq poppler-utils tesseract-ocr-tel
!pip install -q Pillow pytesseract pdf2image
from PIL import Image
import pytesseract
import os
from pdf2image import convert_from_path
# File paths
pdf_file_path = '/content/Images_filename123.pdf'
output_text_file_path = '/content/out.txt'
# Check if PDF exists
if not os.path.exists(pdf_file_path):
raise FileNotFoundError(f"PDF file {pdf_file_path} not found")
# Convert PDF to images
images = convert_from_path(pdf_file_path, dpi=300)
# Tesseract config for Telugu + English
tesseract_config = '--oem 3 --psm 6 -l tel+eng'
# Extract and write text
with open(output_text_file_path, 'w', encoding='utf-8') as f:
for i, img in enumerate(images, 1):
text = pytesseract.image_to_string(img, config=tesseract_config)
f.write(f"Page {i}\n{text}\n\n")
print(f'OCR text written to file "{output_text_file_path}"')
-------------
reference