worked pdf images -> text worked

 


original

# Original version of the script, kept for reference only: it is wrapped in a
# triple-quoted string literal, so none of it is executed.
"""!pip install pdf2image
from PIL import Image
import pytesseract
import os
from pdf2image import convert_from_path

# Paths to the files
pdf_file_path = '/content/Pravakthalu-Yevaru.pdf'
output_text_file_path = '/content/Pravakthalu-Yevaru.txt'

# Verify the PDF file existence
if not os.path.exists(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Convert PDF to images
images = convert_from_path(pdf_file_path, dpi=300)

# Configure Tesseract for Telugu and English
tesseract_config = r'--oem 3 --psm 6 -l tel+eng'

# Process each image and save the text
with open(output_text_file_path, 'w', encoding='utf-8') as output_file:
    for i, image in enumerate(images):
        # Extract text from the image using Tesseract
        text = pytesseract.image_to_string(image, config=tesseract_config)
        output_file.write(f"Page {i + 1}\n")
        output_file.write(text)
        output_file.write("\n\n")

print(f'OCR text written to file "{output_text_file_path}"')"""
# It worked; see the optimized code below.
!pip install Pillow
!pip install pytesseract

!apt-get install -y poppler-utils
!pip install pdf2image
!apt-get update
!apt-get install -y tesseract-ocr-tel
from PIL import Image

import pytesseract

import os

from pdf2image import convert_from_path



# OCR a scanned PDF into a UTF-8 text file, one "Page N" section per page.

# Input PDF and output text locations
pdf_file_path = '/content/Images_filename123.pdf'
output_text_file_path = '/content/out.txt'

# Tesseract options; the language selection (Telugu + English) is passed via -l
tesseract_config = r'--oem 3 --psm 6 -l tel+eng'

if os.path.exists(pdf_file_path):
    # Render every page of the PDF to an image at 300 dpi
    images = convert_from_path(pdf_file_path, dpi=300)
else:
    # Stop early with a clear message when the PDF has not been uploaded
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Run OCR page by page and append each page's text under its own heading
with open(output_text_file_path, 'w', encoding='utf-8') as output_file:
    for page_number, page_image in enumerate(images, start=1):
        page_text = pytesseract.image_to_string(page_image, config=tesseract_config)
        output_file.writelines((f"Page {page_number}\n", page_text, "\n\n"))

print(f'OCR text written to file "{output_text_file_path}"')



# Install required packages (Colab / IPython shell commands; -qq and -q keep the output quiet)
# Refresh the apt package index before installing anything from it
!apt-get update -qq
# System packages: poppler-utils (PDF rendering tools used by pdf2image) and
# the Telugu language data for Tesseract (needed for `-l tel+eng` below)
!apt-get install -y -qq poppler-utils tesseract-ocr-tel
# Python packages imported below
!pip install -q Pillow pytesseract pdf2image

from PIL import Image
import pytesseract
import os
from pdf2image import convert_from_path

# OCR a scanned PDF into a UTF-8 text file, one "Page N" section per page.
# Pages are rendered and recognised one at a time so that memory use stays
# bounded by a single 300-dpi page instead of the whole document.

# Local import: only this script needs the page-count helper.
from pdf2image import pdfinfo_from_path

# File paths
pdf_file_path = '/content/Images_filename123.pdf'
output_text_file_path = '/content/out.txt'

# Check that the PDF exists and is a regular file (a directory at this path
# would pass a plain existence check and only fail later during rendering).
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Tesseract config for Telugu + English
tesseract_config = '--oem 3 --psm 6 -l tel+eng'

# Number of pages, used to drive the page-at-a-time loop below
page_count = pdfinfo_from_path(pdf_file_path)["Pages"]

# Extract and write text
with open(output_text_file_path, 'w', encoding='utf-8') as f:
    for page_number in range(1, page_count + 1):
        # Render only the current page; the previous page's image is released
        # as soon as this name is rebound.
        page_images = convert_from_path(
            pdf_file_path,
            dpi=300,
            first_page=page_number,
            last_page=page_number,
        )
        for img in page_images:
            text = pytesseract.image_to_string(img, config=tesseract_config)
            f.write(f"Page {page_number}\n{text}\n\n")

print(f'OCR text written to file "{output_text_file_path}"')





-------------

reference

What if the text is both Telugu and English, only English, only Telugu, English and Hindi, or only Hindi? For reference, the current setup for Telugu + English is: !apt-get install -y -qq poppler-utils tesseract-ocr-tel and tesseract_config = '--oem 3 --psm 6 -l tel+eng'
3/3
ChatGPT said:

Great question. You're working with Tesseract OCR and want to understand how to configure it for different language combinations.

Here's how you should set tesseract_config based on the language needs:

| Language Mode | Tesseract Language Code | Example Config |
|---|---|---|
| Telugu and English | tel+eng | --oem 3 --psm 6 -l tel+eng |
| Only English | eng | --oem 3 --psm 6 -l eng |
| Only Telugu | tel | --oem 3 --psm 6 -l tel |
| English and Hindi | eng+hin | --oem 3 --psm 6 -l eng+hin |
| Only Hindi | hin | --oem 3 --psm 6 -l hin |

Additional Setup (if not installed):

Make sure you install the required language packs:

bash
!apt-get install -y -qq tesseract-ocr-eng tesseract-ocr-tel tesseract-ocr-hin

Or you can install individually like:

bash
!apt-get install -y -qq tesseract-ocr-hin # For Hindi only

Notes:

  • --oem 3: Use the default OCR engine mode, i.e. whichever engine the installed language data supports (mode 2 is the one that combines legacy + LSTM).

  • --psm 6: Assume a uniform block of text (change if needed based on layout).

  • Language order in -l (like tel+eng or eng+hin) affects script recognition priority slightly.

Do you want to auto-detect the language or restrict it manually?

---------------------

Popular posts from this blog

SAP CPI : camle expression in sap cpi , cm, router, filter and groovy script. format

pss book: గురు ప్రార్థనామంజరి . completed 21st july 2024

pss book : శ్రీకృష్ణుడు దేవుడా, భగవంతుడా completed , second review needed. 26th April 2024