PDF to text (OCR) for any language, with and without Google Drive



Download, unzip, and place in Google Drive.



Required packages, saved in ranta Google Drive.


----------

Without Google Drive



# Without saving to Google Drive.

---------------


#date 1st aug 2025

# Install the packages required for PDF-to-text OCR (run once per Colab session).

# Refresh the apt package index quietly before installing.
!apt-get update -qq

# System packages: poppler-utils (PDF tools used by pdf2image) and the Telugu
# language pack for Tesseract.
!apt-get install -y -qq poppler-utils tesseract-ocr-tel

# Python packages imported by the OCR script.
!pip install -q Pillow pytesseract pdf2image


------------------

#date: 1st aug 2025


from PIL import Image

import pytesseract

import os

from pdf2image import convert_from_path, pdfinfo_from_path


# File paths
pdf_file_path = '/content/Mathamu-Pathamu.pdf'
output_text_file_path = '/content/out.txt'

# Fail early with a clear message if the PDF has not been uploaded.
# isfile() also rejects a directory that happens to have this name.
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Tesseract config for Telugu + English:
#   --oem 3  -> default OCR engine mode
#   --psm 6  -> treat each page as a single uniform block of text
tesseract_config = '--oem 3 --psm 6 -l tel+eng'

# Ask poppler for the page count so pages can be rendered one at a time.
# Rendering the whole PDF at 300 dpi in a single call keeps every page image
# in memory at once, which can exhaust Colab RAM on book-sized files.
page_count = pdfinfo_from_path(pdf_file_path)["Pages"]

# Extract and write text, one page at a time.
with open(output_text_file_path, 'w', encoding='utf-8') as f:
    for i in range(1, page_count + 1):
        # first_page/last_page are 1-based and inclusive: this renders page i only.
        page_images = convert_from_path(
            pdf_file_path, dpi=300, first_page=i, last_page=i
        )
        for img in page_images:
            text = pytesseract.image_to_string(img, config=tesseract_config)
            f.write(f"Page {i}\n{text}\n\n")

print(f'OCR text written to file "{output_text_file_path}"')


----------------


#for english

# Downloading the required files (visible and hidden) into Google Drive


# Step 3: Download required Python packages into the folder
# (Google Drive must already be mounted at /content/drive.)

!pip download Pillow pytesseract pdf2image -d /content/drive/MyDrive/Lib_pdf2Txt


# Download .deb files for system packages.
# `apt-get download` always saves into the current directory, so create the
# target folder and cd into it; the whole line runs in one shell so the cd applies.
# Note: this fetches only the named packages, not their dependencies.

!mkdir -p /content/drive/MyDrive/Lib_pdf2Txt/apt_debs && cd /content/drive/MyDrive/Lib_pdf2Txt/apt_debs && apt-get download poppler-utils tesseract-ocr-tel


------------- For English (worked) ---------

# Using the packages saved in Google Drive.
# poppler-utils supplies the PDF tools that pdf2image shells out to;
# tesseract-ocr supplies the `tesseract` binary that pytesseract calls.
!apt-get install -y -qq poppler-utils tesseract-ocr


# Install the Python packages offline from the files saved in Drive.
!pip install --no-index --find-links=/content/drive/MyDrive/Lib_pdf2Txt Pillow pytesseract pdf2image


# Install any saved .deb files; `|| true` lets the cell continue when dpkg
# exits non-zero (e.g. no .deb files present or unmet dependencies).
!dpkg -i /content/drive/MyDrive/Lib_pdf2Txt/apt_debs/*.deb || true

!apt-get install -f -y  # To fix any missing dependencies


import os

os.environ["PATH"] += os.pathsep + "/usr/bin"  # Optional: adjust if you installed poppler elsewhere


from PIL import Image

import pytesseract

from pdf2image import convert_from_path, pdfinfo_from_path


# File paths
pdf_file_path = '/content/page128.pdf'
output_text_file_path = '/content/out2.txt'

# Fail early with a clear message if the PDF has not been uploaded.
# isfile() also rejects a directory that happens to have this name.
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Tesseract config for English:
#   --oem 3  -> default OCR engine mode
#   --psm 6  -> treat each page as a single uniform block of text
tesseract_config = '--oem 3 --psm 6 -l eng'

# Ask poppler for the page count so pages can be rendered one at a time
# instead of holding every 300-dpi page image in memory at once.
page_count = pdfinfo_from_path(pdf_file_path)["Pages"]

# Extract and write text, one page at a time.
with open(output_text_file_path, 'w', encoding='utf-8') as f:
    for i in range(1, page_count + 1):
        # first_page/last_page are 1-based and inclusive: this renders page i only.
        page_images = convert_from_path(
            pdf_file_path, dpi=300, first_page=i, last_page=i
        )
        for img in page_images:
            text = pytesseract.image_to_string(img, config=tesseract_config)
            f.write(f"Page {i}\n{text}\n\n")

print(f'OCR text written to file "{output_text_file_path}"')

--------------------------------

For Telugu (using Google Drive)

# Downloading the required files (visible and hidden) into Google Drive


# Step 3: Download required Python packages into the folder
# (Google Drive must already be mounted at /content/drive.)

!pip download Pillow pytesseract pdf2image -d /content/drive/MyDrive/Lib_pdf2Txt


# Download .deb files for system packages.
# `apt-get download` always saves into the current directory, so create the
# target folder and cd into it; the whole line runs in one shell so the cd applies.
# Note: this fetches only the named packages, not their dependencies.

!mkdir -p /content/drive/MyDrive/Lib_pdf2Txt/apt_debs && cd /content/drive/MyDrive/Lib_pdf2Txt/apt_debs && apt-get download poppler-utils tesseract-ocr-tel

------------

# For Telugu

# A single install covers both system packages: poppler-utils (PDF tools used
# by pdf2image) and the Telugu language pack for Tesseract. A separate
# poppler-utils-only install beforehand is redundant.
!apt-get install -y -qq poppler-utils tesseract-ocr-tel

# Install the Python packages offline from the files saved in Drive.
!pip install --no-index --find-links=/content/drive/MyDrive/Lib_pdf2Txt Pillow pytesseract pdf2image


# Install any saved .deb files; `|| true` lets the cell continue when dpkg
# exits non-zero (e.g. no .deb files present or unmet dependencies).
!dpkg -i /content/drive/MyDrive/Lib_pdf2Txt/apt_debs/*.deb || true

!apt-get install -f -y  # To fix any missing dependencies


import os

os.environ["PATH"] += os.pathsep + "/usr/bin"  # Optional: adjust if you installed poppler elsewhere


from PIL import Image

import pytesseract

from pdf2image import convert_from_path, pdfinfo_from_path


# File paths
pdf_file_path = '/content/mp_page2.pdf'
output_text_file_path = '/content/out.txt'

# Fail early with a clear message if the PDF has not been uploaded.
# isfile() also rejects a directory that happens to have this name.
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Tesseract config for Telugu + English:
#   --oem 3  -> default OCR engine mode
#   --psm 6  -> treat each page as a single uniform block of text
tesseract_config = '--oem 3 --psm 6 -l tel+eng'

# Ask poppler for the page count so pages can be rendered one at a time
# instead of holding every 300-dpi page image in memory at once.
page_count = pdfinfo_from_path(pdf_file_path)["Pages"]

# Extract and write text, one page at a time.
with open(output_text_file_path, 'w', encoding='utf-8') as f:
    for i in range(1, page_count + 1):
        # first_page/last_page are 1-based and inclusive: this renders page i only.
        page_images = convert_from_path(
            pdf_file_path, dpi=300, first_page=i, last_page=i
        )
        for img in page_images:
            text = pytesseract.image_to_string(img, config=tesseract_config)
            f.write(f"Page {i}\n{text}\n\n")

print(f'OCR text written to file "{output_text_file_path}"')


Popular posts from this blog

SAP CPI: Camel expressions in SAP CPI (content modifier, router, filter and Groovy script) — format

SAP CPI camel conditions and xpath conditions

OAuth call to CPI Integration Suite from SAP APIM