PDF to text (OCR) for any language, with or without Google Drive

- July 31, 2025

Download, unzip, and place the files in Google Drive.

The required packages are saved in the ranta Google Drive.

----------

without google drive

# Without saving to Google Drive.

---------------

#date 1st aug 2025

# Install the packages required for PDF-to-text conversion (OCR).

# Refresh the apt package index first; -qq keeps the output quiet.
!apt-get update -qq

# System packages: poppler-utils supplies the PDF rendering tools that
# pdf2image calls, and tesseract-ocr-tel is the Telugu language data for
# the Tesseract OCR engine.
!apt-get install -y -qq poppler-utils tesseract-ocr-tel

# Python packages: Pillow (imaging), pytesseract (Tesseract wrapper) and
# pdf2image (PDF pages -> images).
!pip install -q Pillow pytesseract pdf2image

------------------

#date: 1st aug 2025

import os

import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# OCR a PDF into a plain-text file, one "Page N" section per PDF page.

# File paths
pdf_file_path = '/content/Mathamu-Pathamu.pdf'
output_text_file_path = '/content/out.txt'

# Check if PDF exists; fail early with a clear message instead of letting
# the PDF renderer report a less obvious error.
if not os.path.exists(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Convert PDF to images (one image per page). 300 dpi gives Tesseract
# enough resolution to recognise small glyphs.
images = convert_from_path(pdf_file_path, dpi=300)

# Tesseract config for Telugu + English:
#   --oem 3  -> default OCR engine mode
#   --psm 6  -> treat each page as a single uniform block of text
#   -l       -> languages to recognise
tesseract_config = '--oem 3 --psm 6 -l tel+eng'

# Extract and write text. UTF-8 is required so the Telugu characters are
# stored correctly.
with open(output_text_file_path, 'w', encoding='utf-8') as f:
    for i, img in enumerate(images, 1):  # page numbers start at 1
        text = pytesseract.image_to_string(img, config=tesseract_config)
        f.write(f"Page {i}\n{text}\n\n")

print(f'OCR text written to file "{output_text_file_path}"')

----------------

#for english

#installing/downloading in google drive visible and non visible files

# Step 3: Download required Python packages into the folder

!pip download Pillow pytesseract pdf2image -d /content/drive/MyDrive/Lib_pdf2Txt

# Download .deb files for system packages

!apt-get download poppler-utils tesseract-ocr-tel -o=dir::cache=/content/drive/MyDrive/Lib_pdf2Txt/apt_debs

------------- for english worked---------

!apt-get install -y -qq poppler-utils ( using google drive)

!pip install --no-index --find-links=/content/drive/MyDrive/Lib_pdf2Txt Pillow pytesseract pdf2image

!dpkg -i /content/drive/MyDrive/Lib_pdf2Txt/apt_debs/*.deb || true

!apt-get install -f -y # To fix any missing dependencies

import os

import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# OCR an English PDF into a plain-text file, one "Page N" section per page.

# Make sure the poppler binaries can be found.
os.environ["PATH"] += os.pathsep + "/usr/bin" # Optional: adjust if you installed poppler elsewhere

pdf_file_path = '/content/page128.pdf'
output_text_file_path = '/content/out2.txt'

# Fail early with a clear message if the PDF has not been uploaded.
if not os.path.exists(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Render every PDF page to an image at 300 dpi.
images = convert_from_path(pdf_file_path, dpi=300)

# --oem 3: default OCR engine mode; --psm 6: treat each page as a single
# uniform block of text; -l eng: recognise English only.
tesseract_config = '--oem 3 --psm 6 -l eng'

with open(output_text_file_path, 'w', encoding='utf-8') as f:
    for i, img in enumerate(images, 1):  # page numbers start at 1
        text = pytesseract.image_to_string(img, config=tesseract_config)
        f.write(f"Page {i}\n{text}\n\n")

print(f'OCR text written to file "{output_text_file_path}"')

--------------------------------

For Telugu (using Google Drive)

#installing/downloading in google drive visible and non visible files

# Step 3: Download required Python packages into the folder

!pip download Pillow pytesseract pdf2image -d /content/drive/MyDrive/Lib_pdf2Txt

# Download .deb files for system packages

!apt-get download poppler-utils tesseract-ocr-tel -o=dir::cache=/content/drive/MyDrive/Lib_pdf2Txt/apt_debs

------------

# for telugu

!apt-get install -y -qq poppler-utils

!apt-get install -y -qq poppler-utils tesseract-ocr-tel

!pip install --no-index --find-links=/content/drive/MyDrive/Lib_pdf2Txt Pillow pytesseract pdf2image

!dpkg -i /content/drive/MyDrive/Lib_pdf2Txt/apt_debs/*.deb || true

!apt-get install -f -y # To fix any missing dependencies

import os

import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# OCR a Telugu (+ English) PDF into a plain-text file, one "Page N" section
# per page.

# Make sure the poppler binaries can be found.
os.environ["PATH"] += os.pathsep + "/usr/bin" # Optional: adjust if you installed poppler elsewhere

pdf_file_path = '/content/mp_page2.pdf'
output_text_file_path = '/content/out.txt'

# Fail early with a clear message if the PDF has not been uploaded.
if not os.path.exists(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Render every PDF page to an image at 300 dpi.
images = convert_from_path(pdf_file_path, dpi=300)

# --oem 3: default OCR engine mode; --psm 6: treat each page as a single
# uniform block of text; -l tel+eng: recognise Telugu and English.
tesseract_config = '--oem 3 --psm 6 -l tel+eng'

# UTF-8 is required so the Telugu characters are stored correctly.
with open(output_text_file_path, 'w', encoding='utf-8') as f:
    for i, img in enumerate(images, 1):  # page numbers start at 1
        text = pytesseract.image_to_string(img, config=tesseract_config)
        f.write(f"Page {i}\n{text}\n\n")

print(f'OCR text written to file "{output_text_file_path}"')

Search This Blog

My important workaround in my journey

pdf 2 txt , for any language, using google drive and without also

Popular posts from this blog

praveen samples: idoc2edi: step by tpm configuration, with payloads

50 questions of grok questions.

SAP CPI : Camel expression in sap cpi , cm, router, filter and groovy script. format