PDF to text (OCR) for any language, with and without Google Drive, including
the required packages. Saved in ranta Google Drive.
----------
Without Google Drive
# Without saving the packages to Google Drive.
---------------
#date 1st aug 2025
# Install the required packages for PDF-to-text conversion (Colab cell; the
# leading "!" runs each line as a shell command).
# Refresh the apt package index quietly before installing.
!apt-get update -qq
# System packages: poppler-utils and the tesseract-ocr-tel package
# (the OCR cell below uses Telugu + English).
!apt-get install -y -qq poppler-utils tesseract-ocr-tel
# Python packages imported by the OCR cell below.
!pip install -q Pillow pytesseract pdf2image
------------------
#date: 1st aug 2025
# OCR a PDF into a UTF-8 text file: render each page to an image with
# pdf2image, run Tesseract on every page image, and write the recognised
# text page by page.
import os

from PIL import Image
import pytesseract
from pdf2image import convert_from_path

# File paths
pdf_file_path = '/content/Mathamu-Pathamu.pdf'
output_text_file_path = '/content/out.txt'

# Check that the PDF exists before doing any work. isfile() is used so that a
# directory at this path is also rejected here with a clear message.
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Convert the PDF to page images at 300 DPI.
images = convert_from_path(pdf_file_path, dpi=300)

# Tesseract config for Telugu + English:
#   --oem 3     default OCR engine mode
#   --psm 6     treat the page as a single uniform block of text
#   -l tel+eng  recognise Telugu and English
tesseract_config = '--oem 3 --psm 6 -l tel+eng'

# Extract the text of each page and write it under a "Page N" heading
# (pages are numbered from 1).
with open(output_text_file_path, 'w', encoding='utf-8') as f:
    for i, img in enumerate(images, 1):
        text = pytesseract.image_to_string(img, config=tesseract_config)
        f.write(f"Page {i}\n{text}\n\n")

print(f'OCR text written to file "{output_text_file_path}"')
----------------
# For English.
# Installing/downloading into Google Drive (visible and non-visible files).
# NOTE(review): the paths below assume Google Drive is already mounted at
# /content/drive; the mounting step is not shown here.
# Step 3: Download the required Python packages into the Drive folder.
!pip download Pillow pytesseract pdf2image -d /content/drive/MyDrive/Lib_pdf2Txt
# Download the .deb files for the system packages.
# NOTE(review): the apt-get man page says "download" saves packages into the
# current directory; confirm the .deb files really end up in
# Lib_pdf2Txt/apt_debs, because the install cell runs dpkg on that folder.
!apt-get download poppler-utils tesseract-ocr-tel -o=dir::cache=/content/drive/MyDrive/Lib_pdf2Txt/apt_debs
------------- for english worked---------
!apt-get install -y -qq poppler-utils ( using google drive)
!pip install --no-index --find-links=/content/drive/MyDrive/Lib_pdf2Txt Pillow pytesseract pdf2image
!dpkg -i /content/drive/MyDrive/Lib_pdf2Txt/apt_debs/*.deb || true
!apt-get install -f -y # To fix any missing dependencies
# OCR a PDF into a UTF-8 text file (English only): render each page to an
# image with pdf2image, run Tesseract on every page image, and write the
# recognised text page by page.
import os

from PIL import Image
import pytesseract
from pdf2image import convert_from_path

# Append /usr/bin to PATH for the poppler tools.
os.environ["PATH"] += os.pathsep + "/usr/bin"  # Optional: adjust if you installed poppler elsewhere

pdf_file_path = '/content/page128.pdf'
output_text_file_path = '/content/out2.txt'

# Check that the PDF exists before doing any work. isfile() is used so that a
# directory at this path is also rejected here with a clear message.
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Convert the PDF to page images at 300 DPI.
images = convert_from_path(pdf_file_path, dpi=300)

# Tesseract config for English:
#   --oem 3  default OCR engine mode
#   --psm 6  treat the page as a single uniform block of text
#   -l eng   recognise English
tesseract_config = '--oem 3 --psm 6 -l eng'

# Extract the text of each page and write it under a "Page N" heading
# (pages are numbered from 1).
with open(output_text_file_path, 'w', encoding='utf-8') as f:
    for i, img in enumerate(images, 1):
        text = pytesseract.image_to_string(img, config=tesseract_config)
        f.write(f"Page {i}\n{text}\n\n")

print(f'OCR text written to file "{output_text_file_path}"')
--------------------------------
For Telugu (using Google Drive)
# Installing/downloading into Google Drive (visible and non-visible files).
# NOTE(review): the paths below assume Google Drive is already mounted at
# /content/drive; the mounting step is not shown here.
# Step 3: Download the required Python packages into the Drive folder.
!pip download Pillow pytesseract pdf2image -d /content/drive/MyDrive/Lib_pdf2Txt
# Download the .deb files for the system packages.
# NOTE(review): the apt-get man page says "download" saves packages into the
# current directory; confirm the .deb files really end up in
# Lib_pdf2Txt/apt_debs, because the install cell runs dpkg on that folder.
!apt-get download poppler-utils tesseract-ocr-tel -o=dir::cache=/content/drive/MyDrive/Lib_pdf2Txt/apt_debs
------------
# for telugu
!apt-get install -y -qq poppler-utils
!apt-get install -y -qq poppler-utils tesseract-ocr-tel
!pip install --no-index --find-links=/content/drive/MyDrive/Lib_pdf2Txt Pillow pytesseract pdf2image
!dpkg -i /content/drive/MyDrive/Lib_pdf2Txt/apt_debs/*.deb || true
!apt-get install -f -y # To fix any missing dependencies
# OCR a PDF into a UTF-8 text file (Telugu + English): render each page to an
# image with pdf2image, run Tesseract on every page image, and write the
# recognised text page by page.
import os

from PIL import Image
import pytesseract
from pdf2image import convert_from_path

# Append /usr/bin to PATH for the poppler tools.
os.environ["PATH"] += os.pathsep + "/usr/bin"  # Optional: adjust if you installed poppler elsewhere

pdf_file_path = '/content/mp_page2.pdf'
output_text_file_path = '/content/out.txt'

# Check that the PDF exists before doing any work. isfile() is used so that a
# directory at this path is also rejected here with a clear message.
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(f"PDF file {pdf_file_path} not found")

# Convert the PDF to page images at 300 DPI.
images = convert_from_path(pdf_file_path, dpi=300)

# Tesseract config for Telugu + English:
#   --oem 3     default OCR engine mode
#   --psm 6     treat the page as a single uniform block of text
#   -l tel+eng  recognise Telugu and English
tesseract_config = '--oem 3 --psm 6 -l tel+eng'

# Extract the text of each page and write it under a "Page N" heading
# (pages are numbered from 1).
with open(output_text_file_path, 'w', encoding='utf-8') as f:
    for i, img in enumerate(images, 1):
        text = pytesseract.image_to_string(img, config=tesseract_config)
        f.write(f"Page {i}\n{text}\n\n")

print(f'OCR text written to file "{output_text_file_path}"')