Comparing the formatted TXT against the PDF (after the original extraction).

- September 23, 2024

Comparing the PDF with the formatted TXT version: Python code.

Check whether any text has been missed: either a whole line, or part of a line.

# Install google-cloud-vision and pdf2image (PdfReader is also installed, though the script below does not import it)
!pip install google-cloud-vision pdf2image PdfReader

# Install poppler-utils
!apt-get update
!apt-get install -y poppler-utils

import gc
import io
import os
import re
from google.cloud import vision
from google.cloud.vision_v1 import types
from pdf2image import convert_from_path
from google.oauth2 import service_account

# Run a full garbage-collection pass before any work starts
# (presumably to reclaim memory left by earlier notebook cells — TODO confirm it is still needed)
gc.collect()

# Paths to the files
# NOTE(review): the /content/ prefix looks like a Colab session directory — adjust all three when running elsewhere.
credentials_path = '/content/credentials_doc.json'  # service-account key file used to build the Vision client
pdf_file_path = '/content/Bhagavan-Ravana-Brahma.pdf'  # PDF that is converted to page images and OCR'd
input_text_file_path = '/content/Bhagavan-Ravana-Brahma_coludtext_13sep24_Updated.txt'  # formatted text checked against the OCR output

# Verify the file existence
def check_file_exists(file_path, file_description):
    """Report whether a required input file is present; abort the run if not.

    Args:
        file_path: Path tested with ``os.path.exists``.
        file_description: Human-readable label used in the printed message.

    Raises:
        SystemExit: With exit status 1 when ``file_path`` does not exist.
    """
    if os.path.exists(file_path):
        print(f"{file_description} file found.")
        return

    print(f"{file_description} file not found. Please check the path.")
    # Raise SystemExit directly instead of calling exit(): that helper is
    # intended for the interactive shell, and called without an argument it
    # reports success (status 0) even though a required file is missing.
    raise SystemExit(1)

# Validate all three inputs up front; check_file_exists prints a message and exits when a path is missing
check_file_exists(credentials_path, "Credentials")
check_file_exists(pdf_file_path, "PDF")
check_file_exists(input_text_file_path, "Input text")

# Authenticate using service account
# The client is built from the explicit key file above, and is the module-level `client` used by perform_ocr
credentials = service_account.Credentials.from_service_account_file(credentials_path)
client = vision.ImageAnnotatorClient(credentials=credentials)

# Convert PDF to images
# Rasterize the PDF at 300 dpi; `images` is later iterated one page image at a time for OCR
images = convert_from_path(pdf_file_path, dpi=300)

# Function to perform OCR on an image
def perform_ocr(image):
    """Run Cloud Vision document text detection on a single page image.

    Args:
        image: Page image exposing ``save(fp, format=...)``; the script
            passes the items produced by ``convert_from_path``.

    Returns:
        The ``full_text_annotation.text`` string from the Vision response.

    Raises:
        RuntimeError: If the Vision response carries an error message.
    """
    # Serialize the page to PNG bytes in memory; the request takes raw content
    buffer = io.BytesIO()
    image.save(buffer, format='PNG')

    # Use a separate name so the `image` parameter is not rebound to a
    # different type halfway through the function.
    vision_image = types.Image(content=buffer.getvalue())
    response = client.document_text_detection(image=vision_image)

    # A per-image failure is reported on the response's `error` field; without
    # this check the function would return empty text and every line of the
    # text file would later be reported as missing from the PDF.
    if response.error.message:
        raise RuntimeError(f"Vision OCR failed: {response.error.message}")

    return response.full_text_annotation.text

# OCR every page image, keeping the per-page text in page order
ocr_text = [perform_ocr(page_image) for page_image in images]

# Merge the pages into one newline-separated string, then break it into lines
ocr_full_text = "\n".join(ocr_text)
ocr_lines = ocr_full_text.split('\n')

# Load the formatted text file as a list of lines (line endings are kept)
with open(input_text_file_path, mode='r', encoding='utf-8') as input_file:
    input_lines = input_file.readlines()

# Function to filter lines: get first and last words, ignore empty lines, lines starting with a number, "Page", or space in text file
def filter_lines(lines, is_text_file=False):
    """Reduce each usable line to a "<first word> <last word>" signature.

    A line is skipped when, after stripping surrounding whitespace, it is
    empty, begins with a digit, or begins with "Page". When ``is_text_file``
    is true, a line whose raw form begins with a space is skipped as well.
    Lines with fewer than two words produce no signature.

    Args:
        lines: Iterable of text lines.
        is_text_file: Whether to also drop lines that start with a space.

    Returns:
        List of signatures, in the order the surviving lines appeared.
    """
    starts_with_digit = re.compile(r'^\d').match
    signatures = []
    for raw in lines:
        text = raw.strip()
        if not text:
            continue
        if starts_with_digit(text) or text.startswith("Page"):
            continue
        if is_text_file and raw.startswith(" "):
            continue
        tokens = text.split()
        if len(tokens) < 2:
            # A single word has no distinct first/last pair to compare
            continue
        signatures.append(f"{tokens[0]} {tokens[-1]}")
    return signatures

# Reduce both sources to "first-word last-word" signatures
filtered_ocr_lines = filter_lines(ocr_lines)
filtered_input_lines = filter_lines(input_lines, is_text_file=True)

# Hash the OCR signatures once so each lookup below is a constant-time
# membership test instead of a scan over every OCR line.
ocr_signatures = set(filtered_ocr_lines)

# Report every text-file signature that has no exact match anywhere in the
# OCR output.
# NOTE: line_number is the 1-based position within filtered_input_lines,
# not the line number in the original text file (skipped lines are not counted).
for line_number, input_line in enumerate(filtered_input_lines, start=1):
    input_line = input_line.strip()
    if input_line not in ocr_signatures:
        print(f"Line {line_number} from text file not found in PDF:")
        print(f"  Text File Line: '{input_line}'")

print('Validation complete.')

Search This Blog

My important workaround in my journey

Comparing the formatted TXT against the PDF (after the original extraction).

Popular posts from this blog

SAP CPI: Camel expressions in SAP CPI, cm, router, filter and Groovy script format

pss book: గురు ప్రార్థనామంజరి . completed 21st july 2024

pss book : శ్రీకృష్ణుడు దేవుడా, భగవంతుడా completed , second review needed. 26th April 2024