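# pss work: replace \r\n or \n inside each paragraph only, so that every paragraph in the file becomes a single line. This worked; Colab code below.
# Regex (?<![.,!?])\r?\n\r?\n matches a paragraph break that is not preceded by a full stop, comma, "!" or "?", so paragraphs ending in that punctuation are ignored and the remaining ones are highlighted.
# To find paragraphs that do not end with a full stop and need to be merged with the next paragraph, use (?<!\.)\r?\n\r?\n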
import os
def _merge_paragraph_lines(text: str) -> str:
    """Return *text* with every paragraph collapsed onto a single line.

    Paragraphs are separated by blank lines; a line containing only
    whitespace counts as blank. Within a paragraph the individual lines are
    stripped and joined with one space. Paragraphs are re-joined with a
    blank line between them.
    """
    # A line holding only spaces/tabs looks blank to a reader, so treat it as
    # blank; otherwise the paragraphs on either side would be fused together.
    normalised = '\n'.join(
        line if line.strip() else '' for line in text.split('\n')
    )

    merged = []
    for paragraph in normalised.split('\n\n'):
        # Strip each line before joining so whitespace at the edge of a
        # wrapped line does not turn into a double space in the output.
        pieces = (line.strip() for line in paragraph.splitlines())
        merged.append(' '.join(piece for piece in pieces if piece))
    return '\n\n'.join(merged)


def process_paragraphs(input_file: str, output_file: str) -> None:
    """Rewrite a text file so that each paragraph occupies one line.

    Reads *input_file* as UTF-8, removes the line breaks inside every
    paragraph (paragraphs are delimited by blank lines), writes the result
    to *output_file* as UTF-8 and prints a confirmation message.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to write; an existing file is
            overwritten.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If *input_file* is not valid UTF-8.
    """
    # Text mode with default newline handling turns "\r\n" into "\n" on read,
    # so Windows-style files need no separate treatment below.
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    processed_text = _merge_paragraph_lines(content)

    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(processed_text)

    print(f"Processed file saved as: {output_file}")
# Source and destination text files in the Colab workspace.
# Edit both names to match the file being processed.
input_file_path = "/content/CLOUD_సామెతల జ్ఞానము_25March2025.txt"
output_file_path = "/content/CLOUD_సామెతల జ్ఞానము_25March2025_output.txt"

# Collapse each paragraph of the source file onto one line and save the copy.
process_paragraphs(input_file=input_file_path, output_file=output_file_path)