-
-
Save ranvotu/d8f89781a6999b111d33040df876da1d to your computer and use it in GitHub Desktop.
Regex cleanup for OCR result.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# CLEAN-OCR: regex cleanup for text produced by OCR.
#
# Usage: clean-ocr FILE
#
# FILE is rewritten in place by a fixed sequence of sed passes (see clean_ocr).
#
# Requires GNU sed: the passes rely on `-i` without a suffix, `\s`, `\|`,
# `\xHH` and `\n` in replacements. The original notes list "git" as the
# prerequisite -- presumably for the GNU tools bundled with Git Bash; TODO
# confirm.
#
# Multi-byte characters are deliberately kept OUT of bracket expressions:
# outside a UTF-8 locale sed treats `[...]` as a set of single bytes and would
# corrupt the text. Literal sequences and alternation behave the same in every
# locale.
#
# No `set -e`: every pass is checked explicitly, and the script's exit status
# is the status of main.

#######################################
# Clean up one OCR text file in place.
# Arguments: $1 - path of the file to rewrite
# Outputs:   nothing on stdout; sed diagnostics go to stderr
# Returns:   0 on success, the failing sed's status otherwise
#######################################
clean_ocr() {
  local file=$1
  # Written as raw bytes so the characters stay visible and survive editors
  # that strip or hide them.
  local -r zwnbsp=$'\xEF\xBB\xBF'       # U+FEFF ZERO WIDTH NO-BREAK SPACE
  local -r replacement=$'\xEF\xBF\xBD'  # U+FFFD REPLACEMENT CHARACTER

  # Remove the 16-underscore rules (per the original notes, Google's image OCR
  # renders these) and drop empty lines.
  sed -i 's/________________//g; /^$/d' -- "$file" || return

  # Three dots -> a single ellipsis character.
  sed -i -E 's/\.\.\./…/g' -- "$file" || return

  # Curly and angle double quotes -> straight double quotes.
  sed -i 's/“/"/g; s/”/"/g; s/«/"/g; s/»/"/g' -- "$file" || return
  # Curly single quotes -> straight single quotes.
  sed -i "s/‘/'/g; s/’/'/g" -- "$file" || return

  # A pipe character (and any whitespace after it) becomes a line break.
  sed -i 's/|\s*/\n/g' -- "$file" || return

  # Trim trailing whitespace.
  sed -i 's/\s*$//' -- "$file" || return

  # Remove ZWNBSP. This must happen before paragraphs are combined.
  sed -i "s/${zwnbsp}//g" -- "$file" || return

  # Combine paragraphs: slurp the whole file, then turn every line break that
  # does not follow . ? ! … " or ' into a space. Line breaks after an ellipsis
  # are parked as byte 0x01 while the ASCII-only bracket expression runs and
  # are restored afterwards (the input is assumed not to contain 0x01).
  # The '"'"' sequence places a literal single quote inside the sed script.
  sed -i ':a;N;$!ba;s/…\n/…\x01/g;s/\([^.?!"'"'"']\)\n/\1 /g;s/\x01/\n/g' \
    -- "$file" || return

  # Separate paragraphs with an empty line: drop empty lines, then append one
  # after every remaining line. Because every line is now followed by an empty
  # line, no extra pass is needed to separate consecutive quoted lines.
  sed -i '/^$/d;G' -- "$file" || return

  # Remove the unwanted replacement character.
  sed -i "s/${replacement}//g" -- "$file" || return

  # Disabled on purpose (kept for later use):
  #   * straight -> curly quotes next to non-space characters; it can cause
  #     errors in the passes below. Use (?<=\S)[^*]" and "(?=\S)[^*] later.
  #       sed -i 's/\([^[:space:]]\)"/\1“/g' FILE
  #       sed -i 's/"\([^[:space:]]\)/”\1/g' FILE
  #   * capitalising the first letter of each sentence; still has problems
  #     with quotes.
  #       sed -i -E 's/(^)\s*(\S)/\1\U\2/g' FILE
  #       sed -i -E 's/\b([.…!?])\s*(\S)/\1 \U\2/g' FILE

  # Put an empty line between two adjacent quote characters that are separated
  # only by optional whitespace (closing quote followed by an opening quote).
  sed -i 's/"\s*"\s*/"\n\n"/g' -- "$file" || return
  sed -i "s/'\s*'\s*/'\n\n'/g" -- "$file" || return
  sed -i "s/\"\s*'\s*/\"\n\n'/g" -- "$file" || return
  sed -i "s/'\s*\"\s*/'\n\n\"/g" -- "$file" || return

  # Remove whitespace in front of closing punctuation.
  sed -i 's/[[:space:]]*\([].!?,);:}]\|…\)/\1/g' -- "$file" || return
}

#######################################
# Entry point: validate the argument, run the cleanup, report the result.
# Arguments: $1 - path of the file to clean (further arguments are ignored)
# Outputs:   "Text cleanup completed." on stdout on success;
#            usage/error messages on stderr otherwise
# Returns:   0 on success, 2 on missing argument, 1 on any other failure
#######################################
main() {
  if (( $# < 1 )); then
    printf 'Usage: %s FILE\n' "${0##*/}" >&2
    return 2
  fi
  if [[ ! -f "$1" ]]; then
    printf '%s: not a regular file: %s\n' "${0##*/}" "$1" >&2
    return 1
  fi
  if ! clean_ocr "$1"; then
    printf '%s: cleanup failed for %s\n' "${0##*/}" "$1" >&2
    return 1
  fi
  echo "Text cleanup completed."
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment