Created
March 2, 2022 13:06
-
-
Save tongoclinh/4bfd5a50779ce3e8b7b2c25e6e50d9a9 to your computer and use it in GitHub Desktop.
Regex cleanup for OCR result.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Regex cleanup for OCR results (tuned for Vietnamese text).
#
# Usage: <this-script> FILE
#
# FILE is rewritten IN PLACE by every step below; keep a backup if needed.
# Prerequisites: perl and the sd tool [https://github.com/chmln/sd].
#
# Exit status: 0 on success, 2 on missing argument, 1 on a missing file/tool
# or when any cleanup step fails.

set -euo pipefail

if (( $# < 1 )); then
  printf 'Usage: %s FILE\n' "${0##*/}" >&2
  exit 2
fi

file=$1

# A path starting with '-' would be parsed as an option by sd/perl.
case "$file" in
  -*) file="./$file" ;;
esac
readonly file

if [[ ! -f "$file" ]]; then
  printf 'error: no such file: %s\n' "$file" >&2
  exit 1
fi

for tool in sd perl; do
  if ! command -v "$tool" >/dev/null; then
    printf 'error: required tool not found: %s\n' "$tool" >&2
    exit 1
  fi
done

# First, use regexes to clear the header and footer; these depend on the
# content of the file, so add them here as needed.

# Replace curly quotes with straight quotes.
sd '[“”]' '"' "$file"
sd '[‘’]' "'" "$file"

# Replace each | character (and the whitespace after it) with a newline.
# Drive OCR generates a lot of | characters.
sd '\|\s*' '\n' "$file"

# Remove lines that do not contain any Vietnamese (diacritic) character.
# CAUTION: this may be overkill -- plain-ASCII lines are dropped as well.
sd '^[^àÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬđĐèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆìÌỉỈĩĨíÍịỊòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰỳỲỷỶỹỸýÝỵỴ]+$' '' "$file"

# Trim trailing whitespace; needed for combining paragraphs below.
sd '\s+$' '' "$file"

# Custom, file-specific regexes should be placed here, before paragraphs are
# combined.

# Combine paragraphs: a line that does not end with sentence punctuation
# (. ? ! " :) is joined with the following one. The '$1' inside single quotes
# is sd's capture-group reference, not the shell's positional parameter.
sd '([^\.\?!":])\s*$\n*' '$1 ' "$file"

# Separate paragraphs with an empty line (markdown style).
sd '\n+' '\n\n' "$file"
sd '" "' '"\n\n"' "$file"

# Fix some mis-capitalized words (more can be added, depending on the OCR
# result): lowercase them unless they directly follow sentence punctuation.
# perl is used here because sd does not support regex look-around.
# All substitutions run in a single in-place pass over the file.
perl -pi \
  -e 's/(?<![\.\?!]) Có/ có/g;' \
  -e 's/(?<![\.\?!]) Cô/ cô/g;' \
  -e 's/(?<![\.\?!]) Sô/ sô/g;' \
  -e 's/(?<![\.\?!]) Số/ số/g;' \
  "$file"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment