tongoclinh/regex_cleanup.sh

## regex_cleanup.sh
#!/bin/bash
#
# Usage: regex_cleanup.sh FILE
#
# Cleans up OCR output (Vietnamese text) stored in FILE. Every step below
# rewrites FILE in place, so keep a copy of the original if you may need it.
#
# Prerequisites: perl and the sd tool [https://github.com/chmln/sd]

#######################################
# Run the whole clean-up pipeline on one file.
# Arguments: $1 - path of the text file to rewrite in place
# Outputs:   diagnostics on stderr
# Returns:   0 on success
#            2 when no FILE argument was given
#            1 when FILE is not a regular file or a required tool is missing
#            otherwise the non-zero status of the first step that failed
#######################################
main() {
  if (( $# < 1 )) || [[ -z "$1" ]]; then
    printf 'Usage: %s FILE\n' "${0##*/}" >&2
    return 2
  fi

  local file=$1
  if [[ ! -f "$file" ]]; then
    printf '%s: not a regular file: %s\n' "${0##*/}" "$file" >&2
    return 1
  fi

  # A path starting with "-" would be parsed as an option by sd and perl.
  if [[ "$file" == -* ]]; then
    file="./$file"
  fi

  local tool
  for tool in sd perl; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      printf '%s: required tool not found: %s\n' "${0##*/}" "$tool" >&2
      return 1
    fi
  done

  # First, use regexes to clear headers and footers; these depend on the
  # content of the file, so add them here.

  # Replace curly quotes with straight quotes.
  sd '[“”]' '"' "$file" || return
  sd '[‘’]' "\'" "$file" || return

  # Replace each | character (and the whitespace after it) with a newline.
  # Drive OCR generates a lot of | characters.
  sd '\|\s*' '\n' "$file" || return

  # Remove lines that do not contain any Vietnamese accented character.
  # CAUTION: this may be overkill.
  # NOTE(review): the class also lists a plain "j", so lines containing "j"
  # are kept as well - possibly a typo; left unchanged, please confirm.
  sd '^[^àÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬđĐèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆìÌỉỈĩĨíÍịỊjòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰỳỲỷỶỹỸýÝỵỴ]+$' '' "$file" || return

  # Trim trailing whitespace; needed before combining paragraphs.
  sd '\s+$' '' "$file" || return

  # Custom regexes should be placed here, before combining paragraphs.

  # Combine paragraphs: a line that does not end with sentence punctuation
  # (. ? ! " :) is joined with the following one by a single space.
  sd '([^\.\?!":])\s*$\n*' '$1 ' "$file" || return

  # Separate paragraphs with an empty line (markdown style), then split
  # a quote-space-quote sequence into two paragraphs.
  sd '\n+' '\n\n' "$file" || return
  sd '" "' '"\n\n"' "$file" || return

  # Fix some mis-capitalized words that do not follow sentence punctuation
  # (more can be added, depending on the OCR result). perl is used because
  # sd does not support regex look-around. The substitutions only look
  # within one line, so they can share a single pass over the file.
  perl -pi \
    -e 's/(?<![\.\?!]) Có/ có/g;' \
    -e 's/(?<![\.\?!]) Cô/ cô/g;' \
    -e 's/(?<![\.\?!]) Sô/ sô/g;' \
    -e 's/(?<![\.\?!]) Số/ số/g;' \
    "$file" || return
}

main "$@"
	#!/bin/bash

	# Prerequisites: perl and the sd tool [https://github.com/chmln/sd]

	#######################################
	# Clean up OCR output (Vietnamese text) in one file. Each step rewrites
	# the file in place.
	# Arguments: $1 - path of the text file to clean
	# Outputs:   diagnostics on stderr
	# Returns:   0 on success, 2 when the argument is missing,
	#            1 when the path is not a regular file or a step fails
	#######################################
	regex_cleanup() {
		if [ "$#" -lt 1 ] || [ -z "$1" ]; then
			printf 'Usage: %s FILE\n' "${0##*/}" >&2
			return 2
		fi

		local target=$1
		if [ ! -f "$target" ]; then
			printf '%s: not a regular file: %s\n' "${0##*/}" "$target" >&2
			return 1
		fi
		# Keep sd and perl from reading a leading "-" as an option.
		case "$target" in
			-*) target="./$target" ;;
		esac

		# First, use regexes to clear headers and footers, depending on the
		# content of the file.

		# sd steps as FIND / REPLACE pairs, applied in order.
		local -a steps=(
			# Replace curly quotes with straight quotes.
			'[“”]' '"'
			'[‘’]' "\'"

			# Replace each | character (plus following whitespace) with a
			# newline. Drive OCR generates a lot of | characters.
			# The | must be escaped exactly once: '\\|\s*' would be the
			# alternation "backslash OR optional whitespace", which matches
			# the empty string.
			'\|\s*' '\n'

			# Remove lines that do not contain any Vietnamese accented
			# character. CAUTION: this may be overkill.
			# NOTE(review): the class also lists a plain "j" - possibly a
			# typo; left unchanged, please confirm.
			'^[^àÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬđĐèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆìÌỉỈĩĨíÍịỊjòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰỳỲỷỶỹỸýÝỵỴ]+$' ''

			# Trim trailing whitespace; needed before combining paragraphs.
			'\s+$' ''

			# Custom regexes should be placed here, before combining
			# paragraphs.

			# Combine paragraphs: a line not ending with sentence
			# punctuation (. ? ! " :) is joined with the next one. Both
			# "*" quantifiers are required so that any trailing whitespace
			# and any number of newlines are consumed.
			'([^\.\?!":])\s*$\n*' '$1 '

			# Separate paragraphs with an empty line (markdown style), then
			# split a quote-space-quote sequence into two paragraphs.
			'\n+' '\n\n'
			'" "' '"\n\n"'
		)

		local i
		for (( i = 0; i < ${#steps[@]}; i += 2 )); do
			sd "${steps[i]}" "${steps[i + 1]}" "$target" || return 1
		done

		# Fix some mis-capitalized words that do not follow sentence
		# punctuation (more can be added, depending on the OCR result).
		# perl is used here because sd does not support regex look-around.
		perl -pi -e 's/(?<![\.\?!]) Có/ có/g;' "$target" || return 1
		perl -pi -e 's/(?<![\.\?!]) Cô/ cô/g;' "$target" || return 1
		perl -pi -e 's/(?<![\.\?!]) Sô/ sô/g;' "$target" || return 1
		perl -pi -e 's/(?<![\.\?!]) Số/ số/g;' "$target" || return 1
	}

	regex_cleanup "$@"
No results found