Last active
July 28, 2021 12:38
-
-
Save leighlatham123/4b1f9be96f4bc34de3eeeea58a01549a to your computer and use it in GitHub Desktop.
Validate a spreadsheet file and its row data as UTF-8 (csv, xls, etc.)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
/**
 * Mime types accepted as input files.
 *
 * check_file_is_csv() aborts the script when the detected mime type of the
 * file is not one of these values.
 */
const MIMES = [
    'text/csv',
    'text/plain',
    'application/csv',
    'application/vnd.ms-excel',
    'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
    'application/vnd.ms-excel.sheet.macroEnabled.12',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
];
/**
 * [Running from command line. Comment out if you want to run somewhere else]
 *
 * Check to see if the user has specified any arguments for the script to consume.
 * If no arguments were passed to the script, print the error and exit with a
 * non-zero status so that a calling shell or script can detect the failure
 * (die() with a string message would exit with status 0).
 * Otherwise take argument 1 as the file path and hand it to check_file_is_csv(),
 * which aborts the script when the file is missing or has a disallowed mime type.
 */
if ($argc < 2) {
    echo "Error: Missing arguments. Please specify the full file path and name you wish to scan.\r\n";
    exit(1);
}

$arguments = $argv;
$file = $arguments[1];
check_file_is_csv($file);
| // | |
/**
 * Validate the encoding of the file specified against the UTF-8 format.
 * If it is not valid UTF-8, report the exact or closest encoding identified.
 */
$contents = file_get_contents($file);
if ($contents === false) {
    // Without this check a read failure would be passed on to mb_check_encoding().
    echo "Error: Unable to read the file specified.\r\n";
    exit(1);
}

if (mb_check_encoding($contents, 'UTF-8')) {
    echo "File format encoding is valid UTF-8. \r\n";
} else {
    // Ask the fileinfo extension for the charset instead of shelling out to
    // `file -I <path>`: the path comes straight from the command line, so
    // concatenating it into a shell command allows command injection, and the
    // -I flag is not portable across `file` implementations.
    $detector = new finfo(FILEINFO_MIME_ENCODING);
    $charset = $detector->file($file);
    if ($charset === false || $charset === '') {
        // Detection can fail; never print an undefined value.
        $charset = 'unknown';
    }
    echo "File format encoding is not valid UTF-8, it is " . $charset . "!\r\n";
}
// The full file contents are not needed any more; the rows are streamed below.
unset($contents);
/**
 * Loop through the specified file row by row and report, for every field
 * after the header row, (a) the first non-ASCII character found by
 * has_unsupported_char() and (b) whether the field is not valid UTF-8.
 *
 * Row and column numbers in the messages are 1-based.
 */
$row = 0;
$handle = fopen($file, "r");
if ($handle === false) {
    // Previously a failed fopen() was skipped silently and nothing was reported.
    echo "Error: Unable to open the file specified for reading.\r\n";
    exit(1);
}

// Length 0 = no line length limit (a fixed 1000 would split longer lines into
// bogus extra rows). Enclosure and escape are the historical defaults, passed
// explicitly because relying on the implicit escape default is deprecated.
while (($data = fgetcsv($handle, 0, ",", '"', "\\")) !== false) {
    // $row === 0 is the header row and is not checked.
    if ($row !== 0) {
        // $row is a 0-based index, so the human-readable row number is one higher.
        $actual_row = $row + 1;

        foreach ($data as $c => $field) {
            // A blank line is returned as an array holding a single null field.
            if ($field === null) {
                continue;
            }

            // $c is a 0-based array index; columns are reported 1-based.
            $actual_col = $c + 1;

            $unsupported_char = has_unsupported_char($field);
            if ($unsupported_char) {
                echo "Unsupported character '$unsupported_char' identified found on row #$actual_row in column $actual_col next to $field.\n";
            }

            // mb_check_encoding() validates the bytes; mb_detect_encoding() in
            // its default non-strict mode is a guess, not a validity check.
            if (!mb_check_encoding($field, 'UTF-8')) {
                echo "None UTF-8 character identified found on #$actual_row in column $actual_col!\n";
            }
        }
    }
    $row++;
}

echo "\r\n";
// NOTE: $row counts every row read, including the header row.
echo "Total of $row rows checked.";
fclose($handle);
| /** | |
| * Functions | |
| */ | |
/**
 * Find the first non-ASCII character in a string.
 *
 * When the first byte above 127 starts a well-formed UTF-8 sequence, the whole
 * multi-byte character is returned so that it can be printed intact. When it
 * does not (for example a single-byte Latin-1 character), only that raw byte
 * is returned.
 *
 * @param string|null $string Field value to inspect.
 *
 * @return string|false The first non-ASCII character (or raw byte), or false
 *                      when the value is empty or contains only ASCII.
 */
function has_unsupported_char($string)
{
    if ($string === null || $string === false || $string === '') {
        return false;
    }
    $string = (string) $string;

    // Byte-mode search (no /u flag) so it also works on invalid UTF-8 input.
    if (preg_match('/[\x80-\xFF]/', $string, $match, PREG_OFFSET_CAPTURE) !== 1) {
        return false;
    }

    $offset = $match[0][1];
    $lead = ord($string[$offset]);

    // Expected UTF-8 sequence length as announced by the lead byte.
    if ($lead >= 0xC2 && $lead <= 0xDF) {
        $length = 2;
    } elseif ($lead >= 0xE0 && $lead <= 0xEF) {
        $length = 3;
    } elseif ($lead >= 0xF0 && $lead <= 0xF4) {
        $length = 4;
    } else {
        $length = 1;
    }

    if ($length > 1) {
        $candidate = substr($string, $offset, $length);
        // An empty pattern with the /u flag matches only if the subject is valid UTF-8.
        if (strlen($candidate) === $length && preg_match('//u', $candidate) === 1) {
            return $candidate;
        }
    }

    // Not a well-formed UTF-8 sequence: report the offending byte on its own.
    return $string[$offset];
}
/**
 * Abort the script unless the given file exists, is readable, and has a mime
 * type listed in MIMES.
 *
 * On any failure a message is printed and the script exits with status 1, so
 * that a calling shell or script can detect the error.
 *
 * @param string $file Path of the file to check.
 *
 * @return void
 */
function check_file_is_csv($file)
{
    // Check the path up front instead of silencing finfo's warning with @.
    if (!is_string($file) || !is_file($file) || !is_readable($file)) {
        echo "Unable to locate file specified on the system.";
        exit(1);
    }

    // The finfo object is released automatically when it goes out of scope,
    // so no explicit close is needed on any exit path.
    $finfo = new finfo(FILEINFO_MIME_TYPE);
    $mime = $finfo->file($file);

    if (!$mime) {
        echo "Unable to determine the mime type of the file specified.";
        exit(1);
    }

    // Strict comparison: mime types are strings and must match exactly.
    if (!in_array($mime, MIMES, true)) {
        echo "Sorry, this mime type not allowed";
        exit(1);
    }
}
| ?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment