Skip to content

Instantly share code, notes, and snippets.

@leighlatham123
Last active July 28, 2021 12:38
Show Gist options
  • Select an option

  • Save leighlatham123/4b1f9be96f4bc34de3eeeea58a01549a to your computer and use it in GitHub Desktop.

Select an option

Save leighlatham123/4b1f9be96f4bc34de3eeeea58a01549a to your computer and use it in GitHub Desktop.
Validate spreadsheet file and its row data as UTF-8 (csv, xls, etc)
<?php
/**
 * MIME types the scanner accepts as spreadsheet input (CSV, plain text and
 * the Excel family).
 */
const MIMES = [
    'text/csv',
    'text/plain',
    'application/csv',
    'application/vnd.ms-excel',
    'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
    'application/vnd.ms-excel.sheet.macroEnabled.12',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
];
/**
 * [Running from command line. Comment out if you want to run somewhere else]
 *
 * Require the path of the file to scan as the first command line argument.
 * If no argument was passed, report the error and stop with a non-zero exit
 * status so that calling shells and scripts can detect the failure.
 * Otherwise hand the path to check_file_is_csv(), which aborts the script
 * when the file is missing or its mime type is not allowed.
 */
if ($argc < 2) {
    // die()/exit() with a string message always exits with status 0, which
    // looks like success to the caller; write to STDERR and exit(1) instead.
    fwrite(STDERR, "Error: Missing arguments. Please specify the full file path and name you wish to scan.\r\n");
    exit(1);
}

// $file is read by the encoding check and the row loop further down.
$file = $argv[1];
check_file_is_csv($file);
//
/**
 * Validate the encoding of the file specified against the UTF-8 format.
 * If the content is not valid UTF-8, report the exact or closest encoding
 * identified by the fileinfo extension.
 */
$contents = file_get_contents($file);
if ($contents === FALSE) {
    // Without this guard a failed read would be coerced to an empty string
    // and wrongly reported as valid UTF-8.
    echo "Unable to read the file specified on the system.\r\n";
    exit(1);
}

if (mb_check_encoding($contents, 'UTF-8')) {
    echo "File format encoding is valid UTF-8. \r\n";
} else {
    // Ask fileinfo for the charset directly instead of shelling out to
    // `file -I <path>`: the path was concatenated unescaped into the shell
    // command, `-I` is only understood by the BSD/macOS `file` binary, and
    // the regex result was used without checking that it matched.
    $detected = FALSE;
    $encoding_info = finfo_open(FILEINFO_MIME_ENCODING);
    if ($encoding_info !== FALSE) {
        $detected = finfo_file($encoding_info, $file);
    }
    $charset = ($detected !== FALSE && $detected !== '') ? $detected : 'unknown';
    echo "File format encoding is not valid UTF-8, it is " . $charset . "!\r\n";
}

// The whole-file buffer is not needed for the row-by-row pass that follows.
unset($contents);
/**
 * Loop through the specified file row by row (the first row is treated as a
 * header and is not inspected) and report every cell that contains a
 * non-ASCII character or is not valid UTF-8.
 */
$row = 0;
$handle = fopen($file, "r");
if ($handle === FALSE) {
    echo "Unable to open the file specified for reading.\r\n";
    exit(1);
}

// A length of 0 removes the line length limit: with a fixed length (it was
// 1000) any longer line is split into chunks and reported as several rows.
// Enclosure and escape are passed explicitly with their historical defaults.
while (($data = fgetcsv($handle, 0, ",", '"', "\\")) !== FALSE) {
    if ($row !== 0) {
        // $row is 0-based, the row number shown to the user is 1-based.
        $actual_row = $row + 1;

        foreach ($data as $c => $cell) {
            // fgetcsv() returns a single NULL field for a blank line.
            if ($cell === NULL) {
                continue;
            }

            // Column indexes are 0-based as well.
            $actual_col = $c + 1;

            $unsupported_char = has_unsupported_char($cell);
            if ($unsupported_char) {
                echo "Unsupported character '$unsupported_char' identified found on row #$actual_row in column $actual_col next to $cell.\n";
            }

            // mb_check_encoding() validates the bytes; mb_detect_encoding()
            // in its default non-strict mode only guesses and can accept
            // malformed sequences.
            if (!mb_check_encoding($cell, 'UTF-8')) {
                echo "None UTF-8 character identified found on #$actual_row in column $actual_col!\n";
            }
        }
    }
    $row++;
}

echo "\r\n";
echo "Total of $row rows checked.";
fclose($handle);
/**
* Functions
*/
/**
 * Find the first character outside the 7-bit ASCII range in a value.
 *
 * @param mixed $string Value to inspect. It is cast to string first, so NULL
 *                      (which fgetcsv() yields for a blank line) counts as
 *                      an empty string.
 * @return string|false The first non-ASCII character, or FALSE when there is
 *                      none. For valid UTF-8 input this is the complete
 *                      multibyte character; for any other input it is the
 *                      single offending byte.
 */
function has_unsupported_char($string){
    $string = (string) $string;
    if ($string === "") {
        return FALSE;
    }

    // In UTF-8 mode the negated class matches one whole code point above
    // U+007F, so the caller gets a printable character rather than the lead
    // byte of a multibyte sequence. preg_match() returns FALSE (not 0) when
    // the subject is not valid UTF-8, which sends us to the byte-wise pass.
    if (preg_match('/[^\x00-\x7F]/u', $string, $match) === 1) {
        return $match[0];
    }

    // Byte-wise pass for input that is not valid UTF-8 (e.g. Latin-1).
    if (preg_match('/[\x80-\xFF]/', $string, $match) === 1) {
        return $match[0];
    }

    return FALSE;
}
/**
 * Abort the script unless the file is readable and its detected mime type is
 * one of the types listed in MIMES.
 *
 * Despite the name, every mime type in MIMES is accepted, not only CSV.
 * On failure a message is printed and the script stops with exit status 1.
 *
 * @param string $file Path of the file to inspect.
 * @return void
 */
function check_file_is_csv($file)
{
    // Check readability explicitly instead of silencing finfo_file() with @,
    // so that genuine fileinfo problems are no longer hidden.
    if (!is_readable($file)) {
        echo "Unable to locate file specified on the system.";
        exit(1);
    }

    $finfo = finfo_open(FILEINFO_MIME_TYPE);
    $mime = FALSE;
    if ($finfo !== FALSE) {
        $mime = finfo_file($finfo, $file);
        // Release the handle before any of the exit paths below.
        finfo_close($finfo);
    }

    if (!$mime) {
        echo "Unable to determine the mime type of the file specified.";
        exit(1);
    }

    // Strict comparison: only an exact string match with a listed type passes.
    if (!in_array($mime, MIMES, TRUE)) {
        echo "Sorry, this mime type not allowed";
        exit(1);
    }
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment