teleboas/Cleaning-RNCS.md

## Cleaning-RNCS.md

      
    Raw
  

              Cleaning-RNCS.md
            
          
    1. Exploring

$ rsync -a deploy@ns3107905.ip-54-37-87.eu://home/deploy/rncs_data/opendata-rncs.inpi.fr .

$ cd opendata-rncs.inpi.fr/public/IMR_Donnees_Saisies
$ tree -d -L 3 . 
IMR_Donnees_Saisies
├── tc
│   ├── flux
│   │   ├── 2017
│   │   ├── 2018
│   │   └── 2019
│   └── stock
│       ├── 2017
│       ├── 2018
│       └── 2019
└── titmc
    ├── flux
    │   ├── 2017
    │   ├── 2018
    │   └── 2019
    └── stock
        └── 2018

$ du -sh */*/*                                                                             
3.8G	tc/flux/2017
7.6G	tc/flux/2018
4.9G	tc/flux/2019
1.9G	tc/stock/2017
828K	tc/stock/2018
114M	tc/stock/2019
68M	titmc/flux/2017
164M	titmc/flux/2018
53M	titmc/flux/2019
79M	titmc/stock/2018

$ find ../IMR_Donnees_Saisies/tc/flux -name "*.csv" | awk -F_ '{ $1=$2=$3=$4=$5=$6=$7=""; print }'  | sort | uniq -c                                                                    
  94177        actes.csv
  94177        comptes annuels.csv
  94176        ets.csv
  94176        ets nouveau modifie EVT.csv
  94177        ets supprime EVT.csv
  94177        obs.csv
  94177        PM.csv
  94176        PM EVT.csv
  94176        PP.csv
  94176        PP EVT.csv
  94176        rep.csv
  94176        rep nouveau modifie EVT.csv
  94176        rep partant EVT.csv

$ find  tc/flux -name "*.csv" -print0 | xargs -0 file -b | sort | uniq -c | sort -k2nr
      1 8-channel Octalyzer module sound data Title: "\357\273\277Code Greffe;Nom_Greffe;Numero_Gestion;Siren;Type;Nom_Patronymique;Nom_Usage;Pseudonyme;Pr\303\251n"
1025317 UTF-8 Unicode (with BOM) text, with very long lines
   1070 UTF-8 Unicode (with BOM) text, with CRLF line terminators
 191957 UTF-8 Unicode (with BOM) text
      1 UTF-8 Unicode (with BOM) text, with LF, NEL line terminators
     25 UTF-8 Unicode (with BOM) text, with very long lines, with LF, NEL line terminators
      2 UTF-8 Unicode text, with very long lines, with CRLF line terminators
     35 data
   5885 UTF-8 Unicode (with BOM) text, with very long lines, with CRLF line terminators

 $ find tc/flux -name "*.csv" -exec cat {} + | wc -l  | sed ':a;s/\B[0-9]\{3\}\>/,&/;ta' | awk '{ print $1, "total lines" }' 

42,314,569 total lines

2. Cleaning defects in tc flux files

WARNING: Edits in place. Make a copy of the files before you run this.
This script is idempotent. It can be run several times on the same files, it will fix them the first time and not change them anymore afterwards.
The resulting files should be RFC4180 compliant.
#! /bin/bash

for file in "$@"; do
  if [ ! -f "$file" ]; then echo "The file does not exist" && break; fi
  if [ ! "${file##*.}" = "csv" ]; then echo "The file does not have a csv extension" && break; fi

  # Removing the BOM
  # | Use of a BOM is neither required nor recommended for UTF-8
  # source: https://www.unicode.org/versions/Unicode12.0.0/ch02.pdf#G19273
  # this should touch almost all files (1224255 at the time of writing)
  LANG=C LC_ALL=C sed -i '1s/^\xEF\xBB\xBF//' "$file"

  # Temporaryly removing the CRs
  # We'll add them back at the end
  # this should touch 6957 files
  LANG=C LC_ALL=C sed -i -E 's/\r//g' "$file"

  # Removing Unicode Character 'NEXT LINE (NEL)' (U+0085)
  # this is not strictly necessary but might avoid problems downstream
  # this should touch 26 files
  LANG=C LC_ALL=C sed -i 's/\xC2\x85/\x20/g' "$file"

  # Cleaning col "Décision" in actes.csv files
  # between 2018-01-24 and 2018-04-28
  # this should fix 11619 files
  if [[ $file =~ .*2018012[6789].*_actes\.csv ||
        $file =~ .*2018013.*_actes\.csv ||
        $file =~ .*20180[23].*_actes\.csv ||
        $file =~ .*201804[01].*_actes\.csv ||
        $file =~ .*2018042[012345678].*_actes\.csv ]]; then
    sed -i -E 's/([^"])" {1,2}"([^"])/\1 \2/g' "$file"
    sed -i -E 's/" ;/";/g' "$file"
    sed -i -E 's/; "/;"/g' "$file"
  fi
  
  # Cleaning col "Origine_Fonds_Info"
  # in ets.csv and ets_nouveau_modifie_EVT.csv files
  # between 2018-01-26 and 2018-02-01
  # this should fix 1730 files
  if [[ $file =~ .*20180[12].*_ets.csv ||
        $file =~ .*20180[12].*_ets_nouveau_modifie_EVT\.csv ]]; then
    sed -i -E 's/;"*([^";]*|([^";]*""[^";]*)*)"* - "(Précédent exploitant|Précédent propriétaire|Précédent propriétaire exploitant|Loueur du fonds|Propriétaire non exploitant)" - "*([^";]*|([^";]*""[^";]*)*)"*;/;"\1 - \3 - \4";/' "$file"
    sed -i -E 's/;"([^";]*|([^";]*""[^";]*)*)" -  - ;/;"\1 -  - ";/' "$file"
  fi

  # clean the /;"N;/ pattern
  # in EVT.csv files between 2017-08-24 and 2017-11-20
  # and in actes.csv on 2018-07-03
  # this should fix 56 files
  if [[ $file =~ .*2017.*_PP_EVT\.csv ||
        $file =~ .*2017.*_rep_partant_EVT\.csv ||
        $file =~ .*2017.*_rep_nouveau_modifie_EVT\.csv ||
        $file =~ .*20180703.*_actes\.csv ]]; then
    sed -i ':a;s/;"N;/;"N";/;ta' "$file"
  fi

  # force RFC4180 format for unconforming but parseable files
  # between 2017-05-12 and 2018-02-03
  # requires perl and Text::CSV_XS v>=1.29 (warning Text::CSV_PP might be too lax)
  # requires GNU awk v>=4.1.0 (for the new inplace option)
  # this should fix 9430 files
  if [[ $file =~ .*2017.*\.csv ||
        $file =~ .*201801.*\.csv ||
        $file =~ .*2018020[123].*\.csv ]]; then
    perl -MText::CSV -ne '
       BEGIN
       {$csv = Text::CSV->new(
          { sep_char => chr(59),
            quote_char => chr(34),
            escape_char => chr(34),
            binary => 1,
            strict => 1,
            auto_diag => 2,
            diag_verbose => 0 });}
       $csv->parse($_)' "$file" || \
    { perl -MText::CSV -ne '
       BEGIN
       {$csv = Text::CSV->new(
          { sep_char => chr(59),
            quote_char => chr(196),
            escape_char => chr(196),
            binary => 1,
            strict => 1,
            auto_diag => 2,
            diag_verbose => 0 });}
       $csv->parse($_)' "$file" && \
    awk -i inplace -v FS=';' -v OFS=';' '{
      for ( i=1; i<=NF; i++) {
        if ( $i ~ /[a-zA-Z]/ ) {
          gsub(/["]/,"",$i); $i="\"" $i "\""    # Remove dquotes, add them back.
          }
        }
      }1' "$file"; }
  fi

  # special case: guarding a few ungarded embedded double quotes
  if [[ $file =~ 9301_578_20180426_060039_6_rep_nouveau_modifie_EVT.csv ||
        $file =~ 5103_244_20180413_062946_6_rep_nouveau_modifie_EVT.csv ||
        $file =~ 91305_218_20180315_054206_6_rep_nouveau_modifie_EVT.csv ||
        $file =~ 5301_182_20180126_061912_9_ets_nouveau_modifie_EVT.csv ||
        $file =~ 2002_187_20180126_063640_9_ets_nouveau_modifie_EVT.csv ||
        $file =~ 0101_182_20180130_053127_9_ets_nouveau_modifie_EVT.csv ||
        $file =~ 3802_188_20180127_055646_8_ets.csv ||
        $file =~ 3302_377_20180130_065947_8_ets.csv ]]; then
    sed -i -E 's/;([^";]+)"([^";]+)";/;"\1""\2""";/' "$file"
  fi
  if [[ $file =~ 4901_192_20180130_063155_9_ets_nouveau_modifie_EVT.csv ]]; then
    sed -i -E 's/;"([^";]+)"([^";]+);/;"""\1""\2";/' "$file"
  fi
  if [[ $file =~ 6002_444_20180331_055404_3_PP.csv ||
        $file =~ 6002_450_20180408_091314_3_PP.csv ||
        $file =~ 6002_455_20180410_061903_3_PP.csv ||
        $file =~ 4101_193_20180126_055813_8_ets.csv ||
        $file =~ 9301_404_20180130_061441_8_ets.csv ]]; then
     sed -i -E 's/;([^";]+)"([^";]+)"([^";]+);/;"\1""\2""\3";/' "$file"
   fi

   # special case: bogus file with wrong number of fields
   if [[ $file =~ 1402_580_20190306_091648_5_rep.csv ]]; then
     sed -i 's/;supprimé;supprimé;supprimé;supprimé;supprimé;/;supprimé;supprimé;supprimé;supprimé;/' "$file"
   fi

   # special case: bogus empty files with a blank line
   if [[ $file =~ 5201_353_20181009_084321_4_PP_EVT.csv ||
       $file =~ 5201_353_20181009_084321_7_rep_partant_EVT.csv ]]; then
     sed -i '2 d' "$file"
   fi

  # special case: tc/flux/2017/09/28/1303/197/1303_197_20170928_074405_9_ets_nouveau_modifie_EVT.csv
  # this one is RFC4180 compliant but the content seems bogus (fields misalignement)

  # Fixing the line breaks
  # | Each record is located on a separate line, delimited by a line break (CRLF)
  # source: https://tools.ietf.org/html/rfc4180#section-2
  # this should touch all files (1224230 at the time of writing)
  LANG=C LC_ALL=C sed -i -E 's/([^\r])$/\1\r/g' "$file"

done
Still to take care off:
3405_180_20180103_093457_11_obs.csv
1305_218_20180315_054206_6_rep_nouveau_modifie_EVT.csv
3. Cleaning defects in tc stock files

Everything seems mostly fine, except the Ville_Naissance field in _rep.csv files.
Special case:
7608_S1_20170504_5_rep.csv:7608;"Rouen";2000D00774;433780855;P.Physique;METOT;;;Philippe, François, Jacques;;;;"";"""LA GARENNE"" ; CD 124";"";76570;"EMANVILLE";"";"FRANCE";1953-08-29;"76540 ROUEN ";"FRANCE";"Française";Gérant;;;;;;;;;;;;;;;;;;;;;1;2015-11-26;Nouveau dirigeant
Coming soon...
4. Checking that everyting is strictly RF4180 compliant

#! /bin/bash

exvalue=0;

for file in "$@"; do
  if [ ! -f "$file" ]; then echo "The file does not exist" && break; fi
  if [ ! "${file##*.}" = "csv" ]; then echo "The file does not have a csv extension" && 
break; fi

   # check strict RFC4180 compliance
   # requires perl and Text::CSV_XS v>=1.29 (warning Text::CSV_PP might be too lax)

     perl -MText::CSV -ne '
       BEGIN
       {$csv = Text::CSV->new(
          { sep_char => chr(59),
            quote_char => chr(34),
            escape_char => chr(34),
            binary => 1,
            strict => 1,
            auto_diag => 2,
            diag_verbose => 0 });}
       $csv->parse($_)' "$file" 2>/dev/null || { echo "$file is not RFC4180 compliant" &
& exvalue=1; }

done
exit $exvalue;

Then just use:
find . -name "*.csv" -print0 | xargs -0I{} check.sh {}

or better with GNU parallel:
find . -name "*.csv" -print0 | parallel check.sh {}
No results found