Instantly share code, notes, and snippets.
Last active
March 26, 2021 08:19
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
-
Save szz/e85290b9a51b6202f6566197e4d0dfbd to your computer and use it in GitHub Desktop.
fixing gender parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Browser-side scraper that turns the currently loaded page into
 * tab-separated text (one record per line).
 *
 * Judging by the class name the target site is koronavirus.gov.hu; that
 * cannot be verified from this file. Two pages are recognised by URL path:
 *   - "/"          : the headline counters plus the hospital/ventilator sentence
 *   - "/elhunytak" : the table of death cases
 *
 * Typical use: `new koronaVirusGovHuParser().getInfoByPage(lastKnownCaseId)`.
 */
class koronaVirusGovHuParser
{
    constructor()
    {
        this.cellSeparator = "\t";
        this.newLine = "\n";
        this.deathCasePageSelector = '.view-elhunytak .views-table > tbody:nth-child(2)';
        // Parsed death cases, keyed by numeric case id.
        this.knownCases = new Map();
        this.__initTextOutput();
    }

    /** Discards any text collected so far. */
    __initTextOutput()
    {
        this.textOutput = '';
    }

    /**
     * Formats a date as `YYYY-MM-DD`, optionally followed by ` HH:MM`,
     * using the local-time getters of `Date`.
     *
     * @param {boolean} with_time - append hours and minutes only when exactly `true`
     * @param {Date} query_date_time - moment to format, defaults to "now"
     * @returns {string}
     */
    __getFormattedDateTime(with_time = false, query_date_time = new Date())
    {
        const pad2 = (n) => (n < 10 ? '0' : '') + n;

        let formatted =
            `${query_date_time.getFullYear()}-` +
            `${pad2(query_date_time.getMonth() + 1)}-` + // getMonth() is 0-based
            `${pad2(query_date_time.getDate())}`;

        if (with_time === true)
        {
            formatted += ` ${pad2(query_date_time.getHours())}:${pad2(query_date_time.getMinutes())}`;
        }

        return formatted;
    }

    /**
     * Maps the gender text of a table cell to 'man' / 'woman'.
     *
     * The comparison ignores case, surrounding whitespace and Unicode
     * composition form. Both "nő" (o with double acute) and "nõ" (o with
     * tilde) are accepted, as in the original spelling list.
     *
     * @param {string} gender - raw cell text
     * @param {number} id - case id, used only in the error message
     * @returns {string} 'man', 'woman', or '???' when the text is not recognised
     */
    __normalizeGender(gender, id)
    {
        const key = String(gender).normalize('NFC').trim().toLowerCase();

        if (key === 'férfi') return 'man';
        if (key === 'nő' || key === 'nõ') return 'woman';

        console.error('unknown gender "' + gender + '" @' + id);
        return '???';
    }

    /**
     * Placeholder: disease normalisation is not implemented and always
     * yields an empty string.
     *
     * @param {string} diseases_list - raw disease text (currently ignored)
     * @returns {string} always ''
     */
    __normalizeDiseases(diseases_list)
    {
        return '';
    }

    /** Puts `row` (plus a line break) in front of the collected text. */
    __insertToTextOutput(row)
    {
        this.textOutput = row + this.newLine + this.textOutput;
    }

    /** Appends `row` (plus a line break) to the collected text. */
    __addToTextOutput(row)
    {
        this.textOutput += row + this.newLine;
    }

    /**
     * Returns the `innerHTML` of the first element matching `selector`,
     * or '' (with a console warning) when nothing matches, so that one
     * missing element does not abort the whole scrape.
     */
    __readInnerHtml(selector)
    {
        const element = document.querySelector(selector);
        if (element == null)
        {
            console.warn('element not found: ' + selector);
            return '';
        }
        return element.innerHTML;
    }

    /**
     * Reads the death-case table top to bottom and stops at the first row
     * whose id is not greater than `last_known_case_id`. Each accepted row
     * is stored in `knownCases` and *prepended* to the text output, so the
     * output order is the reverse of the table order.
     *
     * Rows with fewer than four cells or a non-numeric id are skipped.
     *
     * @param {number} last_known_case_id - highest id already processed
     * @returns {number} id of the last row added, or 0 when nothing was added
     */
    __parseDeathCasePage(last_known_case_id)
    {
        this.__initTextOutput();

        const table_body = document.querySelector(this.deathCasePageSelector);
        if (table_body == null)
        {
            console.error('death case table not found: ' + this.deathCasePageSelector);
            return 0;
        }

        // The date is the scrape date, not a date taken from the table.
        const query_date = this.__getFormattedDateTime();
        let last_added_id = 0;

        for (const data_row of table_body.rows)
        {
            const cells = data_row.children;
            if (cells.length < 4) continue;

            const id = Number.parseInt(cells.item(0).innerText, 10);
            if (Number.isNaN(id))
            {
                // NaN compares false with everything, so without this guard
                // the row would slip past the "already known" check below.
                console.warn('skipping row without numeric id');
                continue;
            }
            if (id <= last_known_case_id) break;

            const original_diseases = cells.item(3).innerText.trim();
            const case_data = {
                'id' : id,
                'gender' : this.__normalizeGender(cells.item(1).innerText.trim(), id),
                'age' : cells.item(2).innerText.trim(),
                'original_diseases' : original_diseases,
                // Kept on the record but not written to the text output.
                'normalized_diseases' : this.__normalizeDiseases(original_diseases),
            };

            this.__insertToTextOutput([
                case_data.id,
                query_date,
                case_data.gender,
                case_data.age,
                '"' + case_data.original_diseases + '"',
            ].join(this.cellSeparator));

            this.knownCases.set(id, case_data);
            last_added_id = id;
        }

        return last_added_id; // last parsed id
    }

    /**
     * Collects the main-page figures as `label<TAB>value` lines: query
     * time, last-update time, one line per counter element, and the
     * hospital / ventilator numbers for every slide whose text matches the
     * expected sentence.
     */
    __parseMainPage()
    {
        this.__initTextOutput();

        // [CSS selector, output label], emitted in this order.
        const counters = [
            ['#content-fertozott-pest', 'fertozott bp'],
            ['#content-fertozott-videk', 'fertozott videk'],
            ['#content-gyogyult-pest', 'gyogyult bp'],
            ['#content-gyogyult-videk', 'gyogyult videk'],
            ['#content-elhunyt-pest', 'elhunyt bp'],
            ['#content-elhunyt-videk', 'elhunyt videk'],
            ['#content-karantenban', 'karanten'],
            ['#content-mintavetel', 'elvegzett teszt'],
            ['#content-beoltottak', 'oltottak szama'],
        ];

        // Numbers may contain whitespace as thousands separators; the
        // suffix after the second number is "-an" or "-en".
        const hospital_sentence = /\s*((?:[0-9]+\s)*[0-9]+) koronavírusos beteget ápolnak kórházban, közülük ((?:[0-9]+\s)*[0-9]+)-[ae]n vannak lélegeztetőgépen\./i;

        this.__addToTextOutput('query time' + this.cellSeparator + this.__getFormattedDateTime(true));

        // NOTE(review): the first 28 characters are dropped, presumably a
        // fixed-length caption in front of the date; confirm against the page.
        // "YYYY.MM.DD. HH:MM" is then rewritten as "YYYY-MM-DD HH:MM".
        const last_update = this.__readInnerHtml('#block-block-1 div p')
            .substring(28)
            .replace(/\./g, '-')
            .replace(/\- /, ' ');
        this.__addToTextOutput('last update' + this.cellSeparator + last_update);

        for (const [selector, label] of counters)
        {
            const value = this.__readInnerHtml(selector).replace(/\s/g, '');
            this.__addToTextOutput(label + this.cellSeparator + value);
        }

        const slide_list = document.querySelector('#flexslider-1 ul');
        if (slide_list == null)
        {
            console.warn('element not found: #flexslider-1 ul');
            return;
        }

        for (const slide of slide_list.children)
        {
            const found = slide.innerText.match(hospital_sentence);
            if (found === null) continue;

            this.__addToTextOutput('korhazban' + this.cellSeparator + found[1].replace(/\s/g, ''));
            this.__addToTextOutput('lelegezteton' + this.cellSeparator + found[2].replace(/\s/g, ''));
        }
    }

    /** @returns {string} the text collected by the most recent parse */
    getTextOutput()
    {
        return this.textOutput;
    }

    /**
     * Parses whichever supported page is currently loaded and returns the
     * collected text. On any other path nothing is parsed and the text
     * collected earlier (initially '') is returned unchanged.
     *
     * @param {number} last_known_case_id - only used on the death-case page
     * @returns {string}
     */
    getInfoByPage(last_known_case_id = 1)
    {
        // Trailing slashes are ignored so "/elhunytak/" is recognised too.
        const pathname = window.location.pathname.replace(/\/+$/, '') || '/';

        if (pathname === '/')
        {
            this.__parseMainPage();
        }
        else if (pathname === '/elhunytak')
        {
            const last_added_id = this.__parseDeathCasePage(last_known_case_id);
            console.info('last_added_id:', last_added_id);
        }

        return this.getTextOutput();
    }
}
// Driver: scrape the page currently open in the browser and print the
// tab-separated result to the console.
const kvghp = new koronaVirusGovHuParser();
// Highest death-case id already recorded; rows with an id at or below
// this value are not emitted again. Declared explicitly: the bare
// assignment created an implicit global and throws in strict mode.
const last_known_case_id = 19224;
console.log(kvghp.getInfoByPage(last_known_case_id));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment