Skip to content

Instantly share code, notes, and snippets.

@szz
Last active March 26, 2021 08:19
Show Gist options
  • Select an option

  • Save szz/e85290b9a51b6202f6566197e4d0dfbd to your computer and use it in GitHub Desktop.

Select an option

Save szz/e85290b9a51b6202f6566197e4d0dfbd to your computer and use it in GitHub Desktop.
fixing gender parsing
/**
 * Scrapes figures out of the currently loaded page via `document` and renders
 * them as separator-delimited text (tab-separated rows by default).
 *
 * Two pages are recognised by `getInfoByPage()`:
 *   - "/"          : headline counters plus hospital / ventilator numbers
 *   - "/elhunytak" : the table of individual death cases
 *
 * The class reads the global `document` and `window`, so it is meant to be
 * evaluated in the context of the page being scraped.
 */
class koronaVirusGovHuParser {
    constructor() {
        this.cellSeparator = "\t";
        this.newLine = "\n";
        this.deathCasePageSelector = '.view-elhunytak .views-table > tbody:nth-child(2)';
        // Map of case id (number) -> parsed case record, filled by __parseDeathCasePage().
        this.knownCases = new Map();
        this.__initTextOutput();
    }

    /** Resets the accumulated text output to an empty string. */
    __initTextOutput() {
        this.textOutput = '';
    }

    /**
     * Formats a date as "YYYY-MM-DD", optionally followed by " HH:MM",
     * using the local-time getters of the Date object.
     *
     * @param {boolean} with_time - append hours and minutes only when strictly `true`
     * @param {Date} query_date_time - the moment to format (defaults to now)
     * @returns {string} the formatted timestamp
     */
    __getFormattedDateTime(with_time = false, query_date_time = new Date()) {
        const pad2 = (number) => (number < 10 ? '0' : '') + number;

        let formatted =
            `${query_date_time.getFullYear()}-` +
            `${pad2(query_date_time.getMonth() + 1)}-` + // getMonth() is zero-based
            `${pad2(query_date_time.getDate())}`;

        if (with_time === true) {
            formatted += ` ${pad2(query_date_time.getHours())}:${pad2(query_date_time.getMinutes())}`;
        }
        return formatted;
    }

    /**
     * Maps the gender text of a table cell to 'man' / 'woman'.
     *
     * Matching ignores letter case, surrounding whitespace and Unicode
     * composition form. "nõ" (o with tilde) is accepted as well as "nő",
     * as the original comparison list did.
     *
     * @param {string} gender - raw cell text
     * @param {number} id - case id, only used in the error message
     * @returns {string} 'man', 'woman', or '???' when the text is not recognised
     */
    __normalizeGender(gender, id) {
        const folded = String(gender).normalize('NFC').trim().toLowerCase();

        if (folded === 'férfi') {
            return 'man';
        }
        if (folded === 'nő' || folded === 'nõ') {
            return 'woman';
        }
        console.error('unknown gender "' + gender + '" @' + id);
        return '???';
    }

    /**
     * Placeholder: disease normalisation is not implemented.
     *
     * @param {string} diseases_list - raw disease text (currently ignored)
     * @returns {string} always the empty string
     */
    __normalizeDiseases(diseases_list) {
        return '';
    }

    /** Prepends one row (plus line terminator) to the text output. */
    __insertToTextOutput(row) {
        this.textOutput = row + this.newLine + this.textOutput;
    }

    /** Appends one row (plus line terminator) to the text output. */
    __addToTextOutput(row) {
        this.textOutput += row + this.newLine;
    }

    /**
     * Parses the death-case table found via `deathCasePageSelector`.
     *
     * Rows are read top to bottom and parsing stops at the first row whose id
     * is not greater than `last_known_case_id`. Each parsed row is *prepended*
     * to the text output, so the output order is the reverse of the table
     * order, and the record is stored in `knownCases` under its numeric id.
     * Output columns: id, today's date, gender, age, quoted disease text.
     *
     * @param {number} last_known_case_id - rows with an id at or below this stop the scan
     * @returns {number} id of the last row that was added, or 0 when none was
     */
    __parseDeathCasePage(last_known_case_id) {
        this.__initTextOutput();

        const table_body = document.querySelector(this.deathCasePageSelector);
        if (table_body === null) {
            console.error('death case table not found: ' + this.deathCasePageSelector);
            return 0;
        }

        // Computed once so every row of a single run carries the same date.
        const parsed_on = this.__getFormattedDateTime();
        let last_added_id = 0;

        for (const data_row of table_body.rows) {
            const cells = data_row.children;
            const id = Number.parseInt(cells.item(0).innerText, 10);

            // NaN compares false against everything, so without this guard a
            // row with an unreadable id would be emitted as a "NaN" case.
            if (Number.isNaN(id)) {
                console.error('skipping row with unparsable id "' + cells.item(0).innerText + '"');
                continue;
            }
            if (id <= last_known_case_id) {
                break;
            }

            const original_diseases = cells.item(3).innerText.trim();
            const case_data = {
                'id': id,
                'gender': this.__normalizeGender(cells.item(1).innerText.trim(), id),
                'age': cells.item(2).innerText.trim(),
                'original_diseases': original_diseases,
                'normalized_diseases': this.__normalizeDiseases(original_diseases),
            };

            this.__insertToTextOutput(
                [
                    case_data.id,
                    parsed_on,
                    case_data.gender,
                    case_data.age,
                    '"' + case_data.original_diseases + '"',
                ].join(this.cellSeparator)
            );

            // knownCases is a Map: entries must go through set(), a bracket
            // assignment would only create a property on the Map object.
            this.knownCases.set(id, case_data);
            last_added_id = id;
        }
        return last_added_id;
    }

    /**
     * Parses the main page: query time, last-update stamp, the headline
     * counters and, from the slider texts, the hospital / ventilator numbers.
     *
     * A counter whose element is missing is still written with an empty value
     * (and reported through console.error) so the row layout of the output
     * stays the same from run to run.
     */
    __parseMainPage() {
        this.__initTextOutput();

        // CSS selector of the counter element -> label used in the output.
        const selectors = {
            '#content-fertozott-pest': 'fertozott bp',
            '#content-fertozott-videk': 'fertozott videk',
            '#content-gyogyult-pest': 'gyogyult bp',
            '#content-gyogyult-videk': 'gyogyult videk',
            '#content-elhunyt-pest': 'elhunyt bp',
            '#content-elhunyt-videk': 'elhunyt videk',
            '#content-karantenban': 'karanten',
            '#content-mintavetel': 'elvegzett teszt',
            "#content-beoltottak": "oltottak szama",
        };

        this.__addToTextOutput('query time' + this.cellSeparator + this.__getFormattedDateTime(true));

        const last_update_element = document.querySelector('#block-block-1 div p');
        let last_update = '';
        if (last_update_element === null) {
            console.error('last update element not found: #block-block-1 div p');
        } else {
            // Drops the first 28 characters (presumably a fixed caption in front
            // of the date - confirm against the live page), then turns every '.'
            // into '-' and the first "- " into a single space.
            last_update = last_update_element.innerHTML
                .substring(28)
                .replace(/\./g, '-')
                .replace(/\- /, ' ');
        }
        this.__addToTextOutput('last update' + this.cellSeparator + last_update);

        for (const [selector, label] of Object.entries(selectors)) {
            const element = document.querySelector(selector);
            let value = '';
            if (element === null) {
                console.error('counter element not found: ' + selector);
            } else {
                // Strip all whitespace, e.g. digit-group separators.
                value = element.innerHTML.replace(/\s/g, '');
            }
            this.__addToTextOutput(label + this.cellSeparator + value);
        }

        const slider = document.querySelector("#flexslider-1 ul");
        if (slider === null) {
            console.error('slider not found: #flexslider-1 ul');
            return;
        }

        // Group 1: patients in hospital, group 3: patients on a ventilator; both
        // may contain whitespace between digit groups. The suffix class is [ae]
        // ("-an" / "-en"); the former "[a|e]" also accepted a literal '|'.
        const hospital_pattern = /\s*(([0-9]+\s)*[0-9]+) koronavírusos beteget ápolnak kórházban, közülük (([0-9]+\s)*[0-9]+)-[ae]n vannak lélegeztetőgépen\./i; //https://regexr.com/

        for (const slided_content of slider.children) {
            const found = slided_content.innerText.match(hospital_pattern);
            console.log(found);
            if (found !== null) {
                this.__addToTextOutput('korhazban' + this.cellSeparator + found[1].replace(/\s/g, ''));
                this.__addToTextOutput('lelegezteton' + this.cellSeparator + found[3].replace(/\s/g, ''));
            }
        }
    }

    /** @returns {string} the text accumulated by the last parse */
    getTextOutput() {
        return this.textOutput;
    }

    /**
     * Runs the parser that belongs to the current page, chosen by
     * `window.location.pathname`, and returns the resulting text.
     * On any other path nothing is parsed and the existing output is returned.
     *
     * @param {number} last_known_case_id - forwarded to the death-case parser
     * @returns {string} the text output
     */
    getInfoByPage(last_known_case_id = 1) {
        const pathname = window.location.pathname;

        if (pathname === '/') {
            this.__parseMainPage();
        } else if (pathname === '/elhunytak') {
            const last_added_id = this.__parseDeathCasePage(last_known_case_id);
            console.info('last_added_id:', last_added_id);
        }
        return this.getTextOutput();
    }
}
// Driver: parse whichever supported page is currently loaded and print the
// resulting text to the console.
// `var` (not const/let) is used on purpose so the snippet can be evaluated more
// than once in the same global scope without a redeclaration error.
var kvghp = new koronaVirusGovHuParser();
// Declared explicitly: assigning to an undeclared name throws a ReferenceError
// in strict mode and in ES modules. Death cases with an id at or below this
// value are not parsed again.
var last_known_case_id = 19224;
console.log(kvghp.getInfoByPage(last_known_case_id));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment