This is a simple set of scripts for gathering and cleaning IRC logs. It's designed to work with the ##hplusroadmap logs at https://gnusha.org/logs/.
It relies on the ftfy Python library.
This is a simple set of scripts for gathering and cleaning IRC logs. It's designed to work with the ##hplusroadmap logs at https://gnusha.org/logs/.
It relies on the ftfy Python library.
# Simple bash script to clean up the log files. Depends on ftfy.
#
# Expects a logfiles/ directory (relative to the current directory) holding
# dated logs named YYYY-MM-DD.log. Writes one ftfy-repaired copy of each log
# to logfiles/cleaned/ and a single filtered concatenation of all of them to
# logfiles/logfiles.log.
cd logfiles || exit 1
mkdir -p cleaned || exit 1

# ftfy is python based, so it is installed into a local virtualenv
# (logfiles/venv). Abort if any setup step fails: without ftfy on PATH the
# cleaning step below would only produce empty files.
mkdir -p venv
mkdir -p venv/lib64 # workaround
python3 -m venv --copies venv/ || exit 1
. venv/bin/activate || exit 1
pip install ftfy || exit 1

echo "using ftfy to make cleaned files"
# Only YYYY-MM-DD.log files directly inside this directory match (the regex is
# applied to the whole path). Each file is handed to its own ftfy process, up
# to $(nproc) at a time; "_" fills $0 so the file name arrives as $1.
find . -regextype posix-extended -regex './[0-9]{4}-[0-9]{2}-[0-9]{2}\.log' -print0 |
  xargs -0 -n 1 -P "$(nproc)" sh -c 'ftfy "$1" -e latin-1 --preserve-entities > "cleaned/${1##*/}"' _

echo "filtering out timestamps and discord bridge spam"
# grep: drop every line containing "-!-" and every line starting with "---"
#       (-a treats the input as text, -h suppresses file name prefixes).
# sed, applied in order to each remaining line:
#   1. strip a leading "YYYY-MM-DDTHH" prefix. The intervals are written as
#      \{n\} because sed uses basic regular expressions here; a bare {n} is
#      literal text and never matched.
#      NOTE(review): this pattern ends after the hour, so on such lines the
#      rest of the timestamp (":MM...") is kept -- confirm the intended format.
#   2. strip a leading "HH:MM " timestamp.
#   3. rewrite a leading "< hprmbridge> " to "< ".
grep -ahv -e '-!-' -e '^---' cleaned/*.log |
  sed -e 's/^[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}//' \
      -e 's/^[0-9]\{2\}:[0-9]\{2\} //' \
      -e 's/^< hprmbridge> /< /' > logfiles.log

# Report the size of the combined output.
du -h logfiles.log
# Simple bash script to fetch all dated logs for hplusroadmap.
#
# Downloads the directory index from https://gnusha.org/logs/ into ./logindex,
# writes the URLs of all YYYY-MM-DD.log entries to ./urls.txt, and downloads
# them into ./logfiles/.
mkdir -p logfiles || exit 1

# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page, so a failed fetch stops here rather than being parsed below.
curl -f https://gnusha.org/logs/ -o logindex || exit 1

# 1. extract the href target of the (last) link on each line of the index
# 2. keep only names of the exact form YYYY-MM-DD.log (literal dot)
# 3. prefix each name with the base URL
sed -n 's/.*<a href="\([^"]*\)">.*/\1/p' logindex |
  grep '^[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}\.log$' |
  sed 's|^|https://gnusha.org/logs/|' > urls.txt

# Stop if the directory cannot be entered; otherwise wget would run in the
# wrong place and ../urls.txt would not resolve.
cd logfiles || exit 1

# -nc skips files that already exist locally, so re-runs only fetch new logs.
wget -nc -i ../urls.txt --tries=3