Created
October 24, 2016 15:51
-
-
Save matusstafura/7cd2c2b4c67ba3ce86b7d7233f6de7fa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # SED remove all HTML tags | |
| sed -e 's/<[^>]*>//g' | |
| # SED multiple commands separated by ; (here: swap "ab" and "bc" using ~ as a temporary placeholder) | |
| sed 's/ab/~/g; s/bc/ab/g; s/~/bc/g' | |
| # SED delete empty lines | |
| sed '/^\s*$/d' | |
| # SED delete all lines except those matching pattern | |
| sed '/pattern/!d' | |
| # SED get all HTML links | |
| sed -n 's/.*href="\([^"]*\).*/\1/p' file.txt | |
| # GREP extract links from sitemap | |
| grep -Po 'http(s?)://[^ \"()\<>]*' file.xml > file_extracted.xml | |
| # WGET download from text file | |
| wget -b -e robots=off --wait 1 --limit-rate=300k -i sitemap.xml --user-agent="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.3) Gecko/2008092416 Firefox/3.0.3" | |
| # Linux - Remove all tags, and delete lines matching pattern, in every .xml file under the current folder | |
| find . -name "*.xml" -type f -exec sed -i 's/<[^>]*>//g; /pattern/d' {} \; | |
| # Linux count entries (files and directories) in the current directory | |
| ls -1 | wc -l | |
| # AWK - Remove duplicate lines | |
| awk '!seen[$0]++' file.txt > newfile.txt | |
| # REGEX match everything until the first / | |
| (?:(?!/).)* | |
| # REGEX html tags pattern | |
| <[^>]*> | |
| # REGEX match everything up to the last occurrence of - | |
| .*\- | |
| # REGEX match everything after the last / | |
| [^/]+$ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment