Last active
January 3, 2026 02:35
-
-
Save fabiolimace/c2f3478931277e036eff7f1c4c000dc2 to your computer and use it in GitHub Desktop.
Removed from Note Keeper (formerly APKM) at 02 jan 2026
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/awk -f | |
| # | |
| # | |
| # NOTE: | |
| # | |
| # This file is being refactored. Only a few things work. | |
| # The original code is safe in `apt-html.original.awk`. | |
| # | |
| # | |
| # | |
| # | |
| # Converts markdown to HTML | |
| # | |
| # See: | |
| # | |
| # * https://spec.commonmark.org | |
| # * https://markdown-it.github.io | |
| # * https://www.javatpoint.com/markdown | |
| # * https://www.markdownguide.org/cheat-sheet | |
| # * https://www.markdownguide.org/extended-syntax | |
| # * https://pandoc.org/MANUAL.html#pandocs-markdown | |
| # * https://www.dotcms.com/docs/latest/markdown-syntax | |
| # * https://www.codecademy.com/resources/docs/markdown | |
| # * https://daringfireball.net/projects/markdown/syntax | |
| # * https://www.ecovida.org.br/docs/manual_site/markdown | |
| # * https://quarto.org/docs/authoring/markdown-basics.html | |
| # * https://docs.github.com/en/get-started/writing-on-github | |
| # * https://fuchsia.dev/fuchsia-src/contribute/docs/markdown | |
| # * https://www.ibm.com/docs/en/SSYKAV?topic=train-how-do-use-markdown | |
| # * https://www.knowledgehut.com/blog/web-development/what-is-markdown | |
| # * https://www.ionos.com/digitalguide/websites/web-development/markdown/ | |
| # * https://learn.microsoft.com/en-us/contribute/content/markdown-reference | |
| # * https://developer.mozilla.org/en-US/docs/MDN/Writing_guidelines/Howto/Markdown_in_MDN | |
| # * https://confluence.atlassian.com/bitbucketserver/markdown-syntax-guide-776639995.html | |
| # * https://learn.microsoft.com/en-us/azure/devops/project/wiki/markdown-guidance?view=azure-devops | |
| # * https://medium.com/analytics-vidhya/the-ultimate-markdown-guide-for-jupyter-notebook-d5e5abf728fd | |
# Returns 1 when the top of the stack is "root", "blockquote" or "li",
# otherwise 0.
function ready() {
    if (at("root")) {
        return 1;
    }
    if (at("blockquote")) {
        return 1;
    }
    return at("li");
}
# Returns 1 when the stack index is 0 (only the sentinel slot remains),
# otherwise 0.
function empty() {
    return (idx == 0) ? 1 : 0;
}
# Returns the tag stored at the top of the stack (stk[idx]).
function peek(    top) {
    top = stk[idx];
    return top;
}
# Returns the attribute string stored with the top-of-stack tag.
function peek_attr(    top) {
    top = stk_attr[idx];
    return top;
}
# Returns the indentation recorded (by push) for the top-of-stack tag.
function peek_spaces(    top) {
    top = stk_spaces[idx];
    return top;
}
# Looks up key='value' in the attribute string of the top-of-stack tag and
# returns the value without its quotes, or "" when the key is absent.
#
#   key    attribute name to look for
#
# `found` and `attr` are locals. The original assigned `attr` without
# declaring it, leaking a global on every call.
function peek_value(key,    found, attr) {
    # Prepend a space so the key only matches at a word boundary.
    attr = " " peek_attr();
    if (match(attr, "[ ]" key "='[^']*'") > 0) {
        found = substr(attr, RSTART, RLENGTH);
        match(found, "='[^']*'");
        # Skip the leading =' and drop the trailing quote.
        return substr(found, RSTART + 2, RLENGTH - 3);
    }
    return "";
}
# Increments the global counter `id` and returns its new value.
function identifier() {
    id = id + 1;
    return id;
}
# Returns 1 when the top-of-stack tag equals `tag`, otherwise 0.
function at(tag) {
    if (peek() == tag) {
        return 1;
    }
    return 0;
}
# Returns 1 when the top-of-stack tag is one of the comma-separated `tags`,
# otherwise the empty string.
function any(tags,    i, n, arr) {
    n = split(tags, arr, ",");
    i = 0;
    while (++i <= n) {
        if (at(arr[i])) {
            return 1;
        }
    }
    return "";
}
# Pops the stack only when its top is `tag`. Returns what pop() returns,
# or "" when nothing was popped.
function pop_at(tag) {
    if (!at(tag)) {
        return "";
    }
    return pop();
}
# Pops the stack only when its top is one of the comma-separated `tags`.
# Returns what pop() returns, or "" when nothing was popped.
function pop_any(tags) {
    if (!any(tags)) {
        return "";
    }
    return pop();
}
# True when the top of the stack is a list element ("ol", "ul" or "li").
# push() and pop() open and close these tags separately instead of
# printing them in one go.
function container(    is_list) {
    is_list = any("ol,ul,li");
    return is_list;
}
# Closes the element at the top of the stack and removes it.
# Containers were already opened by push(), so only their pending text and
# closing tag are printed; any other element is printed whole here.
# Returns the removed tag, or "" when the stack is empty.
function pop() {
    if (empty()) {
        return "";
    }
    if (!container()) {
        print_tag();
    } else {
        print_buf();
        close_tag();
    }
    return unpush();
}
# Returns the number of spaces before the first non-space character of the
# current record. When the record has no non-space character the failed
# match leaves RLENGTH at -1 and that value is returned.
function spaces() {
    match($0, /^[ ]*[^ ]/);
    if (RLENGTH > 0) {
        # The match includes the first non-space character itself.
        return RLENGTH - 1;
    }
    return RLENGTH;
}
# Pushes `tag` (with optional attribute string `attr`) onto the stack and
# records the indentation of the current record with it.
# A pending "ol"/"ul" is closed first unless the new tag is "li".
# Containers are opened immediately; other tags are printed on pop().
function push(tag, attr) {
    pop_list(tag);
    idx = idx + 1;
    stk_spaces[idx] = spaces();
    stk_attr[idx] = attr;
    stk[idx] = tag;
    if (!container()) {
        return;
    }
    print_buf();
    open_tag();
}
# Closes a list ("ol"/"ul") sitting at the top of the stack when the tag
# about to be pushed is anything other than a list item.
function pop_list(tag) {
    if (!any("ol,ul")) {
        return;
    }
    if (tag != "li") {
        pop();
    }
}
# Removes the top stack entry without printing anything and returns its tag.
# When the stack is empty nothing is removed and the sentinel tag is returned.
function unpush(    top) {
    top = peek();
    if (empty()) {
        return top;
    }
    delete stk_spaces[idx];
    delete stk_attr[idx];
    delete stk[idx];
    idx--;
    return top;
}
# Prints the top-of-stack element in full: opening tag, buffered text,
# closing tag.
function print_tag() {
    open_tag();
    print_buf();
    close_tag();
}
# Prints the opening tag for the top-of-stack element.
# "br"/"hr" are printed bare, "pre"/"code" are delegated to open_pre(), and
# every other tag is printed with its attribute string when it has one.
function open_tag(    name, attrs) {
    name = peek();
    if (name == "br" || name == "hr") {
        printf "<%s>\n", name;
        return;
    }
    if (name == "pre" || name == "code") {
        open_pre(peek_value("title"));
        return;
    }
    attrs = peek_attr();
    if (attrs) {
        printf "<%s %s>\n", name, attrs;
    } else {
        printf "<%s>\n", name;
    }
}
# Prints the closing tag for the top-of-stack element.
# "br"/"hr" are empty elements and print nothing; "pre"/"code" are
# delegated to close_pre().
function close_tag(    name) {
    name = peek();
    if (name == "br" || name == "hr") {
        return;
    }
    if (name == "pre" || name == "code") {
        close_pre();
        return;
    }
    printf "</%s>\n", name;
}
# Appends `str` to the global text buffer `buf`.
# Inside "pre"/"code" lines are joined verbatim with newlines. Elsewhere
# they are trimmed and joined with a single space, and a line ending in two
# or more spaces gets a line-break tag appended.
function buffer(str,    sep, verbatim) {
    verbatim = (at("pre") || at("code"));
    sep = verbatim ? "\n" : " ";
    if (!verbatim) {
        if (str ~ /[ ][ ]+$/) {
            str = rtrim(str) make_tag("br");
        }
        str = trim(str);
    }
    buf = (buf == "") ? str : (buf sep str);
}
# Renders and prints the pending text buffer, then clears it.
# Code content is only escaped; other content goes through the inline
# converters, whose order matters.
function print_buf(    text) {
    text = buf;
    if (at("pre") || at("code")) {
        text = escape(text);
    } else {
        text = styles(reflinks(links(images(footnotes(angles(text))))));
    }
    if (text != "") {
        print text;
    }
    buf = "";
}
# Returns `str` when it is truthy, otherwise `alternative`.
function coalesce(str, alternative) {
    if (str) {
        return str;
    }
    return alternative;
}
# Prints the opening markup of a code block.
#   title   caption shown in the block header; defaults to ">_"
# A fresh identifier is drawn for the block body so the header buttons can
# address it. When TEST is set only a bare <pre><code> is printed.
function open_pre(title,    id, btns) {
    id = identifier();
    title = coalesce(title, ">_");
    if (TEST) {
        printf "<pre><code>\n";
        return;
    }
    btns = buttons(id);
    printf "<div class='codeblock'>";
    printf "<div class='codeblock-head'>";
    printf "<span class='codeblock-title'>%s</span>", title;
    printf "<span class='codeblock-buttons'>%s</span>", btns;
    printf "</div>";
    printf "<pre class='codeblock-body' id='%s'>", id;
    printf "<code class='codeblock-code'>";
}
# Prints the closing markup of a code block. Outside TEST mode the
# surrounding <div> opened by open_pre() is closed as well.
function close_pre() {
    printf "</code></pre>\n";
    if (!TEST) {
        printf "</div>\n";
    }
}
# Builds the HTML for the three code-block header buttons (copy, collapse,
# word wrap), each calling the matching script function with `id`.
#   id   identifier of the <pre> element the buttons act on
# Returns the buttons concatenated in the order copy, collapse, word wrap.
#
# The icon variables are locals. The original assigned them without
# declaring them, leaking three globals. `style` is unused and kept only so
# the positions of the existing parameters do not shift.
function buttons(id,    style, copy, collapse, wordwrap, copy_icon, collapse_icon, wordwrap_icon) {
    copy_icon = "📋";
    collapse_icon = "↕";
    wordwrap_icon = "↵";
    copy = "<button onclick='copy(" id ")' title='Copy'>" copy_icon "</button>";
    collapse = "<button onclick='collapse(" id ")' title='Collapse'>" collapse_icon "</button>";
    wordwrap = "<button onclick='wordwrap(" id ")' title='Word wrap'>" wordwrap_icon "</button>";
    return copy collapse wordwrap;
}
# Applies every inline style converter to `buf` and returns the result.
# The converters run in a fixed order: code spans and formulas first, then
# emphasis, then the remaining marks.
function styles(buf) {
    buf = formula(snippet(buf));
    buf = underscore(asterisk(buf));
    buf = inserted(deleted(buf));
    buf = highlighted(buf);
    buf = subscript(superscript(buf));
    return buf;
}
# Converts ``text`` and `text` into <code> elements. The double mark is
# handled before the single one.
function snippet(buf) {
    return apply_style(apply_style(buf, "``", "code"), "`", "code");
}
# Converts $$text$$ and $text$ into <code> elements. The double mark is
# handled before the single one.
function formula(buf) {
    return apply_style(apply_style(buf, "$$", "code"), "$", "code");
}
# Converts __text__ into <strong> and _text_ into <em>. The double mark is
# handled before the single one.
function underscore(buf) {
    return apply_style(apply_style(buf, "__", "strong"), "_", "em");
}
# Converts **text** into <strong> and *text* into <em>. The double mark is
# handled before the single one.
function asterisk(buf) {
    return apply_style(apply_style(buf, "**", "strong"), "*", "em");
}
# Converts ~~text~~ into <del> elements.
function deleted(buf,    styled) {
    styled = apply_style(buf, "~~", "del");
    return styled;
}
# Converts ++text++ into <ins> elements.
function inserted(buf,    styled) {
    styled = apply_style(buf, "++", "ins");
    return styled;
}
# Converts ==text== into <mark> elements.
function highlighted(buf,    styled) {
    styled = apply_style(buf, "==", "mark");
    return styled;
}
# Converts ^text^ into <sup> elements.
function superscript(buf,    styled) {
    styled = apply_style(buf, "^", "sup");
    return styled;
}
# Converts ~text~ into <sub> elements.
function subscript(buf,    styled) {
    styled = apply_style(buf, "~", "sub");
    return styled;
}
# Replaces every `mark`text`mark` pair in `buf` with <tag>text</tag>.
#   buf    text to scan
#   mark   literal delimiter, e.g. "**"
#   tag    HTML tag name to wrap the enclosed text in
# Content wrapped in "code" is escaped first so later converters leave it
# alone. Scanning stops at the first opening mark that has no closing mark
# or that encloses nothing; the rest of the text is returned untouched.
#
# `len` and `position` are locals. The original assigned them without
# declaring them, leaking two globals.
function apply_style(buf, mark, tag,    out, found, rstart, rlength, len, position) {
    out = "";
    len = length(mark);
    position = index(buf, mark);
    while (position > 0) {
        # rstart is the first character after the opening mark.
        rstart = position + len;
        rlength = index(substr(buf, rstart), mark) - 1;
        if (rlength <= 0) {
            break;
        }
        found = substr(buf, rstart, rlength);
        if (tag == "code") {
            found = escape(found);
        }
        out = out substr(buf, 1, position - 1) make_tag(tag, found);
        buf = substr(buf, rstart + rlength + len);
        position = index(buf, mark);
    }
    return out buf;
}
# Escapes `str` for literal inclusion in the HTML output.
# The HTML special characters become named entities. The markdown marks
# become numeric entities so the inline style converters that run later
# cannot interpret them.
#
# In a gsub() replacement "\\&" yields a literal "&", so "\\&lt;" produces
# "&lt;". The original replacements had lost their entity names ("\\&",
# "\\<", ...), which left "&" unchanged and turned "<" into "\<".
# The ampersand must be handled first, before any entity is introduced.
function escape(str) {
    # html special characters
    gsub(/[&]/, "\\&amp;", str);
    gsub(/[<]/, "\\&lt;", str);
    gsub(/[>]/, "\\&gt;", str);
    # markdown special characters
    gsub(/[$]/, "\\&#36;", str);
    gsub(/[*]/, "\\&#42;", str);
    gsub(/[+]/, "\\&#43;", str);
    gsub(/[-]/, "\\&#45;", str);
    gsub(/[=]/, "\\&#61;", str);
    gsub(/[\^]/, "\\&#94;", str);
    gsub(/[_]/, "\\&#95;", str);
    gsub(/[`]/, "\\&#96;", str);
    gsub(/[~]/, "\\&#126;", str);
    return str;
}
# Returns the part of `str` that ends `x` characters before position
# `start`. `x` defaults to 1, i.e. everything before `start`.
function prefix(str, start, x) {
    if (!x) {
        x = 1;
    }
    return substr(str, 1, start - x);
}
# Returns the part of `str` that begins `x` characters after position
# `end`. `x` defaults to 1, i.e. everything after `end`.
# `start` does not influence the result.
function suffix(str, start, end, x) {
    if (!x) {
        x = 1;
    }
    return substr(str, end + x);
}
# Returns the text between positions `start` and `end` of `str`.
#   x   offset added to `start` for the first character (default 1)
#   y   amount subtracted from the span length (default 1)
# With the defaults this is the text strictly between the two positions.
function extract(str, start, end, x, y) {
    if (!x) {
        x = 1;
    }
    if (!y) {
        y = 1;
    }
    return substr(str, start + x, (end - start) - y);
}
# TODO: change order: tag, attr, text (<tag attr>text</tag>)
#
# Builds an HTML element as a string.
#   tag    element name
#   text   content; when falsy a self-closing element is produced
#   attr   optional attribute string placed after the tag name
function make_tag(tag, text, attr,    head) {
    head = "<" tag;
    if (attr) {
        head = head " " attr;
    }
    if (text) {
        return head ">" text "</" tag ">";
    }
    return head "/>";
}
# TODO: change order: href, title, text (<a href title>text</a>)
#
# Builds an <a> element for `text` pointing at `href`. The title attribute
# is added only when `title` is truthy.
function make_link(text, href, title,    attrs) {
    attrs = "href='" href "'";
    if (title) {
        attrs = attrs " title='" title "'";
    }
    return make_tag("a", text, attrs);
}
# TODO: change order and names: href, title, alt (<a href title alt/>)
#
# Builds an <img> element with `text` as alt text and `href` as source.
# The title attribute is added only when `title` is truthy.
function make_image(text, href, title,    attrs) {
    attrs = "alt='" text "' src='" href "'";
    if (title) {
        attrs = attrs " title='" title "'";
    }
    return make_tag("img", "", attrs);
}
# Builds the in-text marker for footnote `ref`: a superscript "[ref]" that
# links to the anchor "#foot-<ref>".
#
# The <sup> element is now closed with </sup>. The original emitted a
# second opening <sup>, leaving two unclosed elements per footnote.
function make_footnote(ref) {
    return make_tag("a", "<sup>[" ref "]</sup>", "href='#foot-" ref "'");
}
# TODO: change order: ref, text (<a href="ref">text</a>)
#
# Builds an <a> element for `text` pointing at the anchor "#link-<ref>".
function make_reflink(text, ref,    attrs) {
    attrs = "href='#link-" ref "'";
    return make_tag("a", text, attrs);
}
# Converts autolinks written in angle brackets into <a> elements:
#
#   <ftp...>
#   <http...>
#   <https...>
#   <email@...>
#
# Every converted target is also recorded with push_link(). Any other
# <...> span is copied through unchanged.
#
# The closing ">" is searched for after the opening "<". The original took
# the first ">" of the whole buffer, so a stray ">" earlier in the text
# (e.g. "a > b <http://x>") stopped the scan before any link was converted.
function angles(buf,    start, end, href, out) {
    out = "";
    while ((start = index(buf, "<")) > 0) {
        end = index(substr(buf, start + 1), ">");
        if (end == 0) {
            break;
        }
        end += start;
        href = extract(buf, start, end);
        if (index(href, "http") == 1 || index(href, "ftp") == 1) {
            push_link(id++, href);
            out = out prefix(buf, start) make_link(href, href);
        } else if (index(href, "@") > 1) {
            push_link(id++, "mailto:" href);
            out = out prefix(buf, start) make_link(href, "mailto:" href);
        } else {
            # not an autolink: give the span back, brackets included
            out = out prefix(buf, end + 1);
        }
        buf = suffix(buf, start, end);
    }
    return out buf;
}
# Converts inline links into <a> elements and records each one with
# push_link():
#
#   [text](href)
#   [text](href "title")
#
# Fixes over the original:
#  * `title` is reset for every link; it used to leak from a titled link
#    into every later link without a title.
#  * The closing ")" is searched for after "](". A ")" earlier in the
#    text used to stop the scan.
#  * The opening "[" is the one that balances the "]" of "](", found by
#    scanning backwards. The original took the first "[" of the buffer, so
#    "[x] and [a](b)" produced the link text "x] and [a".
function links(buf,    regex, start, end, mid, t1, t2, temp, text, href, title, out, i, c, depth) {
    out = "";
    while ((mid = index(buf, "](")) > 0) {
        end = index(substr(buf, mid + 2), ")");
        if (end == 0) {
            break; # no closing parenthesis anywhere ahead
        }
        end += mid + 1;
        # walk left from "]" to its matching "["
        start = 0;
        depth = 1;
        for (i = mid - 1; i > 0; i--) {
            c = substr(buf, i, 1);
            if (c == "]") {
                depth++;
            } else if (c == "[" && --depth == 0) {
                start = i;
                break;
            }
        }
        if (start == 0) {
            # "](" without an opening bracket: copy it through and move on
            out = out substr(buf, 1, mid + 1);
            buf = substr(buf, mid + 2);
            continue;
        }
        title = "";
        text = extract(buf, start, mid);
        href = extract(buf, mid, end, 2, 2);
        # optional "title" in double quotes after the address
        t1 = index(href, "\"");
        t2 = index(substr(href, t1 + 1), "\"") + t1;
        if (0 < t1 && t1 < t2) {
            temp = href;
            href = trim(prefix(temp, t1));
            title = trim(extract(temp, t1, t2));
        }
        out = out prefix(buf, start) make_link(text, href, title);
        push_link(id++, href, title, text);
        buf = suffix(buf, start, end);
    }
    return out buf;
}
# Converts inline images into <img> elements:
#
#   ![alt](src)
#   ![alt](src "title")
#
# Fixes over the original:
#  * `title` is reset for every image; it used to leak from a titled image
#    into every later image without a title.
#  * "](" is searched for after "![" and ")" after "](". The original used
#    the first occurrence in the whole buffer, so a link or parenthesis
#    earlier in the text (e.g. "[a](b) ![c](d)") left the image unconverted.
function images(buf,    regex, start, end, mid, t1, t2, temp, text, href, title, out) {
    out = "";
    while ((start = index(buf, "![")) > 0) {
        mid = index(substr(buf, start + 2), "](");
        if (mid == 0) {
            break;
        }
        mid += start + 1;
        end = index(substr(buf, mid + 2), ")");
        if (end == 0) {
            break;
        }
        end += mid + 1;
        title = "";
        text = extract(buf, start, mid, 2, 2);
        href = extract(buf, mid, end, 2, 2);
        # optional "title" in double quotes after the source
        t1 = index(href, "\"");
        t2 = index(substr(href, t1 + 1), "\"") + t1;
        if (0 < t1 && t1 < t2) {
            temp = href;
            href = trim(prefix(temp, t1));
            title = trim(extract(temp, t1, t2));
        }
        out = out prefix(buf, start) make_image(text, href, title);
        buf = suffix(buf, start, end);
    }
    return out buf;
}
# Converts footnote references into in-text markers:
#
#   [^footnote]
#
# The closing "]" is searched for after the opening "[^". The original
# took the first "]" of the whole buffer, so any bracket earlier in the
# text (e.g. "[x] note[^1]") stopped the scan.
function footnotes(buf,    regex, start, end, ref, out) {
    out = "";
    while ((start = index(buf, "[^")) > 0) {
        end = index(substr(buf, start + 2), "]");
        if (end == 0) {
            break;
        }
        end += start + 1;
        ref = extract(buf, start, end, 2, 2);
        out = out prefix(buf, start) make_footnote(ref);
        buf = suffix(buf, start, end);
    }
    return out buf;
}
# Converts reference links into <a> elements pointing at "#link-<ref>":
#
#   [text][ref]
#   [text] [ref]
#
# Two bracket groups form a reference link when at most one character
# separates them (unchanged from the original).
#
# Fixes over the original:
#  * A bracket group that is not followed by a reference is copied to the
#    output. The original skipped it without copying, so a lone "[x]"
#    (including the "[n]" inside a rendered footnote marker) vanished.
#  * The first "]" is searched for after the first "[", so a stray "]"
#    earlier in the text no longer stops the scan.
#  * After a non-matching group, scanning resumes right behind it instead
#    of behind the next group, so "[a] text [b][c]" still converts "[b][c]".
function reflinks(buf,    start, end, mid1, mid2, out, text, ref) {
    out = "";
    while ((start = index(buf, "[")) > 0) {
        mid1 = index(substr(buf, start + 1), "]");
        if (mid1 == 0) {
            break;
        }
        mid1 += start;
        mid2 = index(substr(buf, mid1 + 1), "[") + mid1;
        end = index(substr(buf, mid2 + 1), "]") + mid2;
        if (mid1 < mid2 && mid2 < end && mid2 - mid1 <= 2) {
            text = extract(buf, start, mid1);
            ref = extract(buf, mid2, end, 1, 1);
            out = out prefix(buf, start) make_reflink(text, ref);
            buf = substr(buf, end + 1);
        } else {
            # plain bracketed text: keep it and continue behind it
            out = out substr(buf, 1, mid1);
            buf = substr(buf, mid1 + 1);
        }
    }
    return out buf;
}
# Prints the fixed start of the HTML document: doctype, <head> with the
# embedded stylesheet and the script used by the code-block buttons, and
# the opening <body> tag.
#
# A charset declaration is emitted first inside <head>. The generated page
# contains non-ASCII characters (the button and link icons), and without a
# declaration a browser may decode them with a legacy encoding.
function print_header() {
    print "<!DOCTYPE html>";
    print "<html>";
    print "<head>";
    print "<meta charset='utf-8'>";
    print "<title></title>";
    print "<style>";
    print " :root {";
    print " --gray: #efefef;";
    print " --black: #444;";
    print " --dark-gray: #aaaaaa;";
    print " --light-gray: #fafafa;";
    print " --dark-blue: #0000ff;";
    print " --light-blue: #0969da;";
    print " --light-yellow: #fafaaa;";
    print " }";
    print " html {";
    print " font-size: 16px;";
    print " max-width: 100%;";
    print " }";
    print " body {";
    print " padding: 1rem;";
    print " margin: 0 auto;";
    print " max-width: 50rem;";
    print " line-height: 1.5rem;";
    print " font-family: sans-serif;";
    print " color: var(--black);";
    print " }";
    print " p {";
    print " font-size: 1rem;";
    print " margin-bottom: 1.3rem;";
    print " }";
    print " a, a:visited { color: var(--light-blue); }";
    print " a:hover, a:focus, a:active { color: var(--dark-blue); }";
    print " h1 { font-size: 1.7rem; }";
    print " h2 { font-size: 1.4rem; }";
    print " h3 { font-size: 1.1rem; }";
    print " h4 { font-size: 1.1rem; }";
    print " h5 { font-size: 0.8rem; }";
    print " h6 { font-size: 0.8rem; }";
    print " h1, h2 {";
    print " padding-bottom: 0.5rem;";
    print " border-bottom: 2px solid var(--gray);";
    print " }";
    print " h1, h2, h3, h4, h5, h6 {";
    print " font-weight: bold;";
    print " font-style: normal;";
    print " margin: 1.4rem 0 .5rem;";
    print " }";
    print " h3, h5 {";
    print " font-weight: bold;";
    print " font-style: normal;";
    print " }";
    print " h4, h6 {";
    print " font-weight: normal;";
    print " font-style: italic;";
    print " }";
    print " div.codeblock {";
    print " border-radius: .4rem;";
    print " background-color: var(--gray);";
    print " border: 1px solid var(--dark-gray);";
    print " }";
    print " div.codeblock-head {";
    print " margin: 0rem 0rem;";
    print " padding: 0rem 0rem;";
    print " border-bottom: 1px solid var(--dark-gray);";
    print " }";
    print " span.codeblock-title {";
    print " font-weight: bold;";
    print " margin: 0rem 0rem;";
    print " padding: 0rem 1rem;";
    print " }";
    print " span.codeblock-buttons {";
    print " float: right;";
    print " font-weight: bold;";
    print " margin: 0rem 0rem;";
    print " padding: 0rem 1rem;";
    print " }";
    print " pre.codeblock-body {";
    print " overflow-x:auto;";
    print " margin: 0rem 0rem;";
    print " padding: 1rem 1rem;";
    print " line-height: 1.0rem;";
    print " }";
    print " code.codeblock-code {";
    print " font-size: 0.8rem;";
    print " margin: 0rem 0rem;";
    print " padding: 0rem 0rem;";
    print " font-family: monospace;";
    print " }";
    print " code {";
    print " border-radius: .2rem;";
    print " padding: 0.1rem 0.3rem;";
    print " font-family: monospace;";
    print " background-color: var(--gray);";
    print " }";
    print " mark {";
    print " padding: 0.1rem 0.3rem;";
    print " border-radius: .2rem;";
    print " background-color: var(--light-yellow);";
    print " }";
    print " blockquote {";
    print " margin: 1.5rem;";
    print " padding: 1rem;";
    print " border-radius: .4rem;";
    print " background-color: var(--light-gray);";
    print " border: 1px solid var(--dark-gray);";
    print " border-left: 12px solid var(--dark-gray);";
    print " }";
    print " dt { font-weight: bold; }";
    print " hr { border: 1px solid var(--dark-gray); }";
    print " img { height: auto; max-width: 100%; }";
    print " table { border-collapse: collapse; margin-bottom: 1.3rem; }";
    print " th { padding: .7rem; border-bottom: 1px solid var(--black);}";
    print " td { padding: .7rem; border-bottom: 1px solid var(--gray);}";
    print "</style>";
    print "<script>";
    print " function copy(id) {";
    print " var element = document.getElementById(id);";
    print " navigator.clipboard.writeText(element.textContent);";
    print " }";
    print " function wordwrap(id) {";
    print " var element = document.getElementById(id);";
    print " if (element.style.whiteSpace != 'pre-wrap') {";
    print " element.style.whiteSpace = 'pre-wrap';";
    print " } else {";
    print " element.style.whiteSpace = 'pre';";
    print " }";
    print " }";
    print " function collapse(id) {";
    print " var element = document.getElementById(id);";
    print " if (element.style.display != 'none') {";
    print " element.style.display = 'none';";
    print " } else {";
    print " element.style.display = 'block';";
    print " }";
    print " }";
    print "</script>";
    print "</head>";
    print "<body>";
}
# Prints the end of the HTML document: a <footer> listing the recorded
# links and footnotes (when there are any), then the closing </body> and
# </html> tags.
#
# Link data comes from the arrays filled by push_link(). The footnote
# arrays and footnote_count are read here but are not populated anywhere in
# the code visible in this file.
# NOTE(review): confirm where footnotes are meant to be collected.
#
# Footnote entries carry id='foot-<ref>', which is the anchor the in-text
# footnote markers link to ("#foot-<ref>"). The original used
# id='link-<ref>', so the markers had no target and the id could collide
# with an entry of the link list.
function print_footer(    i, ref, href, title, text) {
    print "<footer>";
    if (link_count > 0 || footnote_count > 0) {
        print "<hr>";
    }
    if (link_count > 0) {
        print "<h6>LINKS</h6>";
        print "<ol>";
        for (i = 1; i <= link_count; i++) {
            ref = link_ref[i];
            href = link_href[i];
            title = link_title[i];
            if (title == "") {
                # untitled links are listed by their address
                title = href;
            }
            print make_tag("li", title " <a href='" href "' id='link-" ref "'>🔗</a>");
        }
        print "</ol>";
    }
    if (footnote_count > 0) {
        print "<h6>FOOTNOTES</h6>";
        print "<ol>";
        for (i = 1; i <= footnote_count; i++) {
            ref = footnote_ref[i];
            text = footnote_text[i];
            print make_tag("li", text " <a href='#foot-" ref "' id='foot-" ref "'>🔗</a>");
        }
        print "</ol>";
    }
    print "</footer>";
    print "</body>";
    print "</html>";
}
# Initialises the converter state and prints the document header.
BEGIN {
    # slot 0 of the stack is the "root" sentinel; idx 0 means "empty"
    idx = 0;
    stk[0] = "root";
    stk_attr[0] = "";
    stk_spaces[0] = 0;
    buf = "";

    # line prefixes, kept as dynamic regular expressions
    blockquote_prefix = "^[ ]*>[ ]?";
    ul_prefix = "^([ ][ ][ ][ ])*([ ]|[ ][ ]|[ ][ ][ ])?[*+-][ ]";
    ol_prefix = "^([ ][ ][ ][ ])*([ ]|[ ][ ]|[ ][ ][ ])?[0-9]+\\.[ ]";

    blank = -1; # prepare to signal blank line

    print_header();
}
# Pops elements until `tag` is at the top of the stack or the stack is
# empty.
function pop_until(tag) {
    while (!(empty() || at(tag))) {
        pop();
    }
}
# Counts the "blockquote" entries currently on the stack.
function level_blockquote(    i, n) {
    n = 0;
    i = idx;
    while (i > 0) {
        if (stk[i] == "blockquote") {
            n++;
        }
        i--;
    }
    return n;
}
# Counts the "ul"/"ol" entries on the stack, walking down from the top and
# stopping at the first "blockquote".
function level_list(    i, n) {
    n = 0;
    i = idx;
    while (i > 0) {
        if (stk[i] == "ul" || stk[i] == "ol") {
            n++;
        }
        if (stk[i] == "blockquote") {
            break;
        }
        i--;
    }
    return n;
}
# Counts how many leading groups of four spaces `line` starts with.
function count_indent(line,    levels) {
    levels = count_prefix(line, "^[ ][ ][ ][ ]");
    return levels;
}
# Counts how many times the regular expression `pref` can be stripped from
# the start of `line` in succession.
function count_prefix(line, pref,    n) {
    for (n = 0; sub(pref, "", line); n++) {
        # each successful removal is one occurrence
    }
    return n;
}
# Strips every leading group of four spaces from `line`.
function remove_indent(line,    stripped) {
    stripped = remove_prefix(line, "^[ ][ ][ ][ ]");
    return stripped;
}
# Strips the regular expression `pref` from the start of `line` for as long
# as it keeps matching (e.g. repeated quote marks) and returns the rest.
function remove_prefix(line, pref) {
    while (sub(pref, "", line)) {
        # keep stripping
    }
    return line;
}
# Returns the smaller of `x` and `y` (`x` when they compare equal).
function min(x, y) {
    if (x <= y) {
        return x;
    }
    return y;
}
# Returns the larger of `x` and `y` (`x` when they compare equal).
function max(x, y) {
    if (x >= y) {
        return x;
    }
    return y;
}
# Removes leading spaces and tabs from `s`.
function ltrim(s) {
    sub(/^[ \t]+/, "", s);
    return s;
}

# Removes trailing spaces and tabs from `s`.
function rtrim(s) {
    sub(/[ \t]+$/, "", s);
    return s;
}

# Removes leading and trailing spaces and tabs from `s`.
function trim(s) {
    s = ltrim(s);
    return rtrim(s);
}
# Turns `str` into a lower-case slug: every run of characters other than
# ASCII letters and digits becomes a single hyphen.
function slug(str) {
    gsub(/[^a-zA-Z0-9]+/, "-", str);
    return tolower(str);
}
# Records a link in the global link_* arrays for later listing in the
# footer and increments link_count.
function push_link(ref, href, title, text,    n) {
    n = ++link_count;
    link_ref[n] = ref;
    link_href[n] = href;
    link_title[n] = title;
    link_text[n] = text;
}
# Undoes the last push: drops the top stack entry without printing it,
# clears the text buffer, and returns the text that was buffered.
function undo(    pending) {
    pending = buf;
    buf = "";
    unpush();
    return pending;
}
| #=========================================== | |
| # TABULATION | |
| #=========================================== | |
# Expands a tab at the start of the record into four spaces so the
# indentation-based rules see spaces only. Only the first tab is replaced.
/^\t/ {
    s = " ";
    sub(/^\t/, s s s s);
}
| #=========================================== | |
| # BLOCKQUOTES | |
| #=========================================== | |
# Removes one leading quote mark from the current record, together with
# the spaces before and after it.
function unblockquote() {
    sub(/^[ ]*>[ ]*/, "");
}
# Blockquote line (one level): strip the quote mark and buffer the text.
# A blockquote is opened first unless one is already at the top of the
# stack; any other open element except the root is closed before that.
#
# The rule is skipped inside a fenced code block. It runs before the rule
# that collects code lines, so without the guard a code line starting with
# ">" closed the code block and opened a blockquote.
!at("code") && /^[ ]*>[ ]*/ {
    if (!at("blockquote")) {
        if (!at("root")) {
            pop();
        }
        push("blockquote");
    }
    unblockquote();
    buffer($0);
    next;
}
| #=========================================== | |
| # LISTS | |
| #=========================================== | |
# List item line ("* ", "+ ", "- " or "1. " followed by text).
# The item text is buffered in a new "li". The indentation of the line is
# compared with the one recorded for the current item:
#   deeper     open a nested list inside the current item
#   shallower  close the item, its list and the enclosing item
#   equal      close the item and start a sibling
# Outside a list item a new list is opened, after closing any open element
# other than the root.
#
# The rule is skipped inside a fenced code block. It runs before the rule
# that collects code lines, so without the guard a code line such as
# "- x" closed the code block and started a list.
!at("code") && /^([ ]*[*+-][ ]+|[ ]*[0-9]+[.][ ]+).+$/ {
    str = $0; # copy register
    # detect the type of list and strip its marker
    if (str ~ /^[ ]*[*+-][ ]+/) {
        ulol = "ul";
        sub(/^[ ]*[*+-][ ]+/, "", str);
    } else {
        ulol = "ol";
        sub(/^[ ]*[0-9]+[.][ ]+/, "", str);
    }
    # compare spaces
    a = peek_spaces();
    b = spaces();
    if (at("li")) {
        if (b > a) {
            push(ulol);
        } else if (b < a) {
            pop();
            pop();
            pop();
        } else {
            pop();
        }
    } else {
        if (!at("root")) {
            pop();
        }
        push(ulol);
    }
    push("li");
    buffer(str);
    next;
}
| #=========================================== | |
| # CODE BLOCKS | |
| #=========================================== | |
# Removes one level of indentation (four leading spaces) from the current
# record.
function unindent() {
    sub(/^[ ][ ][ ][ ]/, "");
}
# Code fence line. Inside a fenced block it closes the block. Otherwise it
# opens one, after closing any open element other than the root. The first
# word after the backticks is stored as the block's title attribute.
/^```/ {
    if (at("code")) {
        pop();
        next;
    }
    if (!at("root")) {
        pop();
    }
    sub(/^`+/, "");
    push("code", "title='" $1 "'");
    next;
}
# Inside a fenced code block every line is buffered verbatim.
peek() == "code" {
    buffer($0);
    next;
}
# Line indented by four spaces: part of an indented code block ("pre").
# A block is opened first unless one is already at the top of the stack;
# any other open element except the root is closed before that. One level
# of indentation is removed before the line is buffered.
/^[ ][ ][ ][ ]/ {
    if (!at("pre")) {
        if (!at("root")) {
            pop();
        }
        push("pre");
    }
    unindent();
    buffer($0);
    next;
}
| #=========================================== | |
| # HEADING | |
| #=========================================== | |
# ATX heading: one or more hashes followed by a space. The number of
# hashes (at most 6) selects h1..h6; leading and trailing hashes are
# removed from the text. Any open element other than the root is closed
# first.
#
# The level is captured immediately after match(). The original read
# RLENGTH only after pop(), but closing an indented code block goes through
# peek_value(), which calls match() itself and overwrites RLENGTH, so a
# heading following a "pre" block came out as "<h-1>".
/^[\x23]+[ ]+/ {
    # count header level
    match($0, /^[\x23]+/);
    heading_level = min(RLENGTH, 6);
    # remove all leading hashes
    sub(/^[\x23]+[ ]*/, "", $0);
    # remove all trailing hashes
    sub(/[ ]*[\x23]+$/, "", $0);
    if (!at("root")) {
        pop();
    }
    push("h" heading_level);
    buffer($0);
    next;
}
# Setext heading, level 1: a line of "=" right after paragraph text.
# The open paragraph is replaced by an h1 (the buffered text is kept) and
# printed at once.
at("p") && /^=+[ ]*$/ {
    unpush();
    push("h1");
    pop();
    next;
}
# Setext heading, level 2: a line of "-" right after paragraph text.
# The open paragraph is replaced by an h2 (the buffered text is kept) and
# printed at once.
at("p") && /^-+[ ]*$/ {
    unpush();
    push("h2");
    pop();
    next;
}
| #=========================================== | |
| # HORIZONTAL RULER | |
| #=========================================== | |
# TODO: fix <hr> between <ul|ol> and <li>
#
# Horizontal ruler: a line made of three or more "*", "_" or "-".
# Any open element other than the root is closed, then the <hr> is pushed
# and popped right away so it is printed immediately.
/^[*_-][*_-][*_-]+[ ]*$/ {
    if (!at("root")) {
        pop();
    }
    push("hr");
    pop();
    next;
}
| #=========================================== | |
| # BLANK | |
| #=========================================== | |
# Blank line. It is kept as an empty line inside an indented code block,
# ignored inside a list item or at the root, and closes any other open
# element.
/^[ ]*$/ {
    blank_flag = 1;
    if (at("pre")) {
        buffer("");
    } else if (!at("li") && !at("root")) {
        pop();
    }
    next;
}
| #=========================================== | |
| # PARAGRAPH | |
| #=========================================== | |
# Any other non-empty line is text. It continues an open paragraph or
# heading; otherwise a new paragraph is opened, after closing any open
# element other than the root.
/^.+$/ {
    if (!at("p") && !any("h1,h2,h3,h4,h5,h6")) {
        if (!at("root")) {
            pop();
        }
        push("p");
    }
    buffer($0);
    next;
}
# Fallback for records that no earlier rule consumed with `next`: clear the
# flag set by the blank-line rule.
{
    blank_flag = 0
}
| #=========================================== | |
| # THE END | |
| #=========================================== | |
# End of input: close whatever is still open and finish the document.
#
# Every element left on the stack is closed. The original popped only a
# paragraph, one list item, a code block and a heading, so a document
# ending inside a list or blockquote kept <ul>/<ol>/<blockquote> open.
# The footer is printed as well; it closes the <body> and <html> elements
# that print_header() opened in BEGIN and that were never closed.
END {
    while (!empty()) {
        pop();
    }
    print_footer();
    # compatible end of file,
    # e.g., `diff`, `ed` etc.
    printf "\n";
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/awk -f | |
| # | |
| # Converts markdown to HTML | |
| # | |
| # See: | |
| # | |
| # * https://spec.commonmark.org | |
| # * https://markdown-it.github.io | |
| # * https://www.javatpoint.com/markdown | |
| # * https://www.markdownguide.org/cheat-sheet | |
| # * https://www.markdownguide.org/extended-syntax | |
| # * https://pandoc.org/MANUAL.html#pandocs-markdown | |
| # * https://www.dotcms.com/docs/latest/markdown-syntax | |
| # * https://www.codecademy.com/resources/docs/markdown | |
| # * https://daringfireball.net/projects/markdown/syntax | |
| # * https://www.ecovida.org.br/docs/manual_site/markdown | |
| # * https://quarto.org/docs/authoring/markdown-basics.html | |
| # * https://docs.github.com/en/get-started/writing-on-github | |
| # * https://fuchsia.dev/fuchsia-src/contribute/docs/markdown | |
| # * https://www.ibm.com/docs/en/SSYKAV?topic=train-how-do-use-markdown | |
| # * https://www.knowledgehut.com/blog/web-development/what-is-markdown | |
| # * https://www.ionos.com/digitalguide/websites/web-development/markdown/ | |
| # * https://learn.microsoft.com/en-us/contribute/content/markdown-reference | |
| # * https://developer.mozilla.org/en-US/docs/MDN/Writing_guidelines/Howto/Markdown_in_MDN | |
| # * https://confluence.atlassian.com/bitbucketserver/markdown-syntax-guide-776639995.html | |
| # * https://learn.microsoft.com/en-us/azure/devops/project/wiki/markdown-guidance?view=azure-devops | |
| # * https://medium.com/analytics-vidhya/the-ultimate-markdown-guide-for-jupyter-notebook-d5e5abf728fd | |
# Returns 1 when the top of the stack is "root", "blockquote" or "li",
# otherwise 0.
function ready() {
    if (at("root")) {
        return 1;
    }
    if (at("blockquote")) {
        return 1;
    }
    return at("li");
}
# Returns 1 when the stack index is 0, otherwise 0.
function empty() {
    return (idx == 0) ? 1 : 0;
}
# Returns 1 when the top-of-stack tag equals `tag`, otherwise 0.
function at(tag) {
    if (peek() == tag) {
        return 1;
    }
    return 0;
}
# Returns the tag stored at the top of the stack (stk[idx]).
function peek(    top) {
    top = stk[idx];
    return top;
}
# Returns the attribute string stored with the top-of-stack tag.
function peek_attr(    top) {
    top = stk_attr[idx];
    return top;
}
# Pushes `tag` (with optional attribute string `attr`) onto the stack and
# prints its opening tag, passing along a freshly incremented identifier.
# "br" and "hr" are popped again right away.
# Returns the identifier drawn for this element.
function push(tag, attr) {
    id++;
    idx++;
    stk[idx] = tag;
    stk_attr[idx] = attr;
    open_tag(id);
    # close <br> and <hr>
    if (at("br")) {
        pop();
    } else if (at("hr")) {
        pop();
    }
    return id;
}
# Prints the closing tag of the top-of-stack element and removes it.
# Returns the removed tag, or "" when the stack is empty.
function pop() {
    if (!empty()) {
        close_tag();
        return unpush();
    }
    return "";
}
# Removes the top stack entry without printing anything and returns its tag.
# When the stack is empty nothing is removed.
function unpush(    top) {
    top = peek();
    if (empty()) {
        return top;
    }
    delete stk_attr[idx];
    delete stk[idx];
    idx--;
    return top;
}
# Renders and prints the pending text buffer, then clears it.
# Code content is only escaped; other content goes through the inline
# converters, whose order matters.
function write(    text) {
    text = buf;
    if (at("pre") || at("code")) {
        text = escape(text);
    } else {
        text = styles(reflinks(links(images(footnotes(diamonds(text))))));
    }
    if (text != "") {
        print text;
    }
    buf = "";
}
# Appends `str` to the global text buffer `buf`.
#   str   text to append
#   sep   separator placed before `str` when the buffer is not empty;
#         defaults to a newline inside "pre"/"code" and to a space elsewhere
# Outside code, the text is trimmed and a line ending in two or more spaces
# gets a <br> appended.
#
# The line-break test now looks only at the end of the line. The original
# pattern /^[^ ]+[ ][ ]+$/ required the whole line to be a single word, so
# "hello world  " never produced a line break.
function append(str, sep) {
    if (at("pre") || at("code")) {
        if (sep == "") sep = "\n";
    } else {
        if (sep == "") sep = " ";
        # append 2-spaces line break
        if (str ~ /[ ][ ]+$/) {
            str = rtrim(str) "<br>";
        }
        str = trim(str);
    }
    if (buf == "") {
        buf = str;
    } else {
        buf = buf sep str;
    }
}
# Flushes pending text, then prints the opening tag for the top-of-stack
# element. "br"/"hr" are printed bare, "pre"/"code" are delegated to
# open_pre() with `id`, and every other tag is printed with its attribute
# string when it has one.
# `tag` and `attr` are assigned as globals, as in the original.
function open_tag(id) {
    write();
    tag = peek();
    attr = peek_attr();
    if (at("br") || at("hr")) {
        printf "<%s>\n", tag;
        return;
    }
    if (at("pre") || at("code")) {
        open_pre(id, peek_value("title"));
        return;
    }
    # if (at("h1") || at("h2") || at("h3")) {
    #     if (!attr) {
    #         attr = "id='" id "'";
    #     } else {
    #         attr = "id='" id "' " attr;
    #     }
    # }
    if (attr) {
        printf "<%s %s>\n", tag, attr;
    } else {
        printf "<%s>\n", tag;
    }
}
# Flushes pending text, then prints the closing tag for the top-of-stack
# element. "br"/"hr" print nothing (they have no closing tag); "pre"/"code"
# are delegated to close_pre().
function close_tag() {
    write();
    if (at("br") || at("hr")) {
        return;
    }
    if (at("pre") || at("code")) {
        close_pre();
    } else {
        printf "</%s>\n", peek();
    }
}
# Returns the value of attribute `key` (written as key='value') of the tag
# on top of the stack, or "" when the attribute is absent.
#
# `key` is spliced into a regular expression as-is. `attr` is a local so the
# function does not overwrite a global of the same name.
function peek_value(key,    found, attr) {
    # leading space lets the pattern require a separator before the key
    attr = " " peek_attr();
    if (match(attr, "[ ]" key "='[^']*'") > 0) {
        found = substr(attr, RSTART, RLENGTH);
        match(found, "='[^']*'");
        # skip "='" at the start and drop the closing quote
        return substr(found, RSTART + 2, RLENGTH - 3);
    }
    return "";
}
# Prints the opening markup of a code block: a head with `title` and the
# buttons for element `id`, followed by the body <div> carrying that id.
function open_pre(id, title) {
    printf "<pre><div class='pre-head'>";
    printf "<span>%s</span>%s", title, buttons(id);
    printf "</div><div class='pre-body' id='%s'>", id;
}
# Prints the closing markup that matches open_pre().
function close_pre() {
    printf "</div></pre>";
}
# Returns the HTML of the three buttons shown in a code block head; each one
# calls the script function of the same name with `id`.
#
# The emitted order (word-wrap, collapse, clipboard) is unchanged. The local
# names now match what they hold, and `collapse` is a local instead of a
# global.
function buttons(id,    collapse, wordwrap, clipboard) {
    collapse = "<button onclick='collapse(" id ")' title='Toggle collapse' class='pre-button'>↕</button>";
    wordwrap = "<button onclick='wordwrap(" id ")' title='Toggle word-wrap' class='pre-button'>⏎</button>";
    clipboard = "<button onclick='clipboard(" id ")' title='Copy to clipboard' class='pre-button'>📋</button>";
    return wordwrap collapse clipboard;
}
| # TODO: change order: tag, attr, text (<tag attr>text</tag>) | |
# Builds an HTML element as a string.
#   tag   element name
#   text  element content; when empty a self-closing element is produced
#   attr  optional attribute string placed after the tag name
function make(tag, text, attr,    head) {
    head = "<" tag;
    if (attr) {
        head = head " " attr;
    }
    if (text) {
        return head ">" text "</" tag ">";
    }
    return head "/>";
}
# Inline code: ``text`` first, then `text`; both become <code>.
function snippet(buf) {
    return apply_style(apply_style(buf, "``", 2, "code"), "`", 1, "code");
}
# Formulas: $$text$$ first, then $text$; both become <code>.
function formula(buf) {
    return apply_style(apply_style(buf, "$$", 2, "code"), "$", 1, "code");
}
# __text__ becomes <strong>, then _text_ becomes <em>.
function underscore(buf) {
    return apply_style(apply_style(buf, "__", 2, "strong"), "_", 1, "em");
}
# **text** becomes <strong>, then *text* becomes <em>.
function asterisk(buf) {
    return apply_style(apply_style(buf, "**", 2, "strong"), "*", 1, "em");
}
# ~~text~~ becomes <del>.
function deleted(buf) {
    buf = apply_style(buf, "~~", 2, "del");
    return buf;
}
# ++text++ becomes <ins>.
function inserted(buf) {
    buf = apply_style(buf, "++", 2, "ins");
    return buf;
}
# ==text== becomes <mark>.
function highlighted(buf) {
    buf = apply_style(buf, "==", 2, "mark");
    return buf;
}
# ^text^ becomes <sup>.
function superscript(buf) {
    buf = apply_style(buf, "^", 1, "sup");
    return buf;
}
# ~text~ becomes <sub>.
function subscript(buf) {
    buf = apply_style(buf, "~", 1, "sub");
    return buf;
}
# Applies every inline style converter to `buf` and returns the result.
# Call order (innermost first): snippet, formula, asterisk, underscore,
# deleted, inserted, highlighted, superscript, subscript.
function styles(buf) {
    buf = formula(snippet(buf));
    buf = underscore(asterisk(buf));
    buf = highlighted(inserted(deleted(buf)));
    return subscript(superscript(buf));
}
# Wraps every span delimited by `mark` in an HTML element.
#
#   buf   text to process
#   mark  delimiter, used for both ends of a span (e.g. "**")
#   len   length of `mark`
#   tag   element name; for "code" the span content is passed to escape()
#
# Returns the converted text. A delimiter without a closing partner is left
# untouched. An empty span (two delimiters back to back) is copied verbatim
# and scanning continues after it, instead of aborting the rest of the line.
# `position` is a local so the function no longer sets a global.
function apply_style(buf, mark, len, tag,    out, found, rstart, rlength, position) {
    out = "";
    position = index(buf, mark);
    while (position > 0) {
        rstart = position + len;
        rlength = index(substr(buf, rstart), mark) - 1;
        if (rlength < 0) break;     # no closing delimiter
        if (rlength == 0) {
            # nothing between the delimiters: keep both and move on
            out = out substr(buf, 1, position - 1 + 2 * len);
            buf = substr(buf, position + 2 * len);
            position = index(buf, mark);
            continue;
        }
        found = substr(buf, rstart, rlength);
        if (tag == "code") {
            found = escape(found);
        }
        out = out substr(buf, 1, position - 1) make(tag, found);
        buf = substr(buf, rstart + rlength + len);
        position = index(buf, mark);
    }
    return out buf;
}
# Escapes `str` for verbatim display inside HTML and returns the result.
#
# HTML special characters become named entities. Characters that the inline
# style converters treat as delimiters become numeric entities, so escaped
# text is not re-interpreted as markdown later.
#
# "\\&" is required in every replacement: a bare "&" in a gsub() replacement
# stands for the matched text. "&" is escaped first so that the entities
# written by the following lines are not escaped again.
function escape(str) {
    # html special characters
    gsub(/&/, "\\&amp;", str);
    gsub(/</, "\\&lt;", str);
    gsub(/>/, "\\&gt;", str);
    # markdown special characters
    gsub(/[$]/, "\\&#36;", str);
    gsub(/[*]/, "\\&#42;", str);
    gsub(/[+]/, "\\&#43;", str);
    gsub(/[-]/, "\\&#45;", str);
    gsub(/[=]/, "\\&#61;", str);
    gsub(/\^/, "\\&#94;", str);
    gsub(/[_]/, "\\&#95;", str);
    gsub(/[`]/, "\\&#96;", str);
    gsub(/[~]/, "\\&#126;", str);
    return str;
}
# Returns the first (start - x) characters of `str`; `x` defaults to 1,
# which yields everything before position `start`.
function prefix(str, start, x) {
    if (!x) {
        x = 1;
    }
    return substr(str, 1, start - x);
}
# Returns the part of `str` that starts at position (end + x); `x` defaults
# to 1, which yields everything after position `end`. `start` does not
# influence the result.
function suffix(str, start, end, x) {
    if (!x) {
        x = 1;
    }
    return substr(str, end + x);
}
# Returns the text between positions `start` and `end` of `str`.
# `x` is the offset added to `start`, `y` is subtracted from the span
# length (end - start); both default to 1, i.e. the delimiters are excluded.
function extract(str, start, end, x, y) {
    if (!x) x = 1;
    if (!y) y = 1;
    return substr(str, start + x, end - start - y);
}
# Builds an <a> element with `text` as content, `href` as target and an
# optional `title` attribute.
function make_link(text, href, title,    attr) {
    attr = "href='" href "'";
    if (title) {
        attr = attr " title='" title "'";
    }
    return make("a", text, attr);
}
# Builds an <img> element with `text` as alt text, `href` as source and an
# optional `title` attribute.
function make_image(text, href, title,    attr) {
    attr = "alt='" text "' src='" href "'";
    if (title) {
        attr = attr " title='" title "'";
    }
    return make("img", "", attr);
}
# Builds the in-text marker of a footnote: a superscript "[footnote]" that
# links to "#foot-<footnote>".
# The <sup> element is now closed; it used to be opened twice.
function make_footnote(footnote) {
    return make("a", "<sup>[" footnote "]</sup>", "href='#foot-" footnote "'");
}
# Builds an <a> element with `text` as content that points to the anchor
# "#link-<ref>".
function make_reflink(text, ref,    target) {
    target = "href='#link-" ref "'";
    return make("a", text, target);
}
| # <ftp...> | |
| # <http...> | |
| # <https...> | |
| # <email@...> | |
# Converts autolinks written between angle brackets:
#   <http...>, <https...>, <ftp...>  -> link to the address itself
#   <name@host>                      -> mailto: link
# Anything else between "<" and ">" is copied unchanged.
# Each converted link is also registered with push_link(); the global
# counter `id` is used as its reference and incremented.
#
# The closing ">" is searched only after the opening "<", so a stray ">"
# earlier in the line (e.g. "1 > 0") no longer disables the conversion.
function diamonds(buf,    start, end, href, out, rel) {
    out = "";
    start = index(buf, "<");
    while (start > 0) {
        rel = index(substr(buf, start + 1), ">");
        if (rel == 0) break;    # unmatched "<": keep the rest as it is
        end = start + rel;
        href = substr(buf, start + 1, end - start - 1);
        if (index(href, "http") == 1 || index(href, "ftp") == 1) {
            push_link(id++, href);
            out = out substr(buf, 1, start - 1) make_link(href, href);
        } else if (index(href, "@") > 1) {
            push_link(id++, "mailto:" href);
            out = out substr(buf, 1, start - 1) make_link(href, "mailto:" href);
        } else {
            # do nothing; just give back
            out = out substr(buf, 1, end);
        }
        buf = substr(buf, end + 1);
        start = index(buf, "<");
    }
    return out buf;
}
| # [text](href) | |
| # [text](href "title") | |
# Converts inline links:
#   [text](href)
#   [text](href "title")
# Each converted link is also registered with push_link(); the global
# counter `id` is used as its reference and incremented.
#
# Fixes over the previous version:
#   * `title` is reset for every link (it used to leak into later links);
#   * the text starts at the last "[" before "](" and the target ends at the
#     first ")" after "](", so unrelated brackets or parentheses earlier in
#     the line no longer break or swallow the link.
# `regex` is unused and kept only to preserve the parameter list.
function links(buf, regex, start, end, mid, t1, t2, temp, text, href, title, out,    rel, head) {
    out = "";
    mid = index(buf, "](");
    while (mid > 0) {
        # last "[" in front of "]("
        head = substr(buf, 1, mid - 1);
        start = 0;
        while ((rel = index(substr(head, start + 1), "[")) > 0) {
            start += rel;
        }
        rel = index(substr(buf, mid + 2), ")");
        if (start == 0 || rel == 0) {
            # not a link: copy through "](" and keep scanning
            out = out substr(buf, 1, mid + 1);
            buf = substr(buf, mid + 2);
            mid = index(buf, "](");
            continue;
        }
        end = mid + 1 + rel;
        text = substr(buf, start + 1, mid - start - 1);
        href = substr(buf, mid + 2, end - mid - 2);
        title = "";
        t1 = index(href, "\"");
        t2 = index(substr(href, t1 + 1), "\"") + t1;
        if (0 < t1 && t1 < t2) {
            temp = href;
            href = trim(substr(temp, 1, t1 - 1));
            title = trim(substr(temp, t1 + 1, t2 - t1 - 1));
        }
        out = out substr(buf, 1, start - 1) make_link(text, href, title);
        push_link(id++, href, title, text);
        buf = substr(buf, end + 1);
        mid = index(buf, "](");
    }
    return out buf;
}
# Converts inline images:
#   ![text](href)
#   ![text](href "title")
#
# The "![" and "](" lookups are restored (they were missing, so `mid` was
# never set and no image was ever converted). `title` is reset for every
# image, and "](" / ")" are searched only after the preceding delimiter.
# `regex` is unused and kept only to preserve the parameter list.
function images(buf, regex, start, end, mid, t1, t2, temp, text, href, title, out,    rel) {
    out = "";
    start = index(buf, "![");
    while (start > 0) {
        rel = index(substr(buf, start + 2), "](");
        if (rel == 0) break;
        mid = start + 1 + rel;
        rel = index(substr(buf, mid + 2), ")");
        if (rel == 0) break;
        end = mid + 1 + rel;
        text = substr(buf, start + 2, mid - start - 2);
        href = substr(buf, mid + 2, end - mid - 2);
        title = "";
        t1 = index(href, "\"");
        t2 = index(substr(href, t1 + 1), "\"") + t1;
        if (0 < t1 && t1 < t2) {
            temp = href;
            href = trim(substr(temp, 1, t1 - 1));
            title = trim(substr(temp, t1 + 1, t2 - t1 - 1));
        }
        out = out substr(buf, 1, start - 1) make_image(text, href, title);
        buf = substr(buf, end + 1);
        start = index(buf, "![");
    }
    return out buf;
}
| # [^footnote] | |
# Converts footnote markers of the form [^footnote] through make_footnote().
#
# The closing "]" is searched only after "[^", so a "]" earlier in the line
# (for example from a link) no longer prevents the conversion.
# `regex` is unused and kept only to preserve the parameter list.
function footnotes(buf, regex, start, end, out, footnote,    rel) {
    out = "";
    start = index(buf, "[^");
    while (start > 0) {
        rel = index(substr(buf, start + 2), "]");
        if (rel == 0) break;
        end = start + 1 + rel;
        footnote = substr(buf, start + 2, end - start - 2);
        out = out substr(buf, 1, start - 1) make_footnote(footnote);
        buf = substr(buf, end + 1);
        start = index(buf, "[^");
    }
    return out buf;
}
# Returns the smaller of `x` and `y` (`x` when they compare equal).
function min(x, y) {
    if (x <= y) {
        return x;
    }
    return y;
}
# Returns the larger of `x` and `y` (`x` when they compare equal).
function max(x, y) {
    if (x >= y) {
        return x;
    }
    return y;
}
| # [text][ref] | |
| # [text] [ref] | |
# Converts reference-style links:
#   [text][ref]
#   [text] [ref]
# into links that point to the anchor built by make_reflink().
#
# A bracket pair that is not followed (directly or after one space) by a
# second bracket pair is copied unchanged. The previous version dropped the
# text up to such a pair, e.g. "see [x] here" became " here".
function reflinks(buf,    start, end, mid1, mid2, out, text, ref, rel) {
    out = "";
    start = index(buf, "[");
    while (start > 0) {
        rel = index(substr(buf, start + 1), "]");
        if (rel == 0) break;    # unmatched "[": keep the rest as it is
        mid1 = start + rel;
        # position of the "[" that opens the reference, 0 when there is none
        mid2 = 0;
        if (substr(buf, mid1 + 1, 1) == "[") {
            mid2 = mid1 + 1;
        } else if (substr(buf, mid1 + 1, 2) == " [") {
            mid2 = mid1 + 2;
        }
        rel = (mid2 > 0) ? index(substr(buf, mid2 + 1), "]") : 0;
        if (rel > 0) {
            end = mid2 + rel;
            text = substr(buf, start + 1, mid1 - start - 1);
            ref = substr(buf, mid2 + 1, end - mid2 - 1);
            out = out substr(buf, 1, start - 1) make_reflink(text, ref);
        } else {
            end = mid1;
            out = out substr(buf, 1, end);
        }
        buf = substr(buf, end + 1);
        start = index(buf, "[");
    }
    return out buf;
}
# Prints the start of the HTML document: doctype, <head> with the style
# sheet and the script functions used by the code block buttons
# (clipboard, wordwrap, collapse), and the opening <body> tag.
#
# A charset declaration is emitted because the generated page contains
# non-ASCII characters (button and footer glyphs).
function print_header() {
    print "<!DOCTYPE html>";
    print "<html>";
    print "<head>";
    print "<meta charset='utf-8'>";
    print "<title></title>";
    print "<style>";
    print " :root {";
    print " --gray: #efefef;";
    print " --black: #444;";
    print " --dark-gray: #aaaaaa;";
    print " --light-gray: #fafafa;";
    print " --dark-blue: #0000ff;";
    print " --light-blue: #0969da;";
    print " --light-yellow: #fafaaa;";
    print " }";
    print " html {";
    print " font-size: 16px;";
    print " max-width: 100%;";
    print " }";
    print " body {";
    print " padding: 1rem;";
    print " margin: 0 auto;";
    print " max-width: 50rem;";
    print " line-height: 1.8;";
    print " font-family: sans-serif;";
    print " color: var(--black);";
    print " }";
    print " p {";
    print " font-size: 1rem;";
    print " margin-bottom: 1.3rem;";
    print " }";
    print " a, a:visited { color: var(--light-blue); }";
    print " a:hover, a:focus, a:active { color: var(--dark-blue); }";
    print " h1 { font-size: 2.0rem; }";
    print " h2 { font-size: 1.5rem; }";
    print " h3 { font-size: 1.2rem; }";
    print " h4 { font-size: 1.2rem; }";
    print " h5 { font-size: 0.8rem; }";
    print " h6 { font-size: 0.8rem; }";
    print " h1, h2 {";
    print " padding-bottom: 0.5rem;";
    print " border-bottom: 2px solid var(--gray);";
    print " }";
    print " h1, h2, h3, h4, h5, h6 {";
    print " line-height: 1.4;";
    print " font-style: normal;";
    print " font-weight: bold;";
    print " margin: 1.4rem 0 .5rem;";
    print " }";
    print " h3, h5 {";
    print " font-weight: bold;";
    print " font-style: normal;";
    print " }";
    print " h4, h6 {";
    print " font-weight: normal;";
    print " font-style: italic;";
    print " }";
    print " pre {";
    print " overflow-x:auto;";
    print " line-height: 1.5;";
    print " border-radius: .4rem;";
    print " font-family: monospace;";
    print " background-color: var(--gray);";
    print " border: 1px solid var(--dark-gray);";
    print " }";
    print " div.pre-head {";
    print " height: 1.5rem;";
    print " padding: 1rem;";
    print " font-weight: bold;";
    print " padding-top: 0.5rem;";
    print " padding-bottom: 0.5rem;";
    print " border-bottom: 1px solid var(--dark-gray);";
    print " }";
    print " div.pre-body {";
    print " padding: 1rem;";
    print " }";
    print " button.pre-button {";
    print " font-size: 100%; float: right;";
    print " }";
    print " code {";
    print " padding: 0.3rem;";
    print " border-radius: .2rem;";
    print " font-family: monospace;";
    print " background-color: var(--gray);";
    print " }";
    print " mark {";
    print " padding: 0.3rem;";
    print " border-radius: .2rem;";
    print " background-color: var(--light-yellow);";
    print " }";
    print " blockquote {";
    print " margin: 1.5rem;";
    print " padding: 1rem;";
    print " border-radius: .4rem;";
    print " background-color: var(--light-gray);";
    print " border: 1px solid var(--dark-gray);";
    print " border-left: 12px solid var(--dark-gray);";
    print " }";
    print " dt { font-weight: bold; }";
    print " hr { border: 1px solid var(--dark-gray); }";
    print " img { height: auto; max-width: 100%; }";
    print " table { border-collapse: collapse; margin-bottom: 1.3rem; }";
    print " th { padding: .7rem; border-bottom: 1px solid var(--black);}";
    print " td { padding: .7rem; border-bottom: 1px solid var(--gray);}";
    print "</style>";
    print "<script>";
    print " function clipboard(id) {";
    print " var element = document.getElementById(id);";
    print " navigator.clipboard.writeText(element.textContent);";
    print " }";
    print " function wordwrap(id) {";
    print " var element = document.getElementById(id);";
    print " if (element.style.whiteSpace != 'pre-wrap') {";
    print " element.style.whiteSpace = 'pre-wrap';";
    print " } else {";
    print " element.style.whiteSpace = 'pre';";
    print " }";
    print " }";
    print " function collapse(id) {";
    print " var element = document.getElementById(id);";
    print " if (element.style.display != 'none') {";
    print " element.style.display = 'none';";
    print " } else {";
    print " element.style.display = 'block';";
    print " }";
    print " }";
    print "</script>";
    print "</head>";
    print "<body>";
}
# Prints the end of the HTML document: a <footer> that lists the collected
# links (link_ref / link_href / link_title) and footnotes
# (footnote_ref / footnote_text), followed by the closing tags.
#
# Footnote entries now carry id='foot-<ref>'. That is the anchor the in-text
# footnote markers point to (href='#foot-<ref>'); the previous id='link-<ref>'
# left those markers without a target and could collide with link anchors.
function print_footer(    i, ref, href, title, text) {
    print "<footer>";
    if (link_count > 0 || footnote_count > 0) {
        print "<hr>";
    }
    if (link_count > 0) {
        print "<h6>LINKS</h6>";
        print "<ol>";
        for (i = 1; i <= link_count; i++) {
            ref = link_ref[i];
            href = link_href[i];
            title = link_title[i];
            # links without a title are listed by their address
            if (title == "") {
                title = href;
            }
            print make("li", title " <a href='" href "' id='link-" ref "'>🔗</a>");
        }
        print "</ol>";
    }
    if (footnote_count > 0) {
        print "<h6>FOOTNOTES</h6>";
        print "<ol>";
        for (i = 1; i <= footnote_count; i++) {
            ref = footnote_ref[i];
            text = footnote_text[i];
            print make("li", text " <a href='#foot-" ref "' id='foot-" ref "'>🔗</a>");
        }
        print "</ol>";
    }
    print "</footer>";
    print "</body>";
    print "</html>";
}
# Initial state: empty text buffer, a stack that holds only the "root"
# sentinel at index 0, the line prefixes of block quotes and list items,
# and the blank-line flag armed. Prints the document header.
BEGIN {
    # text buffer and tag stack
    buf = "";
    idx = 0;
    stk[idx] = "root";
    stk_attr[idx] = "";

    # line prefixes (dynamic regular expressions)
    blockquote_prefix = "^[ ]*>[ ]?";
    ul_prefix = "^([ ][ ][ ][ ])*([ ]|[ ][ ]|[ ][ ][ ])?[*+-][ ]";
    ol_prefix = "^([ ][ ][ ][ ])*([ ]|[ ][ ]|[ ][ ][ ])?[0-9]+\\.[ ]";

    # prepare to signal blank line
    blank = -1;

    print_header();
}
# Pops tags until `tag` is on top of the stack or the stack is empty.
function pop_until(tag) {
    while (!(empty() || at(tag))) {
        pop();
    }
}
# Returns how many "blockquote" entries are currently on the stack.
function level_blockquote(    pos, count) {
    count = 0;
    pos = idx;
    while (pos > 0) {
        if (stk[pos] == "blockquote") count++;
        pos--;
    }
    return count;
}
# Returns how many "ul"/"ol" entries are on the stack above the innermost
# "blockquote" (or on the whole stack when there is no block quote).
function level_list(    pos, count) {
    count = 0;
    for (pos = idx; pos > 0 && stk[pos] != "blockquote"; pos--) {
        if (stk[pos] == "ul" || stk[pos] == "ol") count++;
    }
    return count;
}
# Returns the number of leading 4-space groups of `line`.
function count_indent(line,    levels) {
    levels = 0;
    while (sub("^[ ][ ][ ][ ]", "", line)) {
        levels++;
    }
    return levels;
}
# Returns how many times the regular expression `pref` can be stripped,
# one after another, from `line`.
function count_prefix(line, pref,    n) {
    for (n = 0; sub(pref, "", line); n++) {
        # each successful sub() removes one occurrence
    }
    return n;
}
# Returns `line` without its leading 4-space groups.
function remove_indent(line) {
    while (sub("^[ ][ ][ ][ ]", "", line)) {
        # keep stripping
    }
    return line;
}
# Returns `line` after stripping every successive match of the regular
# expression `pref` (e.g. all leading quote marks).
function remove_prefix(line, pref) {
    while (sub(pref, "", line)) {
        # keep stripping
    }
    return line;
}
# Returns `s` without leading spaces and tabs.
function ltrim(s) {
    sub(/^[ \t]+/, "", s);
    return s;
}
# Returns `s` without trailing spaces and tabs.
function rtrim(s) {
    sub(/[ \t]+$/, "", s);
    return s;
}
# Returns `s` without leading and trailing spaces and tabs.
function trim(s) {
    s = ltrim(s);
    return rtrim(s);
}
# Returns a lower-case slug of `str`: every run of characters other than
# ASCII letters and digits becomes a single "-".
function slug(str) {
    gsub(/[^a-zA-Z0-9]+/, "-", str);
    return tolower(str);
}
| #=========================================== | |
| # TABULATIONS | |
| #=========================================== | |
# Runs for every input line: replaces each tab with 4 spaces, the unit the
# indentation rules of this script work with. The replacement used to be a
# single space, so tab-indented lines were not recognised as indented.
{
    gsub("\t", "    ", $0);
}
| #=========================================== | |
| # BLANK LINES | |
| #=========================================== | |
| # Blank line flag states: | |
| # 0: not signaling blank line | |
| # -1: preparing to signal blank line | |
| # 1: signaling blank line | |
# Advances the blank-line flag once per input line:
#   -1 (armed on the blank line itself) -> 1 (the line after a blank line)
#    1 -> 0 (any later line)
{
    if (blank == 1) {
        blank = 0;
    } else if (blank == -1) {
        blank = 1;
    }
}
# A blank line outside a fenced code block arms the flag, closes the open
# paragraph and block quote levels, and consumes the line.
/^[ ]*$/ {
    if (!at("code")) {
        blank = -1;
        pop_p();
        pop_blockquote();
        next;
    }
}
| #=========================================== | |
| # BLOCKQUOTE | |
| #=========================================== | |
# When a block quote is on top of the stack, closes as many block quote
# levels as the current line ($0) has fewer quote marks than the stack has
# open block quotes. Does nothing when the top of the stack is another tag.
#
# `lv`, `cp` and `n` are locals so the function no longer overwrites the
# globals of the same name used by the rules of this script.
function pop_blockquote(    lv, cp, n) {
    if (!at("blockquote")) return;
    lv = level_blockquote();
    cp = count_prefix($0, blockquote_prefix);
    n = lv - cp;
    while (n-- > 0) {
        if (at("blockquote")) pop();
    }
}
# A line without a quote mark ends the open block quote(s).
!($0 ~ blockquote_prefix) {
    pop_blockquote();
}
# A quoted line: strips the quote marks from $0 and opens or closes tags
# until the number of open block quotes matches the number of marks.
# The line is not consumed; the following rules process what is left.
$0 ~ blockquote_prefix {
    lv = level_blockquote();                    # open block quotes
    cp = count_prefix($0, blockquote_prefix);   # quote marks on this line
    $0 = remove_prefix($0, blockquote_prefix);
    if (cp > lv) {
        # deeper: open the missing levels
        n = cp - lv;
        while (n-- > 0) {
            pop_p();
            push("blockquote");
        }
    } else {
        # same or shallower: pop one stack entry per level of difference
        n = lv - cp;
        while (n-- > 0) {
            pop();
        }
    }
    # a line holding only quote marks closes what is open inside the quote
    if ($0 ~ /^$/) {
        pop_until("blockquote");
    }
}
| #=========================================== | |
| # LIST ITEMS | |
| #=========================================== | |
| # TODO: add more POSIX compatibility as MAWK doesn't support regex quantifiers {x,y} | |
| # See: https://unix.stackexchange.com/questions/506119/how-to-specify-regex-quantifiers-with-mawk | |
# Pops the top of the stack unless ready() reports true.
function pop_p() {
    if (ready()) {
        return;
    }
    pop();
}
# When a list item is on top of the stack, closes as many list levels as
# the current line ($0) has fewer 4-space indents than there are open lists.
#
# `lv`, `cp` and `n` are locals so the function no longer overwrites the
# globals of the same name used by the rules of this script.
function pop_list(    lv, cp, n) {
    if (!at("li")) return;
    lv = level_list();
    cp = count_indent($0);
    n = lv - cp;
    while (n-- > 0) {
        # close whatever sits on top of the item, the item, then its list
        if (stk[idx-1] == "li") pop();
        if (at("li")) pop();
        if (at("ol") || at("ul")) pop();
    }
}
# Returns `line` with at most one 4-space indent removed per open list
# level. `n` is a local so the function no longer overwrites the global `n`.
function remove_list_indent(line,    n) {
    for (n = level_list(); n > 0; n--) {
        sub(/^[ ][ ][ ][ ]/, "", line);
    }
    return line;
}
# A line that is not a list item: drops the indentation owed to the open
# lists and, right after a blank line, closes finished list levels.
!($0 ~ ul_prefix) && !($0 ~ ol_prefix) {
    # computed before pop_list(), which changes the list level
    temp = remove_list_indent($0);
    if (blank > 0) {
        pop_list();     # still looks at the original, indented $0
    }
    $0 = temp;
}
# Returns the number that starts `line` after its leading spaces
# (the first number of an ordered list item), or "" when there is none.
function list_start(line) {
    sub("^[ ]+", "", line);
    if (match(line, "^[0-9]+")) {
        return substr(line, RSTART, RLENGTH);
    }
    return "";
}
# Opens a list item. When no list is on top of the stack, a `tag` list
# ("ul" or "ol") is opened first; an "ol" with a non-empty `start` gets a
# start='<start>' attribute.
function push_li(tag, start) {
    if (!at("ul") && !at("ol")) {
        if (tag == "ol" && start != "") {
            push(tag, "start='" start "'");
        } else {
            push(tag);
        }
    }
    push("li");
}
# Handles a list item line ($0).
#   tag    "ul" or "ol"
#   pref   regular expression of the item prefix, stripped from $0
#   start  first number of an ordered list; only used when a deeper level
#          is opened
# Compares the item depth (4-space indents + 1) with the number of open
# lists and opens or closes levels accordingly before appending the text.
function parse_list_item(tag, pref, start) {
    lv = level_list();
    cp = count_indent($0) + 1;
    $0 = remove_prefix($0, pref);
    if (cp == lv) {
        # sibling item on the same level
        pop_p();
        if (at("li")) pop();
        push_li(tag);
    } else if (cp > lv) {
        # add levels
        n = (cp - 1) - lv;
        while (n-- > 0) {
            push_li(tag);
        }
        push_li(tag, start);
    } else {
        # del levels
        n = lv - cp;
        while (n-- > 0) {
            pop_p();
            if (at("li")) pop();
            if (at("ol") || at("ul")) pop();
        }
        if (at("li")) pop();
        push_li(tag);
    }
    append($0);
}
# Unordered list item: handled and consumed.
$0 ~ ul_prefix {
    parse_list_item("ul", ul_prefix);
    next;
}
# Ordered list item: handled and consumed. The number written by the user
# is passed on as the starting number of the list.
$0 ~ ol_prefix {
    start = list_start($0);
    parse_list_item("ol", ol_prefix, start);
    next;
}
| #=========================================== | |
| # CODE BLOCKS | |
| #=========================================== | |
# A fence line (three or more backticks) closes the open fenced block or
# opens a new one. The text after the backticks of an opening fence
# becomes the title attribute of the block.
/^```/ {
    if (at("code")) {
        pop();
        next;
    }
    sub(/^`+/, "");
    title = $0;
    push("code", "title='" title "'");
    next;
}
# Inside a fenced code block every line is buffered as it is.
at("code") {
    append($0);
    next;
}
# A line indented by 4 spaces belongs to an indented code block: the block
# is opened when needed, one indent is removed and the line is buffered.
/^[ ][ ][ ][ ]/ {
    if (!at("pre")) push("pre");
    sub("^[ ][ ][ ][ ]", "", $0);
    append($0);
    next;
}
| #=========================================== | |
| # HEADING | |
| #=========================================== | |
| # undo last push | |
# Undoes the last push: removes the top of the stack without printing a
# closing tag, empties the text buffer and returns what it held.
function undo(    saved) {
    saved = buf;
    buf = "";
    unpush();
    return saved;
}
# A "===" line under paragraph text: the buffered text becomes an <h1>.
/^===+/ && at("p") {
    $0 = undo();    # take the text back and drop the paragraph entry
    push("h1");
    append($0);
    pop_p();
    next;
}
# A "---" line under paragraph text: the buffered text becomes an <h2>.
/^---+/ && at("p") {
    $0 = undo();    # take the text back and drop the paragraph entry
    push("h2");
    append($0);
    pop_p();
    next;
}
# A line that starts with hashes followed by a space is a heading.
# The heading level is the number of hashes, capped at 6.
/^[\x23]+[ ]+/ {
    match($0, "\x23+");
    n = (RLENGTH <= 6) ? RLENGTH : 6;
    # drop the first n hashes
    $0 = substr($0, n + 1);
    pop_p();
    push("h" n);
    append($0);
    next;
}
| #=========================================== | |
| # HORIZONTAL RULER | |
| #=========================================== | |
# Three or more of "*", "_" or "-" alone on a line: horizontal ruler.
/^[*_-][*_-][*_-]+[ ]*$/ {
    pop_p();
    push("hr");
    next;
}
| #=========================================== | |
| # DEFINITION LIST | |
| #=========================================== | |
| # TODO: make definition list multi-level like <li> | |
# A line that starts with ":" is a definition.
#   * after paragraph text: that text becomes the term of a new list;
#   * after another definition: one more definition is added.
# In any other context the line is not consumed and falls through to the
# following rules.
/^:/ {
    dd = substr($0, 2);     # definition text without the colon
    if (at("p")) {
        dt = undo();        # the buffered paragraph text is the term
        push("dl");
        push("dt");
        append(dt);
        pop_p();
        push("dd");
        append(dd);
        next;
    }
    if (at("dd")) {
        pop_p();
        push("dd");
        append(dd);
        next;
    }
}
| #=========================================== | |
| # TABLE | |
| #=========================================== | |
# Reads the alignment row of a table (e.g. "| :--- | :---: | ---: |") and
# stores "left", "center" or "right" in the global array table_aligns,
# indexed by the split() position of the column. The first column has
# index 2, because the text before the leading "|" is element 1. Columns
# without a colon get no entry.
#
# `i` is now a local. The branch for a match without any colon was removed:
# every alternative of the pattern has a colon on at least one end.
function set_table_aligns(line,    arr, regex, found, l, r, n, i) {
    delete table_aligns;
    regex = "(:--[-]+:|:--[-]+|--[-]+:)";
    n = split(line, arr, /\|/);
    for (i = 2; i < n; i++) {
        if (match(arr[i], regex) == 0) continue;
        found = substr(arr[i], RSTART, RLENGTH);
        l = substr(found, 1, 1);
        r = substr(found, RLENGTH, 1);
        if (l == ":" && r == ":") {
            table_aligns[i] = "center";
        } else if (l == ":") {
            table_aligns[i] = "left";
        } else {
            table_aligns[i] = "right";
        }
    }
}
# A line enclosed in "|" is a table row.
#   * The first row of a table opens <table> and is printed as header cells.
#   * An alignment row ("| :--- | ---: |") only records column alignments.
#   * Any other row is printed as data cells, styled with the recorded
#     alignment of the column.
# Cells are the split() elements 2..n-1; elements 1 and n hold the text
# outside the outer "|".
#
# Alignments are cleared when a table starts, so a table without an
# alignment row no longer inherits the alignments of the previous table.
/^[ ]*\|.*\|[ ]*/ {
    if (!at("table")) {
        delete table_aligns;
        push("table");
        push("tr");
        delete arr;
        n = split($0, arr, /\|/);
        for (i = 2; i < n; i++) {
            push("th");
            append(arr[i]);
            pop();
        }
        pop();
        next;
    }
    if ($0 ~ /^[ ]*\|[ ]*([:]?--[-]+[:]?)[ ]*\|[ ]*/) {
        set_table_aligns($0);
        next;
    }
    push("tr");
    delete arr;
    n = split($0, arr, /\|/);
    for (i = 2; i < n; i++) {
        if (table_aligns[i] != "") {
            push("td", "style='text-align:" table_aligns[i] ";'");
        } else {
            push("td");
        }
        append(arr[i]);
        pop();
    }
    pop();
    next;
}
| #=========================================== | |
| # FOOTNOTE | |
| #=========================================== | |
# Records a footnote definition for the footer: its reference and its text
# after inline styles have been applied.
function push_footnote(ref, text) {
    footnote_count += 1;
    footnote_text[footnote_count] = styles(text);
    footnote_ref[footnote_count] = ref;
}
# Footnote definition: "[^id]: note". The definition is recorded for the
# footer and the line is consumed.
/^[ ]*\[\^[^]]+\][:]/ {
    if (match($0, /\[\^[^]]+\][:]/) > 0) {
        # the match is "[^" id "]:"  ->  id is 4 characters shorter
        ref = substr($0, RSTART + 2, RLENGTH - 4);
        text = substr($0, RSTART + RLENGTH);
        push_footnote(ref, text);
    }
    next;
}
| #=========================================== | |
| # (REFERENCE STYLE) LINK | |
| #=========================================== | |
| # TODO: implement all styles: https://gist.github.com/emedinaa/28ed71b450243aba48accd634679f805 | |
# Records a link for the footer: reference, target, title and text are
# stored under the next position of the link_* arrays.
function push_link(ref, href, title, text,    pos) {
    pos = ++link_count;
    link_ref[pos] = ref;
    link_href[pos] = href;
    link_title[pos] = title;
    link_text[pos] = text;
}
# Link reference definition. Accepted forms:
#   [ref]: href
#   [ref]: href "title"
#   [ref]: href 'title'
#   [ref]: href (title)
#   [ref]: <href> "title"
#   [ref]: <href> 'title'
#   [ref]: <href> (title)
# The definition is recorded for the footer and the line is consumed.
#
# Fixes over the previous version:
#   * `title` is reset for every definition (a definition without a title
#     used to inherit the title of the previous one);
#   * the angle brackets around <href> are removed even when there is no
#     title;
#   * the target is trimmed on both sides.
/^[ ]*\[[^]]+\][:]/ {
    if (match($0, /\[[^]]+\][:]/) > 0) {
        # the match is "[" ref "]:"  ->  ref is 3 characters shorter
        ref = substr($0, RSTART + 1, RLENGTH - 3);
        href = substr($0, RSTART + RLENGTH);
        title = "";
        if (match(href, "[ ](\"[^\"]*\"|'[^']*'|\\([^\\)]*\\))") > 0) {
            # skip the space and the opening delimiter, drop the closing one
            title = substr(href, RSTART + 2, RLENGTH - 3);
            href = substr(href, 1, RSTART - 1);
        }
        # remove '<' '>'.
        if (match(href, "<[^>]+>") > 0) {
            href = substr(href, RSTART + 1, RLENGTH - 2);
        }
        href = trim(href);
        push_link(ref, href, title, title);
    }
    next;
}
| #=========================================== | |
| # PARAGRAPH | |
| #=========================================== | |
| # TODO: transform "<li>text" in "<li><p>text", undoing the previous <li> | |
# Any remaining non-empty line is paragraph text. When ready() reports
# true, a <p> is opened first; inside a list item that only happens on the
# line right after a blank line.
/^.+$/ {
    if (ready() && (!at("li") || blank == 1)) {
        push("p");
    }
    append($0);
    next;
}
| #=========================================== | |
| # THE END | |
| #=========================================== | |
# End of input: close the open paragraph, lists and block quotes, then
# print the footer and the closing document tags.
END {
    pop_p();
    pop_list();
    pop_blockquote();
    print_footer();
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/sh | |
| # | |
| # Runs the Busybox httpd server. | |
| # | |
| # Usage: | |
| # | |
| # notekeeper-http-server.sh | |
| # | |
| # Configuration: | |
| # | |
| # # file .notekeeper/notekeeper.conf | |
| # busybox.httpd.port=127.0.0.1:9000 | |
| # | |
| . "`dirname "$0"`/notekeeper-common.sh"; | |
| property_port="busybox.httpd.port" | |
| property_port_default="127.0.0.1:9000" | |
# Prints the address the server listens on: the value of the property
# named by ${property_port} in ${WORKING_DIR}/.notekeeper/notekeeper.conf,
# or ${property_port_default} when the file or the property is missing.
#
# The property name is matched literally and up to the "=" sign, so longer
# keys that merely start with the same text are ignored. Only the first
# matching line is used, and a missing file no longer produces an error.
busybox_httpd_port() {
    local conf="${WORKING_DIR}/.notekeeper/notekeeper.conf"
    local key=""
    local port=""
    # dots in the property name must not act as regex wildcards
    key=$(printf '%s' "${property_port}" | sed 's/\./\\./g')
    if [ -r "${conf}" ]; then
        port=$(sed -n "s/^${key}=//p" "${conf}" | head -n 1)
    fi
    if [ -n "${port}" ]; then
        echo "${port}";
    else
        echo "${property_port_default}";
    fi;
}
# Kills every running "busybox httpd" process whose PID is above 1024.
#
# PIDs are handled one by one: with more than one matching process the
# previous single integer test failed and nothing was stopped.
busybox_httpd_stop() {
    local pid
    for pid in `ps aux | grep 'busybox httpd' | grep -v "grep" | awk '{ print $2 }'`; do
        if [ "$pid" -gt 1024 ]; then
            kill -9 "$pid";
        fi;
    done
}
# Starts the Busybox httpd server on the configured address, serving the
# `.notekeeper/html/` folder of the working directory, and prints the URL.
busybox_httpd_start() {
    local port=$(busybox_httpd_port)
    # alternative document root: "$PROGRAM_DIR/www/"
    busybox httpd -p "${port}" -h "${WORKING_DIR}/.notekeeper/html/"
    echo Listening: "http://${port}"
}
# Restarts the server: stops running instances, then starts a new one.
main() {
    busybox_httpd_stop
    busybox_httpd_start
}
| main; | |
| # https://datatracker.ietf.org/doc/html/rfc3875 | |
| # https://www.vivaolinux.com.br/artigo/Introducao-a-CGI-com-a-RFC-3875 | |
| # https://gist.github.com/stokito/a9a2732ffc7982978a16e40e8d063c8f | |
| # https://github.com/Mikepicker/cgiblog | |
| # https://medium.com/@Mikepicker/no-framework-blog-for-fun-and-profit-using-bash-cgi-cbb99cf5366b |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/sh | |
| # | |
| # Saves HTML in `html` folder. | |
| # | |
| # Usage: | |
| # | |
| # notekeeper-save-html.sh FILE | |
| # | |
| . "`dirname "$0"`/notekeeper-common.sh"; | |
| file="${1}" | |
| require_file "${file}"; | |
# Converts FILE ($1) to HTML with notekeeper-html.awk and writes the result
# to the path reported by html_path, creating its folder when needed.
main() {
    local file="${1}"
    local html=$(html_path "${file}")
    mkdir -p "$(dirname "${html}")"
    "${PROGRAM_DIR}/awk/notekeeper-html.awk" "${file}" > "${html}"
}
| main "${file}"; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/sh | |
| # | |
| # Saves a STAT file in the `data` folder. | |
| # | |
| # Usage: | |
| # | |
| # apwm-save-stat.sh FILE | |
| # | |
| . "`dirname "$0"`/notekeeper-common.sh"; | |
| file="${1}" | |
| require_file "${file}"; | |
# Runs notekeeper-stat.awk on FILE ($1) under the C locale and writes its
# output to the path reported by make_stat.
main() {
    local file="${1}"
    local uuid=$(path_uuid "${file}");
    local stat=$(make_stat "${file}");
    LC_ALL=C "${PROGRAM_DIR}/awk/notekeeper-stat.awk" -v WRITETO=/dev/stdout "${file}" > "${stat}"
}
| main "${file}"; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/awk -f | |
| # Note: | |
| # * Files encoded using MAC-UTF-8 must be normalized to UTF-8. | |
# Returns the type of a token: its lower-case form passed through toascii().
function token_type(token)
{
    token = tolower(token);
    return toascii(token);
}
# Classifies the format of a token:
#   "W"   word: letters, optionally joined by an apostrophe or a hyphen
#   "N"   number: digits with some optional punctuation
#   "P"   punctuation only
#   "NA"  none of the above
#
# NOTE:
# Words that contain "accented" characters encoded with MAC-UTF-8 are
# reported as NA. Normalize the input files to regular UTF-8 encoding.
function token_format(token)
{
    if (token ~ /^[[:alpha:]]+([\x27’-]?[[:alpha:]])*$/) return "W";
    if (token ~ /^[+-]?([[:digit:]][h°%/:,.+-]?)+$/) return "N";
    if (token ~ /^[[:punct:]]+$/) return "P";
    return "NA";
}
# Classifies the letter case of a token after toascii():
#   "S"   start case: "Word", "Compound-word"
#   "L"   lower case: "word", "compound-word"
#   "U"   upper case: "WORD", "COMPOUND-WORD"
#   "C"   camel case: "compoundWord", "CompoundWord"
#   "M"   mixed case: "wOrD", "cOmPoUnD-wOrD"
#   "NA"  none of the above
#
# NOTE:
# The order of the tests is important. A single upper-case character, for
# example "É", is reported as start case because that test comes first. The
# author keeps this on purpose: it helps to identify proper nouns and the
# beginning of sentences.
function token_case(token)
{
    token = toascii(token);
    if (token ~ /^[[:upper:]][[:lower:]]*([\x27’-]([[:alpha:]][[:lower:]]*))*$/) return "S";
    if (token ~ /^[[:lower:]]+([\x27’-]([[:lower:]]+))*$/) return "L";
    if (token ~ /^[[:upper:]]+([\x27’-]([[:upper:]]+))*$/) return "U";
    if (token ~ /^[[:alpha:]][[:lower:]]*([[:upper:]][[:lower:]]+)+$/) return "C";
    if (token ~ /^[[:alpha:]]+([\x27’-]([[:alpha:]]+))*$/) return "M";
    return "NA";
}
# Classifies the numeric mask of a token:
#   "I"   integer
#   "R"   real number
#   "D"   date
#   "T"   time
#   "F"   fraction
#   "P"   percent
#   "G"   degrees
#   "NA"  none of the above
# The first matching mask wins.
function token_mask(token)
{
    if (token ~ /^[+-]?[0-9]+$/) return "I";
    if (token ~ /^[+-]?[0-9][0-9]?[0-9]?([,.]?[0-9][0-9][0-9])*([,.][0-9]+)?$/) return "R";
    if (token ~ /^[0-9]([0-9]|[0-9][0-9][0-9])[/.-][0-9][0-9]?[/.-][0-9]([0-9]|[0-9][0-9][0-9])$/) return "D";
    if (token ~ /^([0-9][0-9]?[:h][0-9][0-9]|[0-9][0-9]?[h])$/) return "T";
    if (token ~ /^[+-]?[0-9]+[/][0-9]+$/) return "F";
    if (token ~ /^[+-]?[0-9]+([,.][0-9]+)?%$/) return "P";
    if (token ~ /^[+-]?[0-9]+([,.][0-9]+)?°$/) return "G";
    return "NA";
}
# Appends one token to the global token stream and updates its statistics.
#
# Globals written:
#   idx              running position of the last inserted token
#   tokens[idx]      the token stored at that position
#   counters[token]  number of occurrences seen so far
#   types, formats, cases, masks
#                    per-token classifications, computed only while the
#                    stored value is still empty
#   indexes[token]   comma-separated list of the positions of the token
function insert_token(token)
{
    tokens[++idx] = token;
    counters[token]++;

    # Classify a token only once; later occurrences reuse the stored answer.
    if (!types[token])   { types[token]   = token_type(token);   }
    if (!formats[token]) { formats[token] = token_format(token); }
    if (!cases[token])   { cases[token]   = token_case(token);   }
    if (!masks[token])   { masks[token]   = token_mask(token);   }

    # Positions start at 1, so an empty entry means "first occurrence".
    indexes[token] = indexes[token] ? indexes[token] "," idx : idx;
}
# Folds a UTF-8 string to plain ASCII.
#
# Steps, in order:
#   1. Latin-1 Supplement letters are replaced by unaccented ASCII letters
#      (ligatures and special letters expand to two letters, e.g. "ß" -> "ss").
#   2. Typographic punctuation is replaced by ASCII look-alikes.
#   3. Combining diacritical marks left by decomposed ("MAC-UTF-8") input are
#      dropped, so "e" + U+0301 becomes "e".
#   4. Every character still outside \x00-\x7E becomes SUB (0x1A).
#
# Parameters:
#   string  text to fold (awk passes scalars by value; the caller's copy is
#           not modified)
#
# Returns the folded string.
function toascii(string) {
    # Unicode Latin-1 Supplement
    gsub(/[ÀÁÂÃÄÅ]/,"A", string);
    gsub(/[ÈÉÊË]/,"E", string);
    gsub(/[ÌÍÎÏ]/,"I", string);
    gsub(/[ÒÓÔÕÖ]/,"O", string);
    gsub(/[ÙÚÛÜ]/,"U", string);
    gsub(/Ý/,"Y", string);
    gsub(/Ç/,"C", string);
    gsub(/Ñ/,"N", string);
    gsub(/Ð/,"D", string);
    gsub(/Ø/,"OE", string);
    gsub(/Þ/,"TH", string);
    gsub(/Æ/,"AE", string);
    gsub(/[àáâãäåª]/,"a", string);
    gsub(/[èéêë]/,"e", string);
    gsub(/[ìíîï]/,"i", string);
    gsub(/[òóôõöº°]/,"o", string);
    gsub(/[ùúûü]/,"u", string);
    gsub(/[ýÿ]/,"y", string);
    gsub(/ç/,"c", string);
    gsub(/ñ/,"n", string);
    gsub(/ð/,"d", string);
    gsub(/ø/,"oe", string);
    gsub(/þ/,"th", string);
    # Lower-case counterpart of the "Æ" rule above. The pattern used to be the
    # ASCII text "ae", which replaced "ae" with itself and left "æ" untouched.
    gsub(/æ/,"ae", string);
    gsub(/ß/,"ss", string);
    # Unicode Punctuation
    gsub(/–/,"-", string);
    gsub(/—/,"--", string);
    gsub(/…/,"...", string);
    gsub(/[‘’]/,"\x27", string);
    gsub(/[“”«»]/,"\x22", string);
    # Remove MAC-UTF-8 combining diacritical marks (only those used in Latin-1):
    # U+0300 grave, U+0301 acute, U+0302 circumflex, U+0303 tilde,
    # U+0308 diaeresis, U+030A ring above, U+0327 cedilla.
    # Each mark is a two-byte sequence, so the marks are listed as alternatives.
    # Inside a bracket expression the bytes would be matched one at a time.
    gsub(/\xCC\x80|\xCC\x81|\xCC\x82|\xCC\x83|\xCC\x88|\xCC\x8A|\xCC\xA7/,"", string);
    # Replace non-ASCII with SUB (0x1A)
    gsub(/[^\x00-\x7E]/,"\x1A", string);
    return string;
}
# Builds the regular expression used to recognise stop words.
#
# The words are read from  pwd "/../lib/lang/" lang "/stopwords.txt",  one per
# line; blank lines and lines starting with "#" are skipped. Each remaining
# line is used verbatim as one alternative of the expression.
#
# Globals read: pwd, lang, and the "stopwords" option (via option_value()).
#
# Returns a regex *string*:
#   "^(w1|w2|...)$"  when stop-word removal is requested and words were found
#   "^$"             otherwise (matches only the empty string, which a field
#                    between 1 and NF never is)
function get_stopwords_regex(    file, regex, line) {
    # The list is only needed when removal is requested, i.e. when the option
    # compares equal to 0 (the same test transform_line() uses).
    #
    # The old early exit was `return /^$/`. In awk a bare regex constant used
    # as a value means `$0 ~ /^$/`, so that statement returned 0 or 1 instead
    # of a pattern.
    if (option_value("stopwords") != 0) {
        return "^$";
    }
    file = pwd "/../lib/lang/" lang "/stopwords.txt";
    regex = "";
    while ((getline line < file) > 0) {
        # skip blank lines and lines started with #
        if (line ~ /^[[:space:]]*$/ || line ~ /^#/) continue;
        regex = regex "|" line;
    }
    # Release the descriptor. Harmless when the file could not be opened.
    close(file);
    if (regex == "") {
        return "^$";
    }
    # substr(regex, 2) removes the leading pipe
    return "^(" substr(regex, 2) ")$";
}
# Separates the tokens of the current record ($0) with spaces, so that the
# default field splitting yields one token per field.
#
# Steps:
#   1. pad $0 with one space on each side, so the rules below also apply at
#      the line edges;
#   2. turn no-break spaces into ordinary spaces;
#   3. isolate brackets, parentheses, braces and em dashes;
#   4. detach sentence punctuation that is followed by space or punctuation;
#   5. detach quotation marks that touch space or punctuation on either side.
#
# Works in place on $0; returns nothing.
function separate_tokens() {
    $0=" " $0 " ";
    # U+00A0 NO-BREAK SPACE is the byte pair C2 A0 in UTF-8. Matching the lone
    # byte A0 misses it, or in a byte-oriented locale damages characters that
    # merely contain that byte (for example "à", which is C3 A0).
    gsub(/\xC2\xA0/, " ");
    gsub(/[]()—{}[]/, " & ");
    gsub(/[.,;:!?…][[:space:][:punct:]]/, " &");
    gsub(/[[:space:][:punct:]][\x22\x27“”‘’«»]/, "& ");
    gsub(/[\x22\x27“”‘’«»][[:space:][:punct:]]/, " &");
}
# Joins digit groups that are separated by white space inside the current
# record ($0):
#
#     123 456 789,01 -> 123456789,01
#
# A group is joined when a run of digits is followed by one white-space
# character and exactly three digits, and the whole sequence is delimited on
# both sides by white space or punctuation. The line edges count as
# delimiters too.
#
# Works in place on $0 (only reassigned when something was joined).
function join_numbers(    number, line, changed) {
    # Work on a padded copy so that a number at the very start or end of the
    # line has a delimiter to match against.
    line = " " $0 " ";
    changed = 0;
    while (match(line, /[[:space:][:punct:]][0-9]+[[:space:]][0-9][0-9][0-9][[:space:][:punct:]]/)) {
        # The match includes one delimiter on each side; strip both.
        number = substr(line, RSTART + 1, RLENGTH - 2);
        sub(/[[:space:]]/, "", number);
        # Keep everything up to and including the leading delimiter. The start
        # index must be 1: with start 0, POSIX awk and gawk return one
        # character fewer and the delimiter would be lost.
        line = substr(line, 1, RSTART) number substr(line, RSTART + RLENGTH - 1);
        changed = 1;
    }
    if (changed) {
        # Drop the two padding spaces again.
        $0 = substr(line, 2, length(line) - 2);
    }
}
# Builds the global `records` table from the per-token statistics.
#
# One record is produced per distinct token in `counters`. For every enabled
# entry of `fields` (value not equal to 0) the matching column is stored as
# records[r, key], where key is one of: token, type, count, ratio, format,
# case, mask, length, indexes. records[0] holds the number of records.
#
# Globals read:    tokens, counters, types, formats, cases, masks, indexes,
#                  fields, sort_order
# Globals written: records
#
# Exits with status 1 when the sum of all counters differs from the number of
# tokens, which would mean the bookkeeping is inconsistent.
#
# NOTE(review): `sort_order` holds a gawk PROCINFO["sorted_in"] name (see
# get_sort_order()), but nothing visible in this file applied it. It is
# applied here to the traversal of `counters`, the only array for which both
# "@ind_str_*" (token) and "@val_num_*" (count) make sense. On awks other than
# gawk the setting has no effect. Confirm this matches the intended behaviour.
function generate_records(    token, count, ratio, sum, r, f, flength, key, val, total, had_order, saved_order)
{
    total = length(tokens);
    flength = fields[0];

    # start of operational checks #
    sum = 0;
    for (token in counters) {
        sum += counters[token];
    }
    if (sum != total) {
        print "Wrong sum of counts" > "/dev/stderr";
        exit 1;
    }
    # end of operational checks #

    # Apply the requested order for this traversal only; the previous setting
    # is put back afterwards.
    if (sort_order) {
        had_order = ("sorted_in" in PROCINFO);
        if (had_order) saved_order = PROCINFO["sorted_in"];
        PROCINFO["sorted_in"] = sort_order;
    }

    r = 0;
    for (token in counters) {
        r++;
        count = counters[token];
        ratio = count / total;
        for (f = 1; f <= flength; f++) {
            key = fields[f,"key"];
            val = fields[f,"value"];
            if (val == 0) continue;   # column disabled
            if (key == "token") {
                records[r,"token"] = token;
            } else if (key == "type") {
                records[r,"type"] = types[token];
            } else if (key == "count") {
                records[r,"count"] = count;
            } else if (key == "ratio") {
                records[r,"ratio"] = ratio;
            } else if (key == "format") {
                records[r,"format"] = formats[token];
            } else if (key == "case") {
                records[r,"case"] = cases[token];
            } else if (key == "mask") {
                records[r,"mask"] = masks[token];
            } else if (key == "length") {
                records[r,"length"] = length(token);
            } else if (key == "indexes") {
                records[r,"indexes"] = indexes[token];
            }
        }
    }

    if (sort_order) {
        if (had_order) PROCINFO["sorted_in"] = saved_order;
        else delete PROCINFO["sorted_in"];
    }

    # array length
    records[0] = r;
}
# Writes the `records` table to the destination named by the global `output`,
# as tab-separated text.
#
# The first line is a header with the upper-cased key of every enabled field
# (value not equal to 0); one line per record follows, with the columns in
# the same order.
#
# Globals read: fields, records, output.
function print_records(    sep, r, f, rlength, flength, cell)
{
    flength = fields[0];
    rlength = records[0];
    if (!length(records)) return;

    # Row 0 stands for the header line, rows 1..rlength for the records.
    for (r = 0; r <= rlength; r++) {
        sep = "";
        for (f = 1; f <= flength; f++) {
            if (fields[f,"value"] == 0) continue;
            if (r == 0) {
                cell = toupper(fields[f,"key"]);
            } else {
                cell = records[r,fields[f,"key"]];
            }
            printf "%s%s", sep, cell > output;
            sep = "\t";
        }
        printf "\n" > output;
    }
}
# Returns the last component of a path, i.e. whatever follows the final "/".
# A path without "/" is returned unchanged; a path ending in "/" yields "".
function basename(file,    parts, n) {
    n = split(file, parts, "/");
    return parts[n];
}
# Returns the directory part of a path.
#
#   "a/b/c.txt" -> "a/b"
#   "/c.txt"    -> "/"    (a file in the root directory)
#   "c.txt"     -> "."    (no directory given: current directory)
#   ""          -> "."
#
# A path that ends in "/" is returned unchanged.
function basedir(file) {
    # Without a slash there is no directory component to cut. The name used to
    # be returned as-is, which made callers treat the file as its own
    # directory.
    if (index(file, "/") == 0) return ".";
    sub("/[^/]+$", "", file);
    # Cutting "/name" from a root-level path leaves nothing; that is the root.
    if (file == "") return "/";
    return file;
}
# Loads the field and option settings into the global arrays `fields` and
# `options`.
#
# Sources, in order of precedence:
#   1. the FIELDS / OPTIONS variables, when they are non-empty;
#   2. the "key = value" lines of  pwd "/../abw.conf"  ("#" starts a comment;
#      lines that are not of the form key=value are ignored);
#   3. the built-in defaults of parse_fields() / parse_options(), used when
#      neither of the above yields a recognised entry.
#
# The configuration file mixes field and option keys; each parser keeps only
# the keys it knows.
#
# Globals read:    pwd, FIELDS, OPTIONS
# Globals written: fields, options
function parse_confs(    file, line, string)
{
    file = pwd "/../abw.conf";
    string = "";
    while ((getline line < file) > 0) {
        # skip comments
        gsub(/#.*$/, "", line);
        # skip invalid lines
        if (line !~ /^[[:space:]]*[[:alnum:]]+[[:space:]]*=[[:space:]]*[[:alnum:]]+[[:space:]]*$/) continue;
        # The check above allows blanks around the key and the value, but the
        # key/value parser compares keys literally, so remove them here.
        gsub(/[[:space:]]+/, "", line);
        if (string == "") string = line;
        else string = string "," line;
    }
    close(file);

    # The old fallback tested `length(fields) == 0` right after `fields[0]`
    # had been assigned, which can never be true, so the configuration file
    # was read but never used. Choose the source explicitly instead.
    fields[0] = 0; # declare array
    parse_fields(FIELDS ? FIELDS : string, fields);
    if (fields[0] == 0) {
        parse_fields("", fields);   # nothing recognised: built-in defaults
    }

    options[0] = 0; # declare array
    parse_options(OPTIONS ? OPTIONS : string, options);
    if (options[0] == 0) {
        parse_options("", options); # nothing recognised: built-in defaults
    }
}
# Parses a field specification into `fields`.
#
# Parameters:
#   string  comma-separated list of "key", "key=value" or "key:value" items;
#           when empty, the full default list below is used
#   fields  array filled by parse_key_values()
#
# Only the keys named in the default list are accepted.
function parse_fields(string, fields,    default_string)
{
    default_string = "token,type,count,ratio,format,case,mask,length,indexes";
    gsub(":", "=", string);   # "key:value" is an alias of "key=value"
    string = (string ? string : default_string);
    parse_key_values(string, fields, default_string);
}
# Parses an option specification into `options`.
#
# Parameters:
#   string   comma-separated list of "key", "key=value" or "key:value" items;
#            when empty, the default list below is used
#   options  array filled by parse_key_values()
#
# Only the keys named in the default list are accepted.
function parse_options(string, options,    default_string)
{
    default_string = "ascii=0,lower=0,upper=0,stopwords=1,lang=none,eol=1,asc=none,desc=none";
    gsub(":", "=", string);   # "key:value" is an alias of "key=value"
    string = (string ? string : default_string);
    parse_key_values(string, options, default_string);
}
# Parses a comma-separated list of settings into `keyvalues`.
#
# Item formats: 'key' or 'key=value' (callers translate 'key:value' first).
# If the format is 'key', name is 'key' and value is 1.
# If the format is 'key=value', name is 'key' and value is 'value'.
#
# Parameters:
#   string          the list to parse
#   keyvalues       output array: keyvalues[i,"key"] and keyvalues[i,"value"]
#                   for i = 1..n in input order, keyvalues[0] = n
#   default_string  list in the same format; its keys are the only keys
#                   accepted, items with any other key are dropped
function parse_key_values(string, keyvalues, default_string,    items, i, key, value, splitter, known, n, count)
{
    # Collect the accepted keys. An exact lookup replaces the former regex
    # test, which also accepted words that only appear as default *values*
    # and relied on the gawk-only "\<" and "\>" operators.
    n = split(default_string, items, ",");
    for (i = 1; i <= n; i++) {
        key = items[i];
        sub(/=.*$/, "", key);
        if (key != "") known[key] = 1;
    }

    count = 0;
    n = split(string, items, ",");
    for (i = 1; i <= n; i++) {
        splitter = index(items[i], "=");
        if (splitter == 0) {
            key = items[i];
            value = 1;
        } else {
            # Start at 1: with start 0, POSIX awk and gawk return one
            # character fewer, which cut the last letter off every key.
            key = substr(items[i], 1, splitter - 1);
            value = substr(items[i], splitter + 1);
        }
        if (!(key in known)) continue;   # unknown (or empty) key
        count++;
        keyvalues[count,"key"] = key;
        keyvalues[count,"value"] = value;
    }

    # save the array length
    keyvalues[0] = count;
}
# Translates the "asc" / "desc" options into a sort-order name.
#
#   asc=token  -> "@ind_str_asc"     desc=token -> "@ind_str_desc"
#   asc=count  -> "@val_num_asc"     desc=count -> "@val_num_desc"
#
# Any other value leaves the result untouched. When several entries qualify,
# the last one in `options` wins. Returns an empty value when none does.
function get_sort_order(    sort_order, o, olength, key, val)
{
    olength = options[0];
    o = 0;
    while (++o <= olength) {
        key = options[o,"key"];
        val = options[o,"value"];
        if (key == "asc") {
            if (val == "token") sort_order = "@ind_str_asc";
            else if (val == "count") sort_order = "@val_num_asc";
        } else if (key == "desc") {
            if (val == "token") sort_order = "@ind_str_desc";
            else if (val == "count") sort_order = "@val_num_desc";
        }
    }
    return sort_order;
}
# Blanks every field of the current record that matches the global
# `stopwords_regex`. Both the field and the expression are lower-cased before
# matching, so the comparison ignores case.
#
# The fields are emptied, not removed: NF stays the same and $0 is rebuilt by
# awk with the blanked fields in place.
function remove_stopwords(    i)
{
    i = 0;
    while (++i <= NF) {
        if (tolower($i) !~ tolower(stopwords_regex)) continue;
        $i = "";
    }
}
# Applies the text transformations selected in `options` to the current
# record ($0), in the order in which the options are stored:
#
#   ascii=1      fold to ASCII with toascii()
#   lower=1      convert to lower case
#   upper=1      convert to upper case
#   stopwords=0  blank the stop words with remove_stopwords()
#
# Any other option, or any other value, changes nothing.
function transform_line(    o, olength, key)
{
    olength = options[0];
    for (o = 1; o <= olength; o++) {
        key = options[o,"key"];
        if (key == "ascii" && options[o,"value"] == 1) {
            $0 = toascii($0);
        } else if (key == "lower" && options[o,"value"] == 1) {
            $0 = tolower($0);
        } else if (key == "upper" && options[o,"value"] == 1) {
            $0 = toupper($0);
        } else if (key == "stopwords" && options[o,"value"] == 0) {
            remove_stopwords();
        }
    }
}
# Looks up an option by name in the global `options` array.
#
# Returns the value of the first entry whose key equals `key`, or 0 when no
# such entry exists.
function option_value(key,    o, olength) {
    olength = options[0];
    o = 0;
    while (++o <= olength) {
        if (options[o,"key"] != key) continue;
        return options[o,"value"];
    }
    return 0;
}
# Start-up: load the configuration and derive the settings used by the rules.
BEGIN {
    # PWD is not set in this block; it is presumably passed in by the caller
    # (for example with -v).
    pwd = PWD;

    # Must run first: everything below reads the parsed options.
    parse_confs();

    sort_order = get_sort_order();
    lang = option_value("lang");
    eol = option_value("eol");

    # Needs `lang`, which selects the stop-word file.
    stopwords_regex = get_stopwords_regex();
}
# Writes the report for the input file that has just been read and resets the
# per-file state.
#
# The destination is WRITETO with the first ":filedir" and ":filename"
# placeholders replaced by the directory and the base name of that file.
#
# The file is identified by `current_file`, which the FNR == 1 rule below
# records while the file is being read. FILENAME cannot be used here: when
# this function is called at a file boundary, FILENAME already names the
# *next* file, so every report would be written under the wrong name.
#
# NOTE: the names are used as sub() replacement text, so an "&" or a backslash
# in a path is interpreted by sub() rather than copied literally.
function endfile() {
    output = WRITETO;
    filedir = basedir(current_file);
    filename = basename(current_file);
    sub(/:filedir/, filedir, output);
    sub(/:filename/, filename, output);
    generate_records();
    print_records();
    # reset the per-file state
    idx = 0;
    delete tokens;
    delete types;
    delete counters;
    delete formats;
    delete cases;
    delete masks;
    delete indexes;
    delete records;
}

# First record of a file: flush the previous file (if any), then remember
# which file the following tokens belong to.
FNR == 1 {
    if (NR > 1) endfile();
    current_file = FILENAME;
}

# Non-empty record: normalise it and store its tokens.
NF {
    join_numbers();
    transform_line();
    separate_tokens();
    for (i = 1; i <= NF; i++) {
        insert_token($i);
    }
    # Option values parsed from "key=value" are strings, and the string "0"
    # is true in a bare condition. Compare with 1, as transform_line() does
    # for its flags, so that eol=0 really turns the marker off.
    if (eol == 1) insert_token("<eol>");
}

# End of input: flush the last file.
END {
    endfile();
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# CGI script: replies with the current date and time as printed by
# `date -Iseconds`.
#
# The empty line after the Content-Type header is required: it marks the end
# of the CGI header block. Without it the body line is read as a header.
cat <<EOF
Content-Type: text/json; charset=utf-8

$(date -Iseconds)
EOF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# CGI script: replies with a static "Hello, World!" HTML page.
#
# The empty line after the Content-Type header is required: it marks the end
# of the CGI header block. Without it the markup is read as header lines.
cat <<EOF
Content-Type: text/html; charset=utf-8

<!DOCTYPE html>
<html>
<head>
<title>Hello, World!</title>
</head>
<body>
<p>Hello, World!</p>
</body>
</html>
EOF
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment