Created March 2, 2017 16:50
Application task for the school of real C++ programming [Unigine, Shodan].
blacklist.txt (generated by unicode_lookup.py):
LINES = 5
́
̈
̂
⃗
̆
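(The five nearly invisible entries above are combining marks: acute accent U+0301, diaeresis U+0308, circumflex U+0302, vector arrow U+20D7 and breve U+0306, matching the `removing` set built by unicode_lookup.py below.)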
The C++ solution:
// Task from https://docs.google.com/forms/d/e/1FAIpQLSc-T7mmLsI4skRKbm2px3ssBmtv1cbiIp1oBpvYXC_62qb9tw/viewform
// Mirror: https://www.evernote.com/shard/s171/sh/57bba412-db24-4496-8aee-fbc1ef18db88/5162aeea37a68042ba7f162356edb83c
//
//
// Key points:
//   * no UTF-8 "code point" encoding can start inside another "code point"
//     (UTF-8 is self-synchronizing)
//   * we need to support Russian, including "combining inverted breve" over `и'
//   * it is enough to map `й' -> `и', `Ё' -> `е', etc.
//   * according to the test on the problem page, the order is lexicographical
//
//
// Decision:
//   * std::map to normalize case, map separators to space, strip accents
//   * std::set to drop accents and selected modifiers
//   * use a good source of knowledge to build this map (Python + unicodedata)
//
//
// References and links:
//   [1] Russian codepage [http://www.utf8-chartable.de/unicode-utf8-table.pl?start=1024]
//   [2] Russian characters are two bytes each: [0xd0, 0xd3] × [0x80, 0xbf].
//   [3] https://habrahabr.ru/post/262679/
//
//
// WARNING!
// ========
// I strip all accents; for some languages that is unacceptable. You can refine
// the 'UTF8Tokenizer::blacklist' rejection - apply it only if the codepoint
// is Russian/English, which is easy to do (see comment in 'UTF8Tokenizer::eat()').
// Russian codepoints start with a lead byte 0xd0, .., 0xd3.
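//
// (A sketch of that check - my addition, not part of the original solution:
//  a single byte below 0x80 is ASCII, i.e. potentially English, and a lead
//  byte in [0xd0, 0xd3] starts a Cyrillic codepoint, so inside 'eat()' one
//  could compute
//      bool ru_or_en = byte < 0x80 || (0xd0 <= byte && byte <= 0xd3);
//  at the (**) points below and skip the blacklist rejection when it is false.)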
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <algorithm>
#include <vector>
#include <string>
#include <map>
#include <set>
#include <iostream>
// ----------------------------------------------------------------------------
// An instance of this class can be enhanced with a 'replacing dictionary' and
// a blacklist. Usage:
//   UTF8Tokenizer u;
//   u.load_mapping("mappings.txt");
//   u.load_blacklist("blacklist.txt");
//
//   const char* res1 = u.eat(byte);
//   const char* res2 = u.eat(byte);
//   ...
// You feed bytes one by one to 'u.eat()' and it returns 'nullptr' if the token
// is not assembled yet (or was rejected), or an ASCIIZ C string holding the
// UTF-8 codepoint.
// ----------------------------------------------------------------------------
class UTF8Tokenizer {
    std::map<std::string, std::string> simplify;  // Й -> и, ё -> е, ...
    std::set<std::string> blacklist;              // Stress marks, breves, accents.
    int pos;
    int chunk_size;
    char chunk[5];
public:
    UTF8Tokenizer() : pos(-1), chunk_size(0)
    {}
    void load_mapping(const char* filename)
    {
        // Added manually, to simplify parsing of the replacement data file.
        simplify["'"] = " ";
        simplify["\""] = " ";
        simplify["\n"] = " ";
        FILE *fp = fopen(filename, "rb");
        assert(fp && "Please run `unicode_lookup.py` to generate maps.");
        int numlines;
        fscanf(fp, "LINES = %d ", &numlines);
        char from[5], to[5];
        for (int i = 0 ; i < numlines ; ++i) {
            fscanf(fp, "'%4[^']' -> '%4[^']'\n", from, to);  // width 4 guards the buffers
            simplify[std::string(from)] = std::string(to);
        }
        fclose(fp);
    }
    void load_blacklist(const char* filename)
    {
        FILE *fp = fopen(filename, "rb");
        assert(fp && "Please run `unicode_lookup.py` to generate maps.");
        int numlines;
        fscanf(fp, "LINES = %d\n", &numlines);
        char token[5];
        for (int i = 0 ; i < numlines ; ++i) {
            fscanf(fp, "%4[^\n]\n", token);
            blacklist.insert(std::string(token));
        }
        fclose(fp);
    }
    int utf8size(int byte)
    {
        // TODO: 3-bit lookup table? 8 elements, seems ok.
        if (byte < 128) {
            return 1;
        }
        int count = 0;
        while (byte & 128) {
            byte <<= 1;
            count++;
        }
        return count;
    }
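    // (A sketch of the TODO above - my addition, not part of the original
    //  solution. Three bits cannot tell a 3-byte lead from a 4-byte one
    //  (1110xxxx vs 11110xxx), so index by the top FOUR bits instead:
    //      static const int lut[16] = { 1, 1, 1, 1, 1, 1, 1, 1,  // 0xxx....: ASCII
    //                                   0, 0, 0, 0,              // 10xx....: continuation byte
    //                                   2, 2,                    // 110x....: 2-byte lead
    //                                   3,                       // 1110....: 3-byte lead
    //                                   4 };                     // 1111....: 4-byte lead
    //      int size = lut[(byte >> 4) & 0x0f];
    //  where 0 marks a byte that cannot start a codepoint.)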
    // Eat bytes one by one and return either a valid UTF-8 codepoint or
    // `nullptr` if the token is not assembled yet. See the full comment
    // above the class.
    const char* eat(int byte)
    {
        if (pos < 0) {
            chunk_size = utf8size(byte);
            // Set or drop the 'Russian or English' flag here and use it below (**)
            pos = 0;
        }
        chunk[pos++] = byte;
        if (pos == chunk_size) {
            chunk[pos] = 0;
            pos = -1;
            // Below you can optionally cancel rejection for non-Russian languages.
            auto drop_pos = blacklist.find(chunk);
            if (drop_pos != blacklist.end())  // (**)
                return nullptr;
            auto simplify_pos = simplify.find(chunk);
            if (simplify_pos != simplify.end()) {
                return simplify_pos->second.c_str();
            }
            return chunk;
        }
        return nullptr;
    }
    ~UTF8Tokenizer ()
    {
        assert(pos < 0 && "There is an unfinished token waiting in the buffer.");
    }
};
// ----------------------------------------------------------------------------
// Simple wrapper for a pair "word + count".
// Provides the operators necessary to interface with std::sort().
// Note that counts are ordered descending while words are sorted normally:
//
//   +-- [ https://en.wikipedia.org/wiki/UTF-8#Advantages ]
//   | Sorting a set of UTF-8 encoded strings as strings of unsigned bytes yields
//   | the same order as sorting the corresponding Unicode strings
//   | lexicographically by codepoint.
// ----------------------------------------------------------------------------
struct Pair {
    int count;
    std::string word;
    Pair(const char *word) : count(1), word(word)
    {
    }
    // Required by 'std::sort'.
    Pair& operator= (const Pair& other)
    {
        // Protect against invalid self-assignment.
        if (this != &other)
        {
            count = other.count;
            word = other.word;
        }
        // By convention, always return *this.
        return *this;
    }
    // Required by 'std::sort'.
    bool operator<(const Pair& b) const
    {
        // ::count in decreasing, ::word in lexicographically increasing order.
        if (count > b.count)
            return true;
        if (count < b.count)
            return false;
        return strcmp(word.c_str(), b.word.c_str()) < 0;
    }
};
// ----------------------------------------------------------------------------
// Container for incoming words.
// Search is linear; a simple map<word, count> would fix that if necessary.
// ----------------------------------------------------------------------------
class FreqCounter {
    std::vector<Pair> dict;
public:
    void insert(const char *word)
    {
        for (size_t i = 0 ; i < dict.size() ; ++i) {
            if (dict[i].word == word) {
                dict[i].count++;
                return;
            }
        }
        dict.push_back(Pair(word));
    }
    void dump_sorted_into_file(const char* filename)
    {
        FILE *fp = fopen(filename, "wb");
        assert(fp && "Cannot open output file for writing.");
        std::sort(dict.begin(), dict.end());
        for (size_t i = 0 ; i < dict.size() ; ++i)
            fprintf(fp, "%d %s\n", dict[i].count, dict[i].word.c_str());
        fclose(fp);
    }
};
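// (Sketch of that fix - my addition, not part of the original solution:
//  keep a std::map<std::string, int> counts and do ++counts[word] in
//  'insert()', then copy the map into the vector<Pair> right before
//  sorting; insertion cost drops from linear to logarithmic per word.)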
// ----------------------------------------------------------------------------
// Structure:
//   prepare instances of the tokenizer and the counter
//   feed the incoming text char by char to the UTF-8 assembler
//   outgoing UTF-8 codepoints are merged into words using a simple finite
//     automaton
//   dump the result into a file.
// ----------------------------------------------------------------------------
int main(int argc, char **argv)
{
    assert(argc == 3 && "Usage: ./a.out in.txt out.txt");
    FILE *fin = fopen(argv[1], "rb");
    assert(fin && "Cannot open input file for reading.");
    FreqCounter fq;
    UTF8Tokenizer utf8tokenizer;
    utf8tokenizer.load_mapping("mappings.txt");
    utf8tokenizer.load_blacklist("blacklist.txt");
    // Trying automata-based programming [Shalyto].
    enum {EATING_SPACES, EATING_CHARS};
    int state = EATING_SPACES;
    int c;
    std::string word = "";
    while ((c = fgetc(fin)) != EOF) {
        const char *token = utf8tokenizer.eat(c);
        if (!token) {
            continue;
        }
        switch (state) {
        case EATING_SPACES:
            if (token[0] != ' ') {
                state = EATING_CHARS;
                word = token;
            }
            break;
        case EATING_CHARS:
            if (token[0] == ' ') {
                state = EATING_SPACES;
                fq.insert(word.c_str());
                word = "";
            } else {
                word += token;
            }
            break;
        };
    }
    fclose(fin);
    if (state == EATING_CHARS)  // Flush the last word if the file does not end with a separator.
        fq.insert(word.c_str());
    fq.dump_sorted_into_file(argv[2]);
    return 0;
}
mappings.txt (generated by unicode_lookup.py):
LINES = 160
'Э' -> 'э'
'V' -> 'v'
'А' -> 'а'
'Д' -> 'д'
'X' -> 'x'
'И' -> 'и'
'М' -> 'м'
'Р' -> 'р'
'Ф' -> 'ф'
'г' -> 'г'
'Ш' -> 'ш'
',' -> ' '
'\' -> ' '
'0' -> ' '
'n' -> 'n'
'4' -> ' '
'и' -> 'и'
'м' -> 'м'
'р' -> 'р'
'ф' -> 'ф'
'ш' -> 'ш'
'ь' -> 'ь'
'9' -> ' '
'b' -> 'b'
'T' -> 't'
'2' -> ' '
';' -> ' '
'`' -> ' '
'd' -> 'd'
'=' -> ' '
'l' -> 'l'
'p' -> 'p'
'?' -> ' '
'h' -> 'h'
'x' -> 'x'
'|' -> ' '
'A' -> 'a'
'C' -> 'c'
'Г' -> 'г'
'З' -> 'з'
'E' -> 'e'
'Л' -> 'л'
'5' -> ' '
'П' -> 'п'
'У' -> 'у'
'G' -> 'g'
'Ч' -> 'ч'
'Ы' -> 'ы'
'Я' -> 'я'
'I' -> 'i'
'3' -> ' '
'з' -> 'з'
'л' -> 'л'
'K' -> 'k'
'п' -> 'п'
't' -> 't'
'у' -> 'у'
'ч' -> 'ч'
' ' -> ' '
'M' -> 'm'
'ы' -> 'ы'
'я' -> 'я'
'S' -> 's'
'"' -> ' '
'O' -> 'o'
'W' -> 'w'
'[' -> ' '
'_' -> ' '
'$' -> ' '
'Q' -> 'q'
'c' -> 'c'
'z' -> 'z'
'g' -> 'g'
'k' -> 'k'
'&' -> ' '
'o' -> 'o'
'7' -> ' '
's' -> 's'
'w' -> 'w'
'(' -> ' '
'{' -> ' '
'6' -> ' '
'Ъ' -> 'ъ'
'Ь' -> 'ь'
'В' -> 'в'
'Ж' -> 'ж'
'К' -> 'к'
'>' -> ' '
'О' -> 'о'
'Т' -> 'т'
'Ц' -> 'ц'
'*' -> ' '
'Ю' -> 'ю'
'в' -> 'в'
'ж' -> 'ж'
'к' -> 'к'
'д' -> 'д'
'т' -> 'т'
'ц' -> 'ц'
'ъ' -> 'ъ'
'D' -> 'd'
'ю' -> 'ю'
'R' -> 'r'
'8' -> ' '
'Z' -> 'z'
'^' -> ' '
':' -> ' '
'f' -> 'f'
'j' -> 'j'
'<' -> ' '
'r' -> 'r'
'v' -> 'v'
'о' -> 'о'
'~' -> ' '
'Ё' -> 'е'
'@' -> ' '
'Б' -> 'б'
'B' -> 'b'
'Е' -> 'е'
'Й' -> 'и'
'Н' -> 'н'
'/' -> ' '
'С' -> 'с'
'Х' -> 'х'
'#' -> ' '
'Щ' -> 'щ'
'F' -> 'f'
'-' -> ' '
'б' -> 'б'
'.' -> ' '
'е' -> 'е'
'H' -> 'h'
'й' -> 'и'
'н' -> 'н'
'с' -> 'с'
'!' -> ' '
'J' -> 'j'
'х' -> 'х'
'щ' -> 'щ'
'э' -> 'э'
'а' -> 'а'
'L' -> 'l'
'ё' -> 'е'
'U' -> 'u'
'Y' -> 'y'
'%' -> ' '
'N' -> 'n'
']' -> ' '
'a' -> 'a'
'e' -> 'e'
'P' -> 'p'
'i' -> 'i'
'm' -> 'm'
'q' -> 'q'
')' -> ' '
'1' -> ' '
'u' -> 'u'
'y' -> 'y'
'}' -> ' '
'+' -> ' '
unicode_lookup.py:
# -*- coding: UTF-8 -*-
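# NOTE: this is a Python 2 script (print statements, dict.iteritems);
# run it once to generate mappings.txt and blacklist.txt for the C++ binary.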
import unicodedata
import codecs

russian = u"йцукеёнгшщзхъфывапролджэячсмитьбю"

def make():
    """
    Our goal is to prepare lookup tables for the UTF-8 Russian text tokenizer.
    We treat `й' as `и' and `ё' as `е'.
    """
    replacing = {
        u'ё': u'е',
        u'й': u'и',
    }
    # You can add any number of s⃗pêciál symbols here.
    removing = set(u'\u0301 \u20d7 \u0302'.split())

    def extract_modifiers(u):
        u = unicodedata.normalize(u"NFKD", u)
        map(removing.add, u[1:])

    for c in russian:
        replacing[c] = replacing.get(c, c)
        replacing[c.upper()] = replacing.get(c, c)
        extract_modifiers(c)
        extract_modifiers(c.upper())

    # Post-mortem fix: was range(32, 127), which left tabs, ^M and other
    # control characters unmapped, so they were counted as words.
    for c in map(chr, range(0, 127)):
        if c.isalpha():
            replacing[c] = c.lower()
        elif c not in "\n'":  # These two are added manually in the C++ code.
            replacing[c] = ' '

    with codecs.open('mappings.txt', 'w', 'utf-8') as fp:
        fp.write(u"LINES = %d\n" % len(replacing))
        for item in replacing.iteritems():
            fp.write(u"'%s' -> '%s'\n" % item)

    with codecs.open('blacklist.txt', 'w', 'utf-8') as fp:
        fp.write(u"LINES = %d\n" % len(removing))
        for item in removing:
            fp.write(u"%s\n" % item)

    print u"\n".join([u"'%s' -> '%s'" % i for i in replacing.iteritems()])
    print u"\n".join([u"'%s' = 'a%s'" % (repr(i), i) for i in removing])

make()
PostMortem
There is a bug in the code, and at UNIGINE they found it instantly :-). ^M, which should act as a separator, was taken for a word and counted.
The reason: instead of a locale and its traditional isalpha for telling "letter" from "non-letter", there was a dictionary that mapped all separators to ' ', lowercased everything, normalized 'Й' -> 'и', stripped stress marks and so on (isalpha and tolower at once, in a single pass).
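For illustration, a minimal sketch of that traditional approach (my assumption of what it would look like, not part of the submission): let the C library's locale machinery classify characters.

    // locale_sketch.cpp (hypothetical file name).
    #include <clocale>
    #include <cstdio>
    #include <cwchar>
    #include <cwctype>
    #include <string>

    int main()
    {
        setlocale(LC_ALL, "ru_RU.UTF-8");    // requires an installed UTF-8 locale
        std::wstring word;
        wint_t wc;
        while ((wc = fgetwc(stdin)) != WEOF) {
            if (iswalpha(wc)) {
                word += (wchar_t)towlower(wc);   // a letter extends the word
            } else if (!word.empty()) {
                wprintf(L"%ls\n", word.c_str()); // any non-letter ends it
                word.clear();
            }
        }
        if (!word.empty())
            wprintf(L"%ls\n", word.c_str());
        return 0;
    }

Here '\t' and '\r' are non-letters out of the box, which is exactly the case the hand-built dictionary missed.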
That dictionary was built in Python (unicode_lookup.py), where line 33, the character loop, has to be changed to

    for c in map(chr, range(0, 127)):

instead of

    for c in map(chr, range(32, 127)):

Then all the tabs, ^M and so on get entered into the dictionary and are handled as they should be (their ASCII codes are below 32). The fix is already applied in the listing above.
Overall they said that the solution is, in essence, overengineered.
For comparison there is Ruslan Abdikeev's variant, quite interesting to read:
the new one and the old one.
The discussion is here.
Waiting for the second intake ;-)