blacklist.txt (generated by unicode_lookup.py):
LINES = 5
́
̈
̂
⃗
̆

// Task from https://docs.google.com/forms/d/e/1FAIpQLSc-T7mmLsI4skRKbm2px3ssBmtv1cbiIp1oBpvYXC_62qb9tw/viewform
// Mirror: https://www.evernote.com/shard/s171/sh/57bba412-db24-4496-8aee-fbc1ef18db88/5162aeea37a68042ba7f162356edb83c
//
//
// Key points:
// * no UTF-8 "code point" can start inside another "code point" (lead and
//   continuation bytes never overlap)
// * we need to support Russian, including the combining breve over `и'
// * it is enough to map `й' -> `и', `Ё' -> `е', etc.
// * according to the test on the problem page, the order is lexicographical
//
//
// Decision:
// * std::map to normalize case, map separators to spaces, and simplify letters
// * std::set to drop accents and selected combining modifiers
// * use a reliable source of knowledge to build these maps (Python + unicodedata)
//
//
// References and links:
// [1] Russian code page [http://www.utf8-chartable.de/unicode-utf8-table.pl?start=1024]
// [2] Russian characters are 2 bytes each: [0xd0, 0xd3] × [0x80, 0xbf].
// [3] https://habrahabr.ru/post/262679/
//
//
// WARNING!
// ========
// I strip all accents; for some languages this is unacceptable. You can fix
// 'UTF8Tokenizer::blacklist' by applying blacklist rejection only if the codepoint
// is Russian/English, which is easy to do (see the comment in 'UTF8Tokenizer::eat()').
// Russian codepoints start with lead bytes 0xd0, .., 0xd3.
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <algorithm>
#include <vector>
#include <string>
#include <map>
#include <set>
#include <iostream>
// ----------------------------------------------------------------------------
// An instance of this class can be extended with a 'replacing dictionary' and a blacklist.
// Usage:
//     UTF8Tokenizer u;
//     u.load_mapping("mappings.txt");
//     u.load_blacklist("blacklist.txt");
//
//     const char* res1 = u.eat(byte);
//     const char* res2 = u.eat(byte);
//     ...
// You feed bytes one by one to 'u.eat()'; it returns 'nullptr' while the token is
// not yet assembled (or was rejected), or a NUL-terminated C string with the UTF-8 codepoint.
// ----------------------------------------------------------------------------
class UTF8Tokenizer {
    std::map<std::string, std::string> simplify;  // Й -> и, ё -> е, ...
    std::set<std::string> blacklist;              // Stress marks, breves, accents.
    int pos;
    int chunk_size;
    char chunk[5];
public:
    UTF8Tokenizer() : pos(-1), chunk_size(0)
    {}
    void load_mapping(const char* filename)
    {
        // Added manually to simplify parsing of the replacement data file.
        simplify["'"] = " ";
        simplify["\""] = " ";
        simplify["\n"] = " ";
        FILE *fp = fopen(filename, "rb");
        assert(fp && "Please run `unicode_lookup.py` to generate the maps.");
        int numlines;
        fscanf(fp, "LINES = %d ", &numlines);
        char from[5], to[5];
        for (int i = 0 ; i < numlines ; ++i) {
            fscanf(fp, "'%4[^']' -> '%4[^']'\n", from, to);
            simplify[std::string(from)] = std::string(to);
        }
        fclose(fp);
    }
    void load_blacklist(const char* filename)
    {
        FILE *fp = fopen(filename, "rb");
        assert(fp && "Please run `unicode_lookup.py` to generate the maps.");
        int numlines;
        fscanf(fp, "LINES = %d\n", &numlines);
        char token[5];
        for (int i = 0 ; i < numlines ; ++i) {
            fscanf(fp, "%4[^\n]\n", token);
            blacklist.insert(std::string(token));
        }
        fclose(fp);
    }
    int utf8size(int byte)
    {
        // TODO: 3-bit lookup table? 8 elements, seems ok.
        if (byte < 128) {
            return 1;
        }
        int count = 0;
        while (byte & 128) {
            byte <<= 1;
            count++;
        }
        return count;
    }
    // Eat bytes one by one and return either a complete UTF-8 codepoint or
    // `nullptr` if the token is not assembled yet. See the class-level comment above.
    const char* eat(int byte)
    {
        if (pos < 0) {
            chunk_size = utf8size(byte);
            // Set or clear a 'Russian or English' flag here and use it below (**)
            pos = 0;
        }
        chunk[pos++] = byte;
        if (pos == chunk_size) {
            chunk[pos] = 0;
            pos = -1;
            // Below you can optionally cancel rejection for non-Russian languages.
            auto drop_pos = blacklist.find(chunk);
            if (drop_pos != blacklist.end())  // (**)
                return nullptr;
            auto simplify_pos = simplify.find(chunk);
            if (simplify_pos != simplify.end()) {
                return simplify_pos->second.c_str();
            }
            return chunk;
        }
        return nullptr;
    }
    ~UTF8Tokenizer()
    {
        assert(pos < 0 && "There is an unfinished token waiting in the buffer.");
    }
};
// ----------------------------------------------------------------------------
// Simple wrapper for a pair "word + count".
// Provides the operators necessary to interface with std::sort().
// Note that counts are sorted in descending order while words are sorted in
// ascending (byte-lexicographic) order:
//
//   +-- [ https://en.wikipedia.org/wiki/UTF-8#Advantages ]
//   | Sorting a set of UTF-8 encoded strings as strings of unsigned bytes yields
//   | the same order as sorting the corresponding Unicode strings
//   | lexicographically by codepoint.
// ----------------------------------------------------------------------------
struct Pair {
    int count;
    std::string word;
    Pair(const char *word) : count(1), word(word)
    {
    }
    // Required by 'std::sort'.
    Pair& operator=(const Pair& other)
    {
        // Protect against invalid self-assignment.
        if (this != &other)
        {
            count = other.count;
            word = other.word;
        }
        // By convention, always return *this.
        return *this;
    }
    // Required by 'std::sort'.
    bool operator<(const Pair& b) const
    {
        // ::count in decreasing, ::word in lexicographically increasing order.
        if (count > b.count)
            return true;
        if (count < b.count)
            return false;
        return strcmp(word.c_str(), b.word.c_str()) < 0;
    }
};
// ----------------------------------------------------------------------------
// Container for incoming words.
// Search is linear; a simple map<word, count> will fix it if necessary.
// ----------------------------------------------------------------------------
class FreqCounter {
    std::vector<Pair> dict;
public:
    void insert(const char *word)
    {
        for (size_t i = 0 ; i < dict.size() ; ++i) {
            if (dict[i].word == word) {
                dict[i].count++;
                return;
            }
        }
        dict.push_back(Pair(word));
    }
    void dump_sorted_into_file(const char* filename)
    {
        FILE *fp = fopen(filename, "wb");
        assert(fp && "Cannot open output file for writing.");
        std::sort(dict.begin(), dict.end());
        for (size_t i = 0 ; i < dict.size() ; ++i)
            fprintf(fp, "%d %s\n", dict[i].count, dict[i].word.c_str());
        fclose(fp);
    }
};
// ----------------------------------------------------------------------------
// Structure:
//     prepare instances of the tokenizer and the counter
//     feed the incoming text char by char to the UTF-8 assembler
//     outgoing UTF-8 codepoints are merged into words using a simple finite
//         automaton
//     dump the result into a file.
// ----------------------------------------------------------------------------
int main(int argc, char **argv)
{
    assert(argc == 3 && "Usage: ./a.out in.txt out.txt");
    FILE *fin = fopen(argv[1], "rb");
    assert(fin && "Cannot open input file for reading.");
    FreqCounter fq;
    UTF8Tokenizer utf8tokenizer;
    utf8tokenizer.load_mapping("mappings.txt");
    utf8tokenizer.load_blacklist("blacklist.txt");
    // Trying out automata-based programming [Шалыто].
    enum {EATING_SPACES, EATING_CHARS};
    int state = EATING_SPACES;
    int c;
    std::string word = "";
    while ((c = fgetc(fin)) != EOF) {
        const char *token = utf8tokenizer.eat(c);
        if (!token) {
            continue;
        }
        switch (state) {
        case EATING_SPACES:
            if (token[0] != ' ') {
                state = EATING_CHARS;
                word = token;
            }
            break;
        case EATING_CHARS:
            if (token[0] == ' ') {
                state = EATING_SPACES;
                fq.insert(word.c_str());
                word = "";
            } else {
                word += token;
            }
            break;
        }
    }
    if (state == EATING_CHARS) {
        // Flush the last word if the input does not end with a separator.
        fq.insert(word.c_str());
    }
    fclose(fin);
    fq.dump_sorted_into_file(argv[2]);
    return 0;
}

mappings.txt (generated by unicode_lookup.py):
LINES = 160
'Э' -> 'э'
'V' -> 'v'
'А' -> 'а'
'Д' -> 'д'
'X' -> 'x'
'И' -> 'и'
'М' -> 'м'
'Р' -> 'р'
'Ф' -> 'ф'
'г' -> 'г'
'Ш' -> 'ш'
',' -> ' '
'\' -> ' '
'0' -> ' '
'n' -> 'n'
'4' -> ' '
'и' -> 'и'
'м' -> 'м'
'р' -> 'р'
'ф' -> 'ф'
'ш' -> 'ш'
'ь' -> 'ь'
'9' -> ' '
'b' -> 'b'
'T' -> 't'
'2' -> ' '
';' -> ' '
'`' -> ' '
'd' -> 'd'
'=' -> ' '
'l' -> 'l'
'p' -> 'p'
'?' -> ' '
'h' -> 'h'
'x' -> 'x'
'|' -> ' '
'A' -> 'a'
'C' -> 'c'
'Г' -> 'г'
'З' -> 'з'
'E' -> 'e'
'Л' -> 'л'
'5' -> ' '
'П' -> 'п'
'У' -> 'у'
'G' -> 'g'
'Ч' -> 'ч'
'Ы' -> 'ы'
'Я' -> 'я'
'I' -> 'i'
'3' -> ' '
'з' -> 'з'
'л' -> 'л'
'K' -> 'k'
'п' -> 'п'
't' -> 't'
'у' -> 'у'
'ч' -> 'ч'
' ' -> ' '
'M' -> 'm'
'ы' -> 'ы'
'я' -> 'я'
'S' -> 's'
'"' -> ' '
'O' -> 'o'
'W' -> 'w'
'[' -> ' '
'_' -> ' '
'$' -> ' '
'Q' -> 'q'
'c' -> 'c'
'z' -> 'z'
'g' -> 'g'
'k' -> 'k'
'&' -> ' '
'o' -> 'o'
'7' -> ' '
's' -> 's'
'w' -> 'w'
'(' -> ' '
'{' -> ' '
'6' -> ' '
'Ъ' -> 'ъ'
'Ь' -> 'ь'
'В' -> 'в'
'Ж' -> 'ж'
'К' -> 'к'
'>' -> ' '
'О' -> 'о'
'Т' -> 'т'
'Ц' -> 'ц'
'*' -> ' '
'Ю' -> 'ю'
'в' -> 'в'
'ж' -> 'ж'
'к' -> 'к'
'д' -> 'д'
'т' -> 'т'
'ц' -> 'ц'
'ъ' -> 'ъ'
'D' -> 'd'
'ю' -> 'ю'
'R' -> 'r'
'8' -> ' '
'Z' -> 'z'
'^' -> ' '
':' -> ' '
'f' -> 'f'
'j' -> 'j'
'<' -> ' '
'r' -> 'r'
'v' -> 'v'
'о' -> 'о'
'~' -> ' '
'Ё' -> 'е'
'@' -> ' '
'Б' -> 'б'
'B' -> 'b'
'Е' -> 'е'
'Й' -> 'и'
'Н' -> 'н'
'/' -> ' '
'С' -> 'с'
'Х' -> 'х'
'#' -> ' '
'Щ' -> 'щ'
'F' -> 'f'
'-' -> ' '
'б' -> 'б'
'.' -> ' '
'е' -> 'е'
'H' -> 'h'
'й' -> 'и'
'н' -> 'н'
'с' -> 'с'
'!' -> ' '
'J' -> 'j'
'х' -> 'х'
'щ' -> 'щ'
'э' -> 'э'
'а' -> 'а'
'L' -> 'l'
'ё' -> 'е'
'U' -> 'u'
'Y' -> 'y'
'%' -> ' '
'N' -> 'n'
']' -> ' '
'a' -> 'a'
'e' -> 'e'
'P' -> 'p'
'i' -> 'i'
'm' -> 'm'
'q' -> 'q'
')' -> ' '
'1' -> ' '
'u' -> 'u'
'y' -> 'y'
'}' -> ' '
'+' -> ' '

unicode_lookup.py:
# -*- coding: UTF-8 -*-
import unicodedata
import codecs

russian = u"йцукеёнгшщзхъфывапролджэячсмитьбю"

def make():
    """
    Our goal is to prepare lookup tables for the UTF-8 Russian text tokenizer.
    We treat `й' as `и' and `ё' as `е'.
    """
    replacing = {
        u'ё': u'е',
        u'й': u'и',
    }
    # You can add any number of s⃗pêciál symbols here.
    removing = set(u'\u0301 \u20d7 \u0302'.split())
    def extract_modifiers(u):
        u = unicodedata.normalize(u"NFKD", u)
        map(removing.add, u[1:])
    for c in russian:
        replacing[c] = replacing.get(c, c)
        replacing[c.upper()] = replacing.get(c, c)
        extract_modifiers(c)
        extract_modifiers(c.upper())
    for c in map(chr, range(32, 127)):
        if c.isalpha():
            replacing[c] = c.lower()
        elif c not in "\n'":  # I will add them manually.
            replacing[c] = ' '
    with codecs.open('mappings.txt', 'w', 'utf-8') as fp:
        fp.write(u"LINES = %d\n" % len(replacing))
        for item in replacing.iteritems():
            fp.write(u"'%s' -> '%s'\n" % item)
    with codecs.open('blacklist.txt', 'w', 'utf-8') as fp:
        fp.write(u"LINES = %d\n" % len(removing))
        for item in removing:
            fp.write(u"%s\n" % item)
    print u"\n".join([u"'%s' -> '%s'" % i for i in replacing.iteritems()])
    print u"\n".join([u"'%s' = 'a%s'" % (repr(i), i) for i in removing])

make()

PostMortem
There is a bug in the code; at UNIGINE they spotted it instantly :-). ^M (carriage return), which should act as a separator, was treated as a word and counted.
The reason: instead of a locale and its traditional isalpha for deciding "letter / not a letter", the code uses a dictionary that maps every separator to ' ', lowercases everything, normalizes 'Й' -> 'и', strips stress marks and so on (isalpha and tolower in a single pass).
That dictionary is built in Python (unicode_lookup.py), where the line
for c in map(chr, range(32, 127)):
must be changed to
for c in map(chr, range(1, 127)):
Then tabs, ^M and so on are entered into the dictionary and handled correctly (their ASCII codes are below 32).
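To see why the fix works, here is a minimal sketch of the corrected loop (the same logic as in unicode_lookup.py above, shown in isolation): with range(1, 127) the control characters are included in the dictionary and mapped to ' ', so the C++ tokenizer treats them as separators.

    replacing = {}
    for c in map(chr, range(1, 127)):     # was range(32, 127)
        if c.isalpha():
            replacing[c] = c.lower()      # letters are only lowercased
        elif c not in "\n'":              # "\n" and "'" are added manually on the C++ side
            replacing[c] = ' '            # everything else becomes a separator
    assert replacing['\t'] == ' '         # tab is now a separator
    assert replacing['\r'] == ' '         # ^M (carriage return) is now a separator

With the original range(32, 127), '\t' and '\r' never make it into mappings.txt or blacklist.txt, so UTF8Tokenizer::eat() returns them unchanged and the word-assembling automaton glues them onto words.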
Overall, the verdict was that the solution is essentially overengineered.
For comparison, there is Ruslan Abdikeev's version, which is well worth reading:
the new one and the old one.
Discussion and comments are here.
Looking forward to the second problem set ;-)
It works: it ignores stress marks, special characters, breves, accents and so on.
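For illustration only (hypothetical input, not from the original task), here is a rough Python sketch of the normalization the program applies (lowercasing, 'ё' -> 'е', 'й' -> 'и', punctuation turned into spaces) before words are counted:

    table = {u'ё': u'е', u'й': u'и'}
    text = u"Ёжик, ёж и йод."
    words = [u"".join(table.get(ch, ch) for ch in w)
             for w in text.lower().replace(u',', u' ').replace(u'.', u' ').split()]
    print u" ".join(words)   # prints "ежик еж и иод"; each word would then be counted once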