blacklist.txt (generated by unicode_lookup.py):
LINES = 5
́
̈
̂
⃗
̆

// Task from https://docs.google.com/forms/d/e/1FAIpQLSc-T7mmLsI4skRKbm2px3ssBmtv1cbiIp1oBpvYXC_62qb9tw/viewform
// Mirror: https://www.evernote.com/shard/s171/sh/57bba412-db24-4496-8aee-fbc1ef18db88/5162aeea37a68042ba7f162356edb83c
//
//
// Key points:
// * no UTF-8 "code point" can start inside another "code point" (lead and
//   continuation bytes never overlap)
// * we need to support Russian, including the combining breve over `и'
// * it is enough to map `й' -> `и', `Ё' -> `е', etc.
// * according to the test on the problem page, the order is lexicographical
//
//
// Decision:
// * std::map to normalize case, map separators to spaces, and simplify letters
// * std::set to drop accents and selected combining modifiers
// * use a reliable source of knowledge to build these maps (Python + unicodedata)
//
//
// References and links:
// [1] Russian code page [http://www.utf8-chartable.de/unicode-utf8-table.pl?start=1024]
// [2] Russian characters are 2 bytes each: [0xd0, 0xd3] × [0x80, 0xbf].
// [3] https://habrahabr.ru/post/262679/
//
//
// WARNING!
// ========
// I strip all accents; for some languages this is unacceptable. You can fix
// 'UTF8Tokenizer::blacklist' by applying blacklist rejection only if the codepoint
// is Russian/English, which is easy to do (see the comment in 'UTF8Tokenizer::eat()').
// Russian codepoints start with lead bytes 0xd0, .., 0xd3.
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <algorithm>
#include <vector>
#include <string>
#include <map>
#include <set>
#include <iostream>
// ----------------------------------------------------------------------------
// An instance of this class can be extended with a 'replacing dictionary' and a blacklist.
// Usage:
//     UTF8Tokenizer u;
//     u.load_mapping("mappings.txt");
//     u.load_blacklist("blacklist.txt");
//
//     const char* res1 = u.eat(byte);
//     const char* res2 = u.eat(byte);
//     ...
// You feed bytes one by one to 'u.eat()'; it returns 'nullptr' while the token is
// not yet assembled (or was rejected), or a NUL-terminated C string with the UTF-8 codepoint.
// ----------------------------------------------------------------------------
class UTF8Tokenizer {
    std::map<std::string, std::string> simplify;  // Й -> и, ё -> е, ...
    std::set<std::string> blacklist;              // Stress marks, breves, accents.
    int pos;
    int chunk_size;
    char chunk[5];
public:
    UTF8Tokenizer() : pos(-1), chunk_size(0)
    {}
    void load_mapping(const char* filename)
    {
        // Added manually to simplify parsing of the replacement data file.
        simplify["'"] = " ";
        simplify["\""] = " ";
        simplify["\n"] = " ";
        FILE *fp = fopen(filename, "rb");
        assert(fp && "Please run `unicode_lookup.py` to generate the maps.");
        int numlines;
        fscanf(fp, "LINES = %d ", &numlines);
        char from[5], to[5];
        for (int i = 0 ; i < numlines ; ++i) {
            fscanf(fp, "'%4[^']' -> '%4[^']'\n", from, to);
            simplify[std::string(from)] = std::string(to);
        }
        fclose(fp);
    }
    void load_blacklist(const char* filename)
    {
        FILE *fp = fopen(filename, "rb");
        assert(fp && "Please run `unicode_lookup.py` to generate the maps.");
        int numlines;
        fscanf(fp, "LINES = %d\n", &numlines);
        char token[5];
        for (int i = 0 ; i < numlines ; ++i) {
            fscanf(fp, "%4[^\n]\n", token);
            blacklist.insert(std::string(token));
        }
        fclose(fp);
    }
    int utf8size(int byte)
    {
        // TODO: 3-bit lookup table? 8 elements, seems ok.
        if (byte < 128) {
            return 1;
        }
        int count = 0;
        while (byte & 128) {
            byte <<= 1;
            count++;
        }
        return count;
    }
    // Eat bytes one by one and return either a complete UTF-8 codepoint or
    // `nullptr` if the token is not assembled yet. See the class-level comment above.
    const char* eat(int byte)
    {
        if (pos < 0) {
            chunk_size = utf8size(byte);
            // Set or clear a 'Russian or English' flag here and use it below (**)
            pos = 0;
        }
        chunk[pos++] = byte;
        if (pos == chunk_size) {
            chunk[pos] = 0;
            pos = -1;
            // Below you can optionally cancel rejection for non-Russian languages.
            auto drop_pos = blacklist.find(chunk);
            if (drop_pos != blacklist.end())  // (**)
                return nullptr;
            auto simplify_pos = simplify.find(chunk);
            if (simplify_pos != simplify.end()) {
                return simplify_pos->second.c_str();
            }
            return chunk;
        }
        return nullptr;
    }
    ~UTF8Tokenizer()
    {
        assert(pos < 0 && "There is an unfinished token waiting in the buffer.");
    }
};
// ----------------------------------------------------------------------------
// Simple wrapper for a pair "word + count".
// Provides the operators necessary to interface with std::sort().
// Note that counts are sorted in descending order while words are sorted in
// ascending (byte-lexicographic) order:
//
//   +-- [ https://en.wikipedia.org/wiki/UTF-8#Advantages ]
//   | Sorting a set of UTF-8 encoded strings as strings of unsigned bytes yields
//   | the same order as sorting the corresponding Unicode strings
//   | lexicographically by codepoint.
// ----------------------------------------------------------------------------
struct Pair {
    int count;
    std::string word;
    Pair(const char *word) : count(1), word(word)
    {
    }
    // Required by 'std::sort'.
    Pair& operator=(const Pair& other)
    {
        // Protect against invalid self-assignment.
        if (this != &other)
        {
            count = other.count;
            word = other.word;
        }
        // By convention, always return *this.
        return *this;
    }
    // Required by 'std::sort'.
    bool operator<(const Pair& b) const
    {
        // ::count in decreasing, ::word in lexicographically increasing order.
        if (count > b.count)
            return true;
        if (count < b.count)
            return false;
        return strcmp(word.c_str(), b.word.c_str()) < 0;
    }
};
// ----------------------------------------------------------------------------
// Container for incoming words.
// Search is linear; a simple map<word, count> will fix it if necessary.
// ----------------------------------------------------------------------------
class FreqCounter {
    std::vector<Pair> dict;
public:
    void insert(const char *word)
    {
        for (size_t i = 0 ; i < dict.size() ; ++i) {
            if (dict[i].word == word) {
                dict[i].count++;
                return;
            }
        }
        dict.push_back(Pair(word));
    }
    void dump_sorted_into_file(const char* filename)
    {
        FILE *fp = fopen(filename, "wb");
        assert(fp && "Cannot open output file for writing.");
        std::sort(dict.begin(), dict.end());
        for (size_t i = 0 ; i < dict.size() ; ++i)
            fprintf(fp, "%d %s\n", dict[i].count, dict[i].word.c_str());
        fclose(fp);
    }
};
// ----------------------------------------------------------------------------
// Structure:
//     prepare instances of the tokenizer and the counter
//     feed the incoming text char by char to the UTF-8 assembler
//     outgoing UTF-8 codepoints are merged into words using a simple finite
//         automaton
//     dump the result into a file.
// ----------------------------------------------------------------------------
int main(int argc, char **argv)
{
    assert(argc == 3 && "Usage: ./a.out in.txt out.txt");
    FILE *fin = fopen(argv[1], "rb");
    assert(fin && "Cannot open input file for reading.");
    FreqCounter fq;
    UTF8Tokenizer utf8tokenizer;
    utf8tokenizer.load_mapping("mappings.txt");
    utf8tokenizer.load_blacklist("blacklist.txt");
    // Trying out automata-based programming [Шалыто].
    enum {EATING_SPACES, EATING_CHARS};
    int state = EATING_SPACES;
    int c;
    std::string word = "";
    while ((c = fgetc(fin)) != EOF) {
        const char *token = utf8tokenizer.eat(c);
        if (!token) {
            continue;
        }
        switch (state) {
        case EATING_SPACES:
            if (token[0] != ' ') {
                state = EATING_CHARS;
                word = token;
            }
            break;
        case EATING_CHARS:
            if (token[0] == ' ') {
                state = EATING_SPACES;
                fq.insert(word.c_str());
                word = "";
            } else {
                word += token;
            }
            break;
        }
    }
    if (state == EATING_CHARS) {
        // Flush the last word if the input does not end with a separator.
        fq.insert(word.c_str());
    }
    fclose(fin);
    fq.dump_sorted_into_file(argv[2]);
    return 0;
}

mappings.txt (generated by unicode_lookup.py):
LINES = 160
'Э' -> 'э'
'V' -> 'v'
'А' -> 'а'
'Д' -> 'д'
'X' -> 'x'
'И' -> 'и'
'М' -> 'м'
'Р' -> 'р'
'Ф' -> 'ф'
'г' -> 'г'
'Ш' -> 'ш'
',' -> ' '
'\' -> ' '
'0' -> ' '
'n' -> 'n'
'4' -> ' '
'и' -> 'и'
'м' -> 'м'
'р' -> 'р'
'ф' -> 'ф'
'ш' -> 'ш'
'ь' -> 'ь'
'9' -> ' '
'b' -> 'b'
'T' -> 't'
'2' -> ' '
';' -> ' '
'`' -> ' '
'd' -> 'd'
'=' -> ' '
'l' -> 'l'
'p' -> 'p'
'?' -> ' '
'h' -> 'h'
'x' -> 'x'
'|' -> ' '
'A' -> 'a'
'C' -> 'c'
'Г' -> 'г'
'З' -> 'з'
'E' -> 'e'
'Л' -> 'л'
'5' -> ' '
'П' -> 'п'
'У' -> 'у'
'G' -> 'g'
'Ч' -> 'ч'
'Ы' -> 'ы'
'Я' -> 'я'
'I' -> 'i'
'3' -> ' '
'з' -> 'з'
'л' -> 'л'
'K' -> 'k'
'п' -> 'п'
't' -> 't'
'у' -> 'у'
'ч' -> 'ч'
' ' -> ' '
'M' -> 'm'
'ы' -> 'ы'
'я' -> 'я'
'S' -> 's'
'"' -> ' '
'O' -> 'o'
'W' -> 'w'
'[' -> ' '
'_' -> ' '
'$' -> ' '
'Q' -> 'q'
'c' -> 'c'
'z' -> 'z'
'g' -> 'g'
'k' -> 'k'
'&' -> ' '
'o' -> 'o'
'7' -> ' '
's' -> 's'
'w' -> 'w'
'(' -> ' '
'{' -> ' '
'6' -> ' '
'Ъ' -> 'ъ'
'Ь' -> 'ь'
'В' -> 'в'
'Ж' -> 'ж'
'К' -> 'к'
'>' -> ' '
'О' -> 'о'
'Т' -> 'т'
'Ц' -> 'ц'
'*' -> ' '
'Ю' -> 'ю'
'в' -> 'в'
'ж' -> 'ж'
'к' -> 'к'
'д' -> 'д'
'т' -> 'т'
'ц' -> 'ц'
'ъ' -> 'ъ'
'D' -> 'd'
'ю' -> 'ю'
'R' -> 'r'
'8' -> ' '
'Z' -> 'z'
'^' -> ' '
':' -> ' '
'f' -> 'f'
'j' -> 'j'
'<' -> ' '
'r' -> 'r'
'v' -> 'v'
'о' -> 'о'
'~' -> ' '
'Ё' -> 'е'
'@' -> ' '
'Б' -> 'б'
'B' -> 'b'
'Е' -> 'е'
'Й' -> 'и'
'Н' -> 'н'
'/' -> ' '
'С' -> 'с'
'Х' -> 'х'
'#' -> ' '
'Щ' -> 'щ'
'F' -> 'f'
'-' -> ' '
'б' -> 'б'
'.' -> ' '
'е' -> 'е'
'H' -> 'h'
'й' -> 'и'
'н' -> 'н'
'с' -> 'с'
'!' -> ' '
'J' -> 'j'
'х' -> 'х'
'щ' -> 'щ'
'э' -> 'э'
'а' -> 'а'
'L' -> 'l'
'ё' -> 'е'
'U' -> 'u'
'Y' -> 'y'
'%' -> ' '
'N' -> 'n'
']' -> ' '
'a' -> 'a'
'e' -> 'e'
'P' -> 'p'
'i' -> 'i'
'm' -> 'm'
'q' -> 'q'
')' -> ' '
'1' -> ' '
'u' -> 'u'
'y' -> 'y'
'}' -> ' '
'+' -> ' '

unicode_lookup.py:
# -*- coding: UTF-8 -*-
import unicodedata
import codecs

russian = u"йцукеёнгшщзхъфывапролджэячсмитьбю"

def make():
    """
    Our goal is to prepare lookup tables for the UTF-8 Russian text tokenizer.
    We treat `й' as `и' and `ё' as `е'.
    """
    replacing = {
        u'ё': u'е',
        u'й': u'и',
    }
    # You can add any number of s⃗pêciál symbols here.
    removing = set(u'\u0301 \u20d7 \u0302'.split())
    def extract_modifiers(u):
        u = unicodedata.normalize(u"NFKD", u)
        map(removing.add, u[1:])
    for c in russian:
        replacing[c] = replacing.get(c, c)
        replacing[c.upper()] = replacing.get(c, c)
        extract_modifiers(c)
        extract_modifiers(c.upper())
    for c in map(chr, range(32, 127)):
        if c.isalpha():
            replacing[c] = c.lower()
        elif c not in "\n'":  # I will add them manually.
            replacing[c] = ' '
    with codecs.open('mappings.txt', 'w', 'utf-8') as fp:
        fp.write(u"LINES = %d\n" % len(replacing))
        for item in replacing.iteritems():
            fp.write(u"'%s' -> '%s'\n" % item)
    with codecs.open('blacklist.txt', 'w', 'utf-8') as fp:
        fp.write(u"LINES = %d\n" % len(removing))
        for item in removing:
            fp.write(u"%s\n" % item)
    print u"\n".join([u"'%s' -> '%s'" % i for i in replacing.iteritems()])
    print u"\n".join([u"'%s' = 'a%s'" % (repr(i), i) for i in removing])

make()

PostMortem
There is a bug in the code; at UNIGINE they spotted it instantly :-). ^M (carriage return), which should act as a separator, was treated as a word and counted.
The reason: instead of a locale and its traditional isalpha for deciding "letter / not a letter", the code uses a dictionary that maps every separator to ' ', lowercases everything, normalizes 'Й' -> 'и', strips stress marks and so on (isalpha and tolower in a single pass).
That dictionary is built in Python (unicode_lookup.py), where the line
for c in map(chr, range(32, 127)):
must be changed to
for c in map(chr, range(1, 127)):
Then tabs, ^M and so on are entered into the dictionary and handled correctly (their ASCII codes are below 32).
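To see why the fix works, here is a minimal sketch of the corrected loop (the same logic as in unicode_lookup.py above, shown in isolation): with range(1, 127) the control characters are included in the dictionary and mapped to ' ', so the C++ tokenizer treats them as separators.

    replacing = {}
    for c in map(chr, range(1, 127)):     # was range(32, 127)
        if c.isalpha():
            replacing[c] = c.lower()      # letters are only lowercased
        elif c not in "\n'":              # "\n" and "'" are added manually on the C++ side
            replacing[c] = ' '            # everything else becomes a separator
    assert replacing['\t'] == ' '         # tab is now a separator
    assert replacing['\r'] == ' '         # ^M (carriage return) is now a separator

With the original range(32, 127), '\t' and '\r' never make it into mappings.txt or blacklist.txt, so UTF8Tokenizer::eat() returns them unchanged and the word-assembling automaton glues them onto words.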
Overall, the verdict was that the solution is essentially overengineered.
For comparison, there is Ruslan Abdikeev's version, which is well worth reading:
the new one and the old one.
Discussion and comments are here.
Looking forward to the second problem set ;-)
It works: it ignores stress marks, special characters, breves, accents and so on.
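For illustration only (hypothetical input, not from the original task), here is a rough Python sketch of the normalization the program applies (lowercasing, 'ё' -> 'е', 'й' -> 'и', punctuation turned into spaces) before words are counted:

    table = {u'ё': u'е', u'й': u'и'}
    text = u"Ёжик, ёж и йод."
    words = [u"".join(table.get(ch, ch) for ch in w)
             for w in text.lower().replace(u',', u' ').replace(u'.', u' ').split()]
    print u" ".join(words)   # prints "ежик еж и иод"; each word would then be counted once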