Skip to content

Instantly share code, notes, and snippets.

@jcb91
Created November 17, 2017 13:14
Show Gist options
  • Select an option

  • Save jcb91/956abc5611d61098d80d8333caa25486 to your computer and use it in GitHub Desktop.

Select an option

Save jcb91/956abc5611d61098d80d8333caa25486 to your computer and use it in GitHub Desktop.
investigating hunspell eu dictionaries
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"## prep"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T11:52:54.498794Z",
"start_time": "2017-11-16T11:52:54.311405Z"
},
"collapsed": true,
"hidden": true
},
"outputs": [],
"source": [
"import os\n",
"\n",
"try: # py3\n",
" from urllib.request import urlopen\n",
"except ImportError: # py2\n",
" from urllib2 import urlopen\n",
" \n",
"def get_dict_file(url, force_fetch=False, encoding='utf-8'):\n",
"    \"\"\"Return the lines of the file at `url`, fetching it only if needed.\n",
"\n",
"    A copy is cached in the current directory under the basename of `url`\n",
"    and reused on later calls unless `force_fetch` is true. The cache holds\n",
"    the raw bytes, so the cached and freshly-fetched paths decode the same\n",
"    data with the same `encoding` and return identical lines.\n",
"    \"\"\"\n",
"    localpath = os.path.basename(url)\n",
"    if not force_fetch and os.path.exists(localpath):\n",
"        print('loading', os.path.realpath(localpath))\n",
"        with open(localpath, 'rb') as f:\n",
"            return [line.decode(encoding) for line in f]\n",
"    print('fetching', url)\n",
"    raw_lines = []\n",
"    with urlopen(url) as req:\n",
"        for line in req:\n",
"            raw_lines.append(line)\n",
"            # progress: one dot per 100 lines, 80 dots per row\n",
"            if (len(raw_lines) % 100) == 0:\n",
"                print('.', end=('' if len(raw_lines) % 8000 else '\\n'))\n",
"    lines = [line.decode(encoding) for line in raw_lines]\n",
"    print('\\n\\n' + ('-'*80) + '\\n')\n",
"    print(sum([len(line) for line in lines]), 'chars')\n",
"    # save a local copy (bytes, so no newline or encoding translation occurs)\n",
"    with open(localpath, 'wb') as f:\n",
"        f.writelines(raw_lines)\n",
"    return lines"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"## `.aff` file"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T11:53:03.701529Z",
"start_time": "2017-11-16T11:52:57.200996Z"
},
"hidden": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"fetching http://xuxen.eus/static/hunspell/eu_ES.aff\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
".................................................................\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\n",
"3302091 chars\n"
]
}
],
"source": [
"aff_lines = get_dict_file('http://xuxen.eus/static/hunspell/eu_ES.aff')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T12:24:08.953332Z",
"start_time": "2017-11-16T12:24:07.460376Z"
},
"hidden": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"353 rules loaded\n"
]
}
],
"source": [
"# Parse the affix rules line by line: rules[code] maps each rule line (the\n",
"# tuple of its whitespace-separated fields) to the index of the first line\n",
"# on which it was seen.\n",
"rules = {}\n",
"current_rule = None\n",
"for ii, line in enumerate(aff_lines):\n",
"    lineparts = tuple(line.strip().split())\n",
"    # only SFX/PFX lines matter here; this also skips blanks and comments\n",
"    if not lineparts or lineparts[0] not in ('SFX', 'PFX'):\n",
"        continue\n",
"\n",
"    if len(lineparts) < 5:\n",
"        # new rule! (a short line is the header that opens a block)\n",
"        current_rule = lineparts[:2]\n",
"        continue\n",
"\n",
"    rtype, rcode = lineparts[:2]\n",
"    # every rule line must belong to the block opened by the last header;\n",
"    # check this before storing anything for the line\n",
"    if (rtype, rcode) != current_rule:\n",
"        raise ValueError('err: line %d' % ii)\n",
"    rule = rules.setdefault(rcode, {})\n",
"    if lineparts in rule:\n",
"        # report the earlier line number, not the whole rule dict\n",
"        print('duplicate line:', ii, 'already seen on line', rule[lineparts])\n",
"    rule.setdefault(lineparts, ii)\n",
"print(len(rules), 'rules loaded')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T11:53:19.603521Z",
"start_time": "2017-11-16T11:53:17.948323Z"
},
"collapsed": true,
"hidden": true
},
"outputs": [],
"source": [
"# Alternative parse that trusts the entry count given in each block header.\n",
"# rules[code] maps each rule line (tuple of its fields) to the list of line\n",
"# indices on which it occurs. Codes are kept as strings, because the cells\n",
"# further down look rules up by the string codes read from the .dic file.\n",
"rules = {}\n",
"ii = 0\n",
"while ii < len(aff_lines):\n",
"    line = aff_lines[ii].strip()\n",
"    # anything that is not an affix block header is skipped one line at a time\n",
"    if not line.startswith(('SFX', 'PFX')):\n",
"        ii += 1\n",
"        continue\n",
"\n",
"    lineparts = line.split()\n",
"    rule_type = lineparts[0]\n",
"    rule_code = lineparts[1]\n",
"    nents = int(lineparts[3])\n",
"\n",
"    seen = {}\n",
"    for jj in range(ii + 1, ii + 1 + nents):\n",
"        lineparts = tuple(aff_lines[jj].split())\n",
"        # a blank or foreign line inside the block means the count is wrong\n",
"        if not lineparts or lineparts[0] != rule_type:\n",
"            raise ValueError('err: line %d' % jj)\n",
"        seen.setdefault(lineparts, []).append(jj)\n",
"    rules[rule_code] = seen\n",
"    for idxs in [ll for ll in seen.values() if len(ll) > 1]:\n",
"        print('duplicate lines: ' + ','.join(map(str, idxs)))\n",
"    # step over the header and its entries (also correct when nents == 0)\n",
"    ii += 1 + nents"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T12:59:14.728897Z",
"start_time": "2017-11-16T12:59:14.241940Z"
},
"hidden": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"exact-duplicate rules:\n",
"[1] : 18\n",
"[11] : 21, 22\n",
"[23] : 27, 29\n",
"[34] : 35\n",
"[51] : 354, 355\n",
"[53] : 54, 55\n",
"[63] : 67, 69\n",
"[65] : 66, 68\n",
"[76] : 77, 78\n",
"[85] : 86, 87, 88, 89, 91, 92\n",
"[97] : 357\n",
"[104] : 106, 107\n",
"[105] : 108, 109\n",
"[110] : 111\n",
"[117] : 120\n",
"[122] : 128, 129\n",
"[126] : 130, 132\n",
"[150] : 152, 153, 311\n",
"[157] : 159, 163, 164, 165, 166\n",
"[171] : 173\n",
"[172] : 175\n",
"[174] : 176, 177, 181\n",
"[183] : 184\n",
"[193] : 194, 197, 198, 199, 200, 201, 202, 203, 204, 205\n",
"[207] : 208\n",
"[220] : 225\n",
"[221] : 226\n",
"[229] : 230\n",
"[234] : 321\n",
"[237] : 322\n",
"[238] : 244, 323, 328\n",
"[239] : 324\n",
"[240] : 242, 243, 319, 325, 326, 327\n",
"[247] : 329\n",
"[250] : 253, 255, 260, 261, 262, 263\n",
"[254] : 257, 258\n",
"[272] : 274, 277\n",
"[273] : 275, 276\n",
"[282] : 283, 284\n",
"[285] : 286, 287\n",
"[293] : 297, 298, 300\n",
"[331] : 332, 334, 335\n",
"[340] : 342, 343, 344\n",
"[353] : 356\n",
"[1000] : 1001\n"
]
}
],
"source": [
"# Group rule codes by rule content: two rules are exact duplicates when their\n",
"# lines are identical once the leading type and code fields are dropped.\n",
"rdict = {}\n",
"for rcode, rule in sorted(rules.items()):\n",
"    bodies = [' '.join(fields[2:]) for fields in rule]\n",
"    bodies.sort()\n",
"    rdict.setdefault(tuple(bodies), []).append(rcode)\n",
"\n",
"print('exact-duplicate rules:')\n",
"groups = [sorted(int(code) for code in codes) for codes in rdict.values()]\n",
"groups.sort()\n",
"for group in groups:\n",
"    if len(group) < 2:\n",
"        continue\n",
"    # lowest code first (shown as a one-element list), then its duplicates\n",
"    others = ', '.join(str(code) for code in group[1:])\n",
"    print(group[:1], ':', others)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T13:02:30.150607Z",
"start_time": "2017-11-16T13:02:30.120704Z"
},
"hidden": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"353 rules in aff\n",
"252 distinct rules in aff\n",
"=> 28.61% duplicate rules in aff\n",
"=> 23.11% duplicate lines in aff\n"
]
}
],
"source": [
"# Compare the de-duplicated rule set (rdict) with the raw one (rules).\n",
"n_rules, n_distinct = len(rules), len(rdict)\n",
"print('{} rules in aff'.format(n_rules))\n",
"print('{} distinct rules in aff'.format(n_distinct))\n",
"dup_rules = 1 - n_distinct / n_rules\n",
"print('=> {:.2f}% duplicate rules in aff'.format(100 * dup_rules))\n",
"# each rdict key is the tuple of lines making up one distinct rule\n",
"n_lines = sum(len(rule) for rule in rules.values())\n",
"n_distinct_lines = sum(len(rkey) for rkey in rdict)\n",
"dup_lines = 1 - n_distinct_lines / n_lines\n",
"print('=> {:.2f}% duplicate lines in aff'.format(100 * dup_lines))"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"## `.dic` file"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T11:54:13.675880Z",
"start_time": "2017-11-16T11:54:09.091309Z"
},
"hidden": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"fetching http://xuxen.eus/static/hunspell/eu_ES.dic\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"................................................................................\n",
"......\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\n",
"2144523 chars\n"
]
}
],
"source": [
"dic_lines = get_dict_file('http://xuxen.eus/static/hunspell/eu_ES.dic')"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T13:26:11.591863Z",
"start_time": "2017-11-16T13:26:09.471543Z"
},
"collapsed": true,
"hidden": true
},
"outputs": [],
"source": [
"# dic maps (word, code, code, ...) with the codes sorted, to the list of\n",
"# line indices on which that entry occurs.\n",
"dic = {}\n",
"for lineno, raw in enumerate(dic_lines):\n",
"    entry = raw.strip()\n",
"    # line 0 only lists the approx number of entries; also drop blanks/comments\n",
"    if lineno == 0 or entry == '' or entry.startswith('#'):\n",
"        continue\n",
"    fields = entry.split('/')\n",
"    codes = sorted(fields[1].split(',')) if len(fields) > 1 else []\n",
"    key = (fields[0],) + tuple(codes)\n",
"    dic.setdefault(key, []).append(lineno)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T13:26:15.710637Z",
"start_time": "2017-11-16T13:26:15.603245Z"
},
"hidden": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"144690 non-blank lines in dic\n",
"133225 distinct non-blank lines in dic\n",
"=> 7.92% exact-duplicate lines in dic\n"
]
}
],
"source": [
"# Count real entries only: drop the leading count line, blank lines and\n",
"# comments, and compare stripped text so stray whitespace does not hide\n",
"# duplicates.\n",
"dic_entries = [ln.strip() for ln in dic_lines[1:]]\n",
"dic_entries = [ln for ln in dic_entries if ln and not ln.startswith('#')]\n",
"n_entries, n_distinct = len(dic_entries), len(set(dic_entries))\n",
"print('%d non-blank lines in dic' % n_entries)\n",
"print('%d distinct non-blank lines in dic' % n_distinct)\n",
"# an empty file has no duplicates; avoid dividing by zero\n",
"dup_fraction = (1 - float(n_distinct) / n_entries) if n_entries else 0.0\n",
"print('=> %.2f%% exact-duplicate lines in dic' % (100 * dup_fraction))"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"## mismatched codes"
]
},
{
"cell_type": "code",
"execution_count": 258,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T20:08:54.073304Z",
"start_time": "2017-11-16T20:08:53.764438Z"
},
"hidden": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"codes in aff unused by dic:\n",
" 197, 312\n",
"codes in dic missing from aff:\n",
" 314, 315, 316, 317, 359, 52, 9999\n"
]
}
],
"source": [
"# Compare codes as strings (the form they have in the .dic file), whatever\n",
"# key type the parser that filled `rules` used.\n",
"aff_rcodes = {str(rcode) for rcode in rules}\n",
"dic_rcodes = {rcode for dkey in dic for rcode in dkey[1:]}\n",
"\n",
"# sort numerically, so that e.g. 52 is listed before 314 rather than after 359\n",
"print('codes in aff unused by dic:')\n",
"print(' ', ', '.join(sorted(aff_rcodes - dic_rcodes, key=int)))\n",
"print('codes in dic missing from aff:')\n",
"print(' ', ', '.join(sorted(dic_rcodes - aff_rcodes, key=int)))"
]
},
{
"cell_type": "code",
"execution_count": 291,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T20:30:46.772251Z",
"start_time": "2017-11-16T20:30:46.160376Z"
},
"hidden": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 word:\n",
"\t1, 3, 7, 11, 12, 13, 18, 22, 23, 32, 39, 42, 45, 50, 53, 55, 56, 57, 58, 59, 73, 75, 77, 78, 83, 86, 92, 93, 94, 98, 99, 110, 111, 112, 113, 114, 117, 122, 136, 138, 139, 141, 142, 143, 144, 148, 149, 150, 151, 152, 155, 156, 157, 165, 169, 171, 172, 174, 176, 179, 184, 186, 188, 193, 200, 204, 205, 207, 208, 210, 211, 213, 220, 221, 222, 227, 232, 236, 250, 251, 254, 302, 305, 306, 308, 310, 313, 315, 329, 330, 333, 334, 335, 342, 343, 344, 345, 347, 349, 350, 351, 352, 354, 355, 356, 357, 358\n",
"\n",
"2 words:\n",
"\t2, 15, 16, 17, 21, 40, 47, 52, 76, 80, 88, 91, 96, 115, 140, 145, 147, 153, 154, 173, 175, 178, 180, 187, 189, 190, 192, 194, 215, 216, 218, 219, 224, 225, 226, 231, 253, 259, 281, 326, 331, 340, 348\n",
"\n",
"3 words:\n",
"\t19, 20, 33, 44, 62, 66, 67, 84, 85, 109, 119, 120, 121, 125, 170, 177, 182, 203, 206, 242, 249, 264, 272, 304, 309, 311, 322, 338, 346\n",
"\n",
"4 words:\n",
"\t10, 34, 51, 54, 79, 97, 183, 217, 223, 283, 317, 332\n",
"\n",
"5 words:\n",
"\t71, 95, 160, 195, 202, 228, 307\n",
"\n",
"6 words:\n",
"\t214, 248, 339, 341\n",
"\n",
"7 words:\n",
"\t9, 146, 257, 265, 321, 353\n",
"\n",
"8 words:\n",
"\t81, 135, 185, 199, 235, 286, 303, 323\n",
"\n",
"9 words:\n",
"\t8, 41, 90, 252, 316, 359\n",
"\n",
"10 words:\n",
"\t31, 38, 103\n",
"\n",
"11 words:\n",
"\t87, 137, 198\n",
"\n",
"12 words:\n",
"\t270, 320\n",
"\n",
"13 words:\n",
"\t181, 209, 229\n",
"\n",
"14 words:\n",
"\t127\n",
"\n",
"16 words:\n",
"\t201, 319\n",
"\n",
"17 words:\n",
"\t241\n",
"\n",
"18 words:\n",
"\t134, 269, 318\n",
"\n",
"19 words:\n",
"\t129, 191\n",
"\n",
"20 words:\n",
"\t82\n",
"\n",
"21 words:\n",
"\t324\n",
"\n",
"23 words:\n",
"\t36, 294\n",
"\n",
"24 words:\n",
"\t46, 168, 325\n",
"\n",
"25 words:\n",
"\t162\n",
"\n",
"26 words:\n",
"\t48, 280\n",
"\n",
"27 words:\n",
"\t291\n",
"\n",
"34 words:\n",
"\t26, 260\n",
"\n",
"35 words:\n",
"\t196, 314, 327\n",
"\n",
"36 words:\n",
"\t258\n",
"\n",
"38 words:\n",
"\t6\n",
"\n",
"39 words:\n",
"\t72\n",
"\n",
"40 words:\n",
"\t35\n",
"\n",
"41 words:\n",
"\t237, 246\n",
"\n",
"43 words:\n",
"\t4\n",
"\n",
"45 words:\n",
"\t256\n",
"\n",
"49 words:\n",
"\t285\n",
"\n",
"50 words:\n",
"\t261, 298\n",
"\n",
"52 words:\n",
"\t282, 289, 290\n",
"\n",
"53 words:\n",
"\t5\n",
"\n",
"55 words:\n",
"\t288\n",
"\n",
"61 words:\n",
"\t105, 263, 295\n",
"\n",
"63 words:\n",
"\t14, 43\n",
"\n",
"67 words:\n",
"\t292\n",
"\n",
"75 words:\n",
"\t101\n",
"\n",
"77 words:\n",
"\t296\n",
"\n",
"78 words:\n",
"\t64, 108, 278, 328\n",
"\n",
"83 words:\n",
"\t102\n",
"\n",
"92 words:\n",
"\t100, 158\n",
"\n",
"102 words:\n",
"\t37\n",
"\n",
"104 words:\n",
"\t130\n",
"\n",
"115 words:\n",
"\t106\n",
"\n",
"117 words:\n",
"\t262\n",
"\n",
"119 words:\n",
"\t247\n",
"\n",
"132 words:\n",
"\t284\n",
"\n",
"136 words:\n",
"\t299\n",
"\n",
"143 words:\n",
"\t164\n",
"\n",
"146 words:\n",
"\t49\n",
"\n",
"192 words:\n",
"\t275\n",
"\n",
"203 words:\n",
"\t161\n",
"\n",
"248 words:\n",
"\t163\n",
"\n",
"249 words:\n",
"\t27\n",
"\n",
"286 words:\n",
"\t277\n",
"\n",
"296 words:\n",
"\t132\n",
"\n",
"300 words:\n",
"\t24\n",
"\n",
"301 words:\n",
"\t25, 301\n",
"\n",
"303 words:\n",
"\t287\n",
"\n",
"313 words:\n",
"\t107\n",
"\n",
"314 words:\n",
"\t133\n",
"\n",
"317 words:\n",
"\t30\n",
"\n",
"360 words:\n",
"\t70\n",
"\n",
"389 words:\n",
"\t239\n",
"\n",
"393 words:\n",
"\t255\n",
"\n",
"426 words:\n",
"\t9999\n",
"\n",
"435 words:\n",
"\t123\n",
"\n",
"445 words:\n",
"\t167\n",
"\n",
"500 words:\n",
"\t104\n",
"\n",
"523 words:\n",
"\t297\n",
"\n",
"527 words:\n",
"\t268\n",
"\n",
"671 words:\n",
"\t267\n",
"\n",
"684 words:\n",
"\t126\n",
"\n",
"704 words:\n",
"\t266\n",
"\n",
"718 words:\n",
"\t245\n",
"\n",
"732 words:\n",
"\t89\n",
"\n",
"779 words:\n",
"\t61\n",
"\n",
"911 words:\n",
"\t159\n",
"\n",
"929 words:\n",
"\t29\n",
"\n",
"938 words:\n",
"\t230\n",
"\n",
"989 words:\n",
"\t300\n",
"\n",
"997 words:\n",
"\t166\n",
"\n",
"1023 words:\n",
"\t63\n",
"\n",
"1107 words:\n",
"\t69\n",
"\n",
"1273 words:\n",
"\t279\n",
"\n",
"1414 words:\n",
"\t293\n",
"\n",
"1419 words:\n",
"\t128\n",
"\n",
"1494 words:\n",
"\t276\n",
"\n",
"1701 words:\n",
"\t244\n",
"\n",
"1788 words:\n",
"\t65\n",
"\n",
"2279 words:\n",
"\t1003\n",
"\n",
"2350 words:\n",
"\t124\n",
"\n",
"2488 words:\n",
"\t273\n",
"\n",
"2718 words:\n",
"\t233\n",
"\n",
"2963 words:\n",
"\t60\n",
"\n",
"3035 words:\n",
"\t131\n",
"\n",
"3391 words:\n",
"\t274\n",
"\n",
"3536 words:\n",
"\t1002\n",
"\n",
"4026 words:\n",
"\t28\n",
"\n",
"4480 words:\n",
"\t1004\n",
"\n",
"4536 words:\n",
"\t238\n",
"\n",
"6183 words:\n",
"\t240\n",
"\n",
"6887 words:\n",
"\t68\n",
"\n",
"7941 words:\n",
"\t271\n",
"\n",
"8136 words:\n",
"\t1005\n",
"\n",
"8562 words:\n",
"\t1001\n",
"\n",
"9753 words:\n",
"\t1000\n",
"\n",
"12018 words:\n",
"\t0\n",
"\n",
"13477 words:\n",
"\t234\n",
"\n",
"18001 words:\n",
"\t243\n",
"\n"
]
}
],
"source": [
"# _rcodes[code] = number of distinct .dic entries that carry that rule code\n",
"_rcodes = {}\n",
"for dkey in dic:\n",
"    # an entry without any rule code is tallied under the pseudo-code '0'\n",
"    for code in (dkey[1:] or ('0',)):\n",
"        _rcodes[code] = _rcodes.get(code, 0) + 1\n",
"\n",
"# invert the tally: usage count -> codes used exactly that many times\n",
"codes_by_count = {}\n",
"for code, count in _rcodes.items():\n",
"    codes_by_count.setdefault(count, []).append(int(code))\n",
"\n",
"for count in sorted(codes_by_count):\n",
"    plural = '' if count == 1 else 's'\n",
"    listing = ', '.join(str(code) for code in sorted(codes_by_count[count]))\n",
"    print('{} word{}:\\n\\t{}\\n'.format(count, plural, listing))"
]
},
{
"cell_type": "code",
"execution_count": 289,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T20:29:57.641083Z",
"start_time": "2017-11-16T20:29:57.621896Z"
},
"hidden": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12018 .dic entries have no rules\n",
"18001 .dic entries have rule 243\n",
"\n",
"these rules are used for only a single .dic entry:\n",
"1, 3, 7, 11, 12, 13, 18, 22, 23, 32, 39, 42, 45, 50, 53, 55, 56, 57, 58, 59, 73, 75, 77, 78, 83, 86, 92, 93, 94, 98, 99, 110, 111, 112, 113, 114, 117, 122, 136, 138, 139, 141, 142, 143, 144, 148, 149, 150, 151, 152, 155, 156, 157, 165, 169, 171, 172, 174, 176, 179, 184, 186, 188, 193, 200, 204, 205, 207, 208, 210, 211, 213, 220, 221, 222, 227, 232, 236, 250, 251, 254, 302, 305, 306, 308, 310, 313, 315, 329, 330, 333, 334, 335, 342, 343, 344, 345, 347, 349, 350, 351, 352, 354, 355, 356, 357, 358\n"
]
}
],
"source": [
"# '0' is the pseudo-code under which entries without rules were tallied\n",
"print('{} .dic entries have no rules'.format(_rcodes['0']))\n",
"print('{} .dic entries have rule {}'.format(_rcodes['243'], '243'))\n",
"print()\n",
"single_use = sorted(int(rc) for rc, nn in _rcodes.items() if nn == 1)\n",
"single_use_text = ', '.join(str(rc) for rc in single_use)\n",
"print('these rules are used for only a single .dic entry:\\n{}'.format(\n",
"    single_use_text))"
]
},
{
"cell_type": "code",
"execution_count": 301,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T20:35:22.932514Z",
"start_time": "2017-11-16T20:35:12.394894Z"
},
"hidden": true
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEKCAYAAAA4t9PUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAE51JREFUeJzt3X+w5XV93/Hny0VREIkI2dkBzEJna4I2WckWTCXWhESQ\ntICNkrVpslZTmhYzsdYki1p10mFq0sZOk1aSzci4MUTEEcbNQKOwxtBMY3ChC7LgllWXwmZhB43A\nSIa4m3f/OJ+Lh/Xu3Xv3c8+v5fmYuXO+38/3x3nf7zn3+7rf36kqJEk6Us+ZdAGSpNlmkEiSuhgk\nkqQuBokkqYtBIknqYpBIkroYJJKkLgaJJKmLQSJJ6nLMpAvocfLJJ9fq1asnXYYkzZQ77rjj0ao6\nZbnmN9NBsnr1arZt2zbpMiRppiR5YDnn564tSVIXg0SS1MUgkSR1MUgkSV0MEklSF4NEktTFIJEk\ndTFIJEldDBJJUpeZDpIv7XmM1RtvYvXGmyZdiiQ9a810kEiSJs8gkSR1MUgkSV0MEklSF4NEktTF\nIJEkdTFIJEldDBJJUheDRJLUxSCRJHUxSCRJXQwSSVIXg0SS1MUgkSR1MUgkSV0MEklSF4NEktTF\nIJEkdTFIJEldDBJJUheDRJLUxSCRJHUxSCRJXQwSSVIXg0SS1GVkQZLk9CR/muTeJDuS/HJrPynJ\nLUnub68vHprmyiS7kuxMcsGoapMkLZ9RbpHsB/59VZ0FvAq4IslZwEZga1WtAba2ftqw9cDLgQuB\nDydZMcL6JEnLYGRBUlV7q+rO1v0EcB9wKnAJsLmNthm4tHVfAlxXVU9V1deAXcA5o6pPkrQ8xnKM\nJMlq4JXAXwIrq2pvG/QwsLJ1nwo8ODTZQ61NkjTFRh4kSV4IfAp4R1U9PjysqgqoJc7v8iTbkmw7\n8ORjy1ipJOlIjDRIkjyXQYhcW1U3tOZHkqxqw1cB+1r7HuD0oclPa23PUFWbqmpdVa1bcdyJoyte\nkrQoozxrK8BHgPuq6kNDg7YAG1r3BuDTQ+3rkxyb5AxgDXD7qOqTJC2PY0Y471cDPwd8Kcn21vZu\n4IPA9UneBjwAXAZQVTuSXA/cy+CMryuq6sAI65MkLYORBUlV/TmQQww+/xDTXAVcNaqaJEnLzyvb\nJUldDBJJUheDRJLUxSCRJHUxSCRJXQwSSVIXg0SS1MUgkSR1MUgkSV0MEklSF4NEktTFIJEkdTFI\nJEldDBJJUheDRJLUxSCRJHUxSCRJXQwSSVIXg0SS1MUgkSR1MUgkSV0MEklSF4NEktTFIJEkdTFI\nJEldDBJJUheDRJLUxSCRJHUxSCRJXQwSSVIXg0SS1MUgkSR1MUgkSV0MEklSF4NEktRlZEGS5Jok\n+5LcM9T2gSR7kmxvPxcNDbsyya4kO5NcMKq6JEnLa5RbJB8FLpyn/b9W1dr2czNAkrOA9cDL2zQf\nTrJihLVJkpbJyIKkqm4DvrHI0S8Brquqp6rqa8Au4JxR1SZJWj6TOEbyS0nubru+XtzaTgUeHBrn\nodYmSZpy4w6Sq4EzgbXAXuC3ljqDJJcn2ZZk24EnH1vu+iRJSzTWIKmqR6rqQFX9HfD7fGf31R7g\n9KFRT2tt881jU1Wtq6p1K447cbQFS5IOa6xBkmTVUO8bgLkzurYA65Mcm+QMYA1w+zhrkyQdmWNG\nNeMkHwdeC5yc5CHg/cBrk6wFCtgN/GuAqtqR5HrgXmA/cEVVHRhVbZKk5TOyIKmqN8/T/JEFxr8K\nuGpU9UiSRsMr2yVJXQwSSVIXg0SS1OWwQZLkTUlOaN3vTXJDkrNHX5okaRYsZovkP1TVE0nOA36C\nwQHzq0dbliRpViwmSOZOw/0pYFNV3QQ8b3QlSZJmyWKCZE+S3wN+Brg5ybGLnE6S9CywmEC4DPgM\ncEFVfRM4CfiVkVYlSZoZhw2SqnoS2Aec15r2
A/ePsihJ0uxYzFlb7wd+DbiyNT0X+MNRFiVJmh2L\n2bX1BuBi4FsAVfVXwAmjLEqSNDsWEyR/W1XF4EaLJDl+tCVJkmbJYoLk+nbW1vck+VfArQyeJSJJ\n0uHv/ltV/yXJTwKPAy8D3ldVt4y8MknSTFjUbeRbcBgekqTvcsggSfIEg+Miaa9PDwKqql404tok\nSTPgkEFSVZ6ZJUk6rMPu2kry0vnaq+r/LX85kqRZs5hjJDcNdT8fOAPYCbx8JBVJkmbKYs7a+gfD\n/e1ZJP92ZBVJkmbKku/iW1V3AueOoBZJ0gxazDGSdw71Pgc4G/irkVUkSZopizlGMnz21n4Gx0w+\nNZpyJEmzZsEgSbICOKGq3jWmeiRJM2bBYyRVdQB49ZhqkSTNoMXs2tqeZAvwSdqt5AGq6oaRVSVJ\nmhmLCZLnA18HfnyorQCDRJK0qOtI/uU4CpEkzaYlX0ciSdIwg0SS1MUgkSR1OWyQJHnvUPexoy1H\nkjRrDhkkSX4tyY8Abxxq/ovRlyRJmiULnbX1ZeBNwJlJ/lfrf0mSl1XVzrFUJ0maegvt2vom8G5g\nF/Ba4L+19o1J/veI65IkzYiFtkguAN4H/D3gQ8DdwLe8rkSSNOyQWyRV9e6qOh/YDXwMWAGckuTP\nk/zx4Wac5Jok+5LcM9R2UpJbktzfXl88NOzKJLuS7ExyQddvJUkam8Wc/vuZqtpWVZuAh6rqPGAx\nWyUfBS48qG0jsLWq1gBbWz9JzgLWM3h874XAh9udhyVJU+6wQVJVvzrU+5bW9ugiprsN+MZBzZcA\nm1v3ZuDSofbrquqpqvoag+My5xzuPSRJk7ekCxKr6q7O91tZVXtb98PAytZ9KvDg0HgPtbbvkuTy\nJNuSbDvw5GOd5UiSek3syvaqKgZ3EV7qdJuqal1VrVtx3IkjqEyStBTjDpJHkqwCaK/7Wvse4PSh\n8U5rbZKkKTfuINkCbGjdG4BPD7WvT3JskjOANcDtY65NknQEFvNgqyOS5OMMLmQ8OclDwPuBDwLX\nJ3kb8ABwGUBV7UhyPXAvsB+4oj3mV5I05UYWJFX15kMMOv8Q418FXDWqeiRJo+Ft5CVJXQwSSVIX\ng0SS1MUgkSR1MUgkSV0MEklSF4NEktTFIJEkdTFIJEldDBJJUheDRJLUxSCRJHUxSCRJXQwSSVIX\ng0SS1MUgkSR1MUgkSV0MEklSF4NEktTFIJEkdTFIJEldDBJJUheDRJLUxSCRJHUxSCRJXQwSSVIX\ng0SS1MUgkSR1MUgkSV0MEklSF4NEktTFIJEkdTFIJEldDBJJUheDRJLU5ZhJvGmS3cATwAFgf1Wt\nS3IS8AlgNbAbuKyq/noS9UmSFm+SWyQ/VlVrq2pd698IbK2qNcDW1i9JmnLTtGvrEmBz694MXDrB\nWiRJizSpICng1iR3JLm8ta2sqr2t+2Fg5WRKkyQtxUSOkQDnVdWeJN8L3JLky8MDq6qS1HwTtuC5\nHGDFi04ZfaWSpAVNZIukqva0133AjcA5wCNJVgG0132HmHZTVa2rqnUrjjtxXCVLkg5h7EGS5Pgk\nJ8x1A68D7gG2ABvaaBuAT4+7NknS0k1i19ZK4MYkc+//R1X1J0m+CFyf5G3AA8BlE6hNkrREYw+S\nqvoq8EPztH8dOH/c9UiS+kzT6b+SpBlkkEiSuhgkkqQuBokkqYtBIknqYpBIkroYJJKkLgaJJKmL\nQSJJ6mKQSJK6GCSSpC4GiSSpi0EiSepikEiSuhgkkqQuBokkqYtBIknqYpBIkroYJJKkLgaJJKmL\nQSJJ6mKQSJK6HDVBsnrjTazeeNOky5CkZ52jJkgkSZNhkEiSuhgkkqQuBokkqYtBIknqcsykC1hu\nw2du7f7gT02wEkl6dnCLRJLUxSCRJHU5qoPEixQlafSO6iCRJI3eUXewfT7zbZV4IF6SlodbJJKk\nLlMXJEku
TLIzya4kGyddzzh4LEfSLJuqXVtJVgD/A/hJ4CHgi0m2VNW9o37vabj+ZBpqkKSlmqog\nAc4BdlXVVwGSXAdcAix7kCy0BXDwsJ6V+ty85ptH71bIYuY9qto1P/8ZODS/T0evaQuSU4EHh/of\nAs6dUC0LGvcB/CNdQS12unH/kS81RMdd+0L1LfV9jrS+SX4m07Syn4YA6q1hWpftcklVTbqGpyV5\nI3BhVf1C6/854NyqevvQOJcDl7feVwD3jL3QpTsZeHTSRSyCdS4v61w+s1AjzE6dL6uqE5ZrZtO2\nRbIHOH2o/7TW9rSq2gRsAkiyrarWja+8I2Ody8s6l9cs1DkLNcJs1bmc85u2s7a+CKxJckaS5wHr\ngS0TrkmStICp2iKpqv1J3g58BlgBXFNVOyZcliRpAVMVJABVdTNw8yJH3zTKWpaRdS4v61xes1Dn\nLNQIz9I6p+pguyRp9kzbMRJJ0oyZ2SCZllupJDk9yZ8muTfJjiS/3No/kGRPku3t56Khaa5sde9M\ncsEYa92d5Eutnm2t7aQktyS5v72+eJJ1JnnZ0DLbnuTxJO+YhuWZ5Jok+5LcM9S25OWX5Ifb57Ar\nyW8nyRjq/M9Jvpzk7iQ3Jvme1r46yd8MLdffnXCdS/6cJ1TnJ4Zq3J1ke2ufyPJcYD00nu9nVc3c\nD4MD8V8BzgSeB9wFnDWhWlYBZ7fuE4D/C5wFfAB41zzjn9XqPRY4o/0eK8ZU627g5IPafhPY2Lo3\nAr8x6ToP+pwfBr5vGpYn8BrgbOCenuUH3A68CgjwP4HXj6HO1wHHtO7fGKpz9fB4B81nEnUu+XOe\nRJ0HDf8t4H2TXJ4cej00lu/nrG6RPH0rlar6W2DuVipjV1V7q+rO1v0EcB+DK/QP5RLguqp6qqq+\nBuxi8PtMyiXA5ta9Gbh0qH3SdZ4PfKWqHlhgnLHVWVW3Ad+Y5/0XvfySrAJeVFVfqMFf7R8MTTOy\nOqvqs1W1v/V+gcE1Woc0qToXMFXLc077b/0y4OMLzWPUdS6wHhrL93NWg2S+W6kstPIeiySrgVcC\nf9mafqntSrhmaJNykrUXcGuSOzK4QwDAyqra27ofBla27mlYxut55h/otC1PWPryO7V1H9w+Tm9l\n8J/mnDPabpg/S/KjrW2SdS7lc5708vxR4JGqun+obaLL86D10Fi+n7MaJFMnyQuBTwHvqKrHgasZ\n7HpbC+xlsPk7aedV1Vrg9cAVSV4zPLD9BzIVp/FlcEHqxcAnW9M0Ls9nmKbldyhJ3gPsB65tTXuB\nl7bvxTuBP0ryoknVxwx8zgd5M8/8Z2eiy3Oe9dDTRvn9nNUgOeytVMYpyXMZfHjXVtUNAFX1SFUd\nqKq/A36f7+xumVjtVbWnve4Dbmw1PdI2Z+c2v/dNus7m9cCdVfUITOfybJa6/PbwzN1KY6s3yVuA\nfwL8bFup0HZtfL1138FgX/nfn1SdR/A5T3J5HgP8M+ATc22TXJ7zrYcY0/dzVoNkam6l0vaRfgS4\nr6o+NNS+ami0N/Cdm0tuAdYnOTbJGcAaBge3Rl3n8UlOmOtmcPD1nlbPhjbaBuDTk6xzyDP+05u2\n5TlkScuv7WZ4PMmr2nfn54emGZkkFwK/ClxcVU8OtZ+SwXOASHJmq/OrE6xzSZ/zpOpsfgL4clU9\nvStoUsvzUOshxvX9XK6zBsb9A1zE4MyErwDvmWAd5zHYXLwb2N5+LgI+BnyptW8BVg1N855W906W\n+QyTBeo8k8FZGncBO+aWGfASYCtwP3ArcNIk62zvezzwdeDEobaJL08GwbYX+DaDfcdvO5LlB6xj\nsIL8CvDfaRcGj7jOXQz2ic99R3+3jfvT7fuwHbgT+KcTrnPJn/Mk6mztHwV+8aBxJ7I8OfR6aCzf\nT69slyR1mdVdW5KkKWGQSJK6GCSSpC4GiSSpi0EiSepikOioluQ/JfmxJJ
cmufIIpv98knWt++a0\nu+aOQpJ3H2b4SN9fOlIGiY525zK4SeE/Bm7rmVFVXVRV31yWquY3b5Bk4DljeH/piBgkOipl8PyN\nu4F/CPwF8AvA1Uned5jpXpDkuiT3JbkReMHQsN1JTm7dP99uLHhXko/NM5/j200Hb0/yf5Jc0trf\nkuSGJH+SwTMifrO1fxB4QbvZ37UZPNdiZ5I/YHBx2OkHvf+/aPPenuT3kqxoPx9Nck8Gz5P4d8ux\nLKXDmbpntkvLoap+Jcn1DG7x8E7g81X16kVM+m+AJ6vqB5L8IIOrk58hycuB9wL/qKoeTXLSPPN5\nD/C5qnpr2x11e5Jb27C1DO7O+hSwM8nvVNXGJG+vwc3+5u7gugbYUFVfaG1z7/8DwM8Ar66qbyf5\nMPCzDK6oPrWqXtHGczeYxsIg0dHsbAa3hPl+Bs9nWIzXAL8NUFV3t62ag/048MmqerSNN9+zKl4H\nXJzkXa3/+cBLW/fWqnoMIMm9DB7c9eB3z4IH5kLkIOcDPwx8sYXLCxjcjO+PgTOT/A5wE/DZw/yu\n0rIwSHTUSbKWwX2QTgMeBY4bNGc78CNV9TfjKAP46araeVBt5zLYEplzgEP/HX5rgXlvrqrvOnkg\nyQ8BFwC/yOCBS29dYt3SknmMREedqtredhHNPW70c8AFVbV2ESFyG/DPAZK8AvjBecb5HPCmJC9p\n4823a+szDB7QlDbOKxdR+rczuBX44WwF3pjke+feP8n3teMnz6mqTzHY9Xb2IuYldTNIdFRKcgrw\n1zV4rsX3V9W9C4x7cZJfb71XAy9Mch/w68AdB49fVTuAq4A/S3IX8KGDxwH+I/Bc4O4kO1r/4Wxq\n41+70Ejtd3kv8Nm26+0WBs/sPhX4fNvy+kNgyac7S0fCu/9Kkrq4RSJJ6mKQSJK6GCSSpC4GiSSp\ni0EiSepikEiSuhgkkqQuBokkqcv/BxsV9dfUmeJ9AAAAAElFTkSuQmCC\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x7f6bb4fccba8>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Distribution of how many .dic entries use each rule code.\n",
"fig, ax = plt.subplots()\n",
"ax.hist(list(_rcodes.values()), bins='auto')\n",
"ax.set_xlabel('# .dic entries')\n",
"ax.set_ylabel('# rules')\n",
"# the x axis is cut at 2000, so rules used more often than that are not shown\n",
"ax.set_xlim(0, 2000)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## enumerating entries"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
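"Let's find which rules have restricted applications, i.e. contain at least one line whose condition (the last field) is not `.`; the conditions found are listed next to each rule code:"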
]
},
{
"cell_type": "code",
"execution_count": 332,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T20:50:55.526054Z",
"start_time": "2017-11-16T20:50:55.356690Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 5 : tz\n",
" 6 : ts\n",
" 9 : tz\n",
" 10 : ts\n",
" 14 : n\n",
" 20 : n\n",
" 31 : tz\n",
" 32 : tx\n",
" 33 : ts\n",
" 36 : tz\n",
" 37 : ts\n",
" 38 : tz\n",
" 39 : ts\n",
" 40 : a\n",
" 42 : a\n",
" 45 : ts\n",
" 50 : n\n",
" 61 : a\n",
" 62 : k\n",
" 70 : tz\n",
" 71 : tx\n",
" 72 : ts\n",
" 80 : tz\n",
" 82 : a\n",
" 97 : ei\n",
" 98 : ni\n",
"100 : t\n",
"102 : k\n",
"124 : a\n",
"125 : k\n",
"133 : tz\n",
"134 : ts\n",
"138 : t\n",
"143 : na\n",
"144 : a\n",
"160 : k\n",
"168 : tz\n",
"169 : tx\n",
"170 : ts\n",
"178 : tz\n",
"182 : t\n",
"188 : a\n",
"190 : t\n",
"192 : tz\n",
"196 : a\n",
"206 : nak\n",
"213 : tza\n",
"234 : a\n",
"236 : a\n",
"237 : k\n",
"245 : tz\n",
"246 : tx\n",
"247 : ts\n",
"259 : t\n",
"264 : tz\n",
"266 : t\n",
"268 : k\n",
"270 : a\n",
"271 : n\n",
"278 : nan\n",
"279 : n\n",
"280 : an\n",
"281 : a\n",
"289 : a\n",
"294 : k\n",
"299 : tz\n",
"302 : tx\n",
"303 : ts\n",
"304 : ts\n",
"308 : t\n",
"313 : an\n",
"321 : a\n",
"322 : k\n",
"329 : ts\n",
"338 : a\n",
"346 : ei\n",
"347 : rri\n",
"357 : ei\n",
"358 : rri\n"
]
}
],
"source": [
"# For each rule, in numeric order, list the distinct conditions (last field\n",
"# of each rule line) other than the unconditional '.'.\n",
"for rnum in sorted(int(code) for code in rules):\n",
"    rcode = str(rnum)\n",
"    restrictions = {fields[-1] for fields in rules[rcode]} - {'.'}\n",
"    if restrictions:\n",
"        print(rcode.rjust(3), ':', ', '.join(sorted(restrictions)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"a function to determine how many words (and characters!) result from applying a rule to a given starting word:"
]
},
{
"cell_type": "code",
"execution_count": 385,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T00:26:26.298106Z",
"start_time": "2017-11-17T00:26:25.579637Z"
}
},
"outputs": [],
"source": [
"import re\n",
"\n",
"# cache lengths:\n",
"rlengths = {}\n",
"\n",
"def apply_rule(rulecode, nsfx=0, npfx=0, disp=False, word=('ai',)):\n",
"    \"\"\"Count words and chars created by applying a rule to a starting word.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    rulecode : str\n",
"        Code of the affix rule, looked up in the global `rules` dict.\n",
"    nsfx, npfx : int\n",
"        Number of suffixes / prefixes already attached to `word`.\n",
"    disp : bool\n",
"        If true, print a trace of every word form that is generated.\n",
"    word : tuple of str\n",
"        The word so far, as a tuple of its parts (stem plus affixes).\n",
"\n",
"    Returns\n",
"    -------\n",
"    (num, chars) : tuple of int\n",
"        The number of word forms generated (including those produced by\n",
"        continuation rules) and the total number of characters those forms\n",
"        add relative to `word`.\n",
"\n",
"    Notes\n",
"    -----\n",
"    Results are memoised in `rlengths` under (rulecode, nsfx, npfx) only, so\n",
"    a cached result is reused for any later `word`, even though conditions\n",
"    are matched against the word that was passed in on the first call.\n",
"    \"\"\"\n",
"    rulehash = (rulecode, nsfx, npfx)\n",
"    if rulehash in rlengths:\n",
"        return rlengths[rulehash]\n",
"\n",
"    rkeys = sorted(rules.get(rulecode, {}))\n",
"    if not rkeys:\n",
"        # unknown (or empty) rule: generates nothing\n",
"        rlengths[rulehash] = 0, 0\n",
"        if disp:\n",
"            print('rlengths[%s] -> %s' % (rulehash, rlengths[rulehash]))\n",
"        return rlengths[rulehash]\n",
"\n",
"    # can't apply rule if we would now have 2 pre/suffixes\n",
"    if nsfx > 1 or npfx > 1:\n",
"        rlengths[rulehash] = 0, 0\n",
"        if disp:\n",
"            print('. rlengths[{}] -> {}'.format(rulehash, rlengths[rulehash]))\n",
"        return rlengths[rulehash]\n",
"    if disp:\n",
"        print('apply_rule{}'.format(rulehash))\n",
"\n",
"    num, chars = 0, 0\n",
"    is_sfx = rkeys[0][0] == 'SFX'\n",
"    is_pfx = not is_sfx\n",
"    # only the parts attached so far take part in matching and measuring\n",
"    base = tuple(word[:npfx + nsfx + 1])\n",
"    base_text = ''.join(base)\n",
"    for variant in rkeys:\n",
"        condition = variant[4]\n",
"        if condition != '.':\n",
"            # a suffix condition constrains the end of the word,\n",
"            # a prefix condition constrains its start\n",
"            if is_sfx:\n",
"                matched = re.search(condition + '$', base_text)\n",
"            else:\n",
"                matched = re.match(condition, base_text)\n",
"            if not matched:\n",
"                continue\n",
"        # '0' means \"nothing\" in both the strip field and the affix field\n",
"        to_remove = '' if variant[2] == '0' else variant[2]\n",
"        parts = variant[3].split('/')\n",
"        affix = '' if parts[0] == '0' else parts[0]\n",
"        new_word = list(base)\n",
"        if is_sfx:\n",
"            if to_remove:\n",
"                new_word[-1] = new_word[-1][:-len(to_remove)]\n",
"            new_word = tuple(new_word + [affix])\n",
"        else:\n",
"            if to_remove:\n",
"                new_word[0] = new_word[0][len(to_remove):]\n",
"            new_word = tuple([affix] + new_word)\n",
"        if disp:\n",
"            print('{} {} {}'.format(nsfx + is_sfx, npfx + is_pfx, '-'.join(new_word)))\n",
"        num += 1  # add just this affix\n",
"        l_affix = len(''.join(new_word)) - len(base_text)\n",
"        chars += l_affix  # add chars for the affix\n",
"        if len(parts) > 1:\n",
"            # also add compound affixes (continuation codes after the '/')\n",
"            for subrulecode in map(str, sorted(map(int, parts[1].split(',')))):\n",
"                to_add = apply_rule(subrulecode, nsfx + is_sfx, npfx + is_pfx,\n",
"                                    disp=disp, word=new_word)\n",
"                if to_add[0] and disp:\n",
"                    print('/{} -> {}'.format(subrulecode, to_add))\n",
"                num += to_add[0]\n",
"                # add chars for new affixes, plus the current affix for each\n",
"                chars += (to_add[0] * l_affix) + to_add[1]\n",
"    rlengths[rulehash] = num, chars\n",
"    if disp:\n",
"        print('. . rlengths[{}] -> {}'.format(rulehash, rlengths[rulehash]))\n",
"    return num, chars"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T12:46:24.416407Z",
"start_time": "2017-11-17T12:46:24.405797Z"
}
},
"source": [
"How many `.dic` entries use the last rule (358)?"
]
},
{
"cell_type": "code",
"execution_count": 387,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T00:27:51.454376Z",
"start_time": "2017-11-17T00:27:51.284350Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[('berorri', '358')]"
]
},
"execution_count": 387,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(dk for dk in dic.keys() if '358' in dk[1:])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Only one. Weird. Anyway, let's apply it, and see whether we've got the removal working correctly:"
]
},
{
"cell_type": "code",
"execution_count": 271,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T20:12:01.388411Z",
"start_time": "2017-11-16T20:12:00.876330Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 0 berorri\n",
"apply_rule('358', 0, 0)\n",
"1 0 bero-rregatik\n",
"1 0 bero-rrek\n",
"1 0 bero-rrekiko\n",
"apply_rule('243', 1, 0)\n",
"2 0 bero-rrekiko-a\n",
"2 0 bero-rrekiko-agan\n",
"2 0 bero-rrekiko-agana\n",
"2 0 bero-rrekiko-aganaino\n",
"2 0 bero-rrekiko-aganantz\n",
"2 0 bero-rrekiko-agandik\n",
"2 0 bero-rrekiko-agatik\n",
"2 0 bero-rrekiko-ago\n",
". rlengths[('243', 2, 0)] -> (0, 0)\n",
"2 0 bero-rrekiko-agoa\n",
"2 0 bero-rrekiko-agoak\n",
"2 0 bero-rrekiko-agoan\n",
"2 0 bero-rrekiko-agoarekin\n",
"2 0 bero-rrekiko-agoaren\n",
". rlengths[('238', 2, 0)] -> (0, 0)\n",
"2 0 bero-rrekiko-agoarendako\n",
"2 0 bero-rrekiko-agoarentzat\n",
"2 0 bero-rrekiko-agoari\n",
"2 0 bero-rrekiko-agoaz\n",
"2 0 bero-rrekiko-agoei\n",
"2 0 bero-rrekiko-agoek\n",
"2 0 bero-rrekiko-agoekin\n",
"2 0 bero-rrekiko-agoen\n",
"2 0 bero-rrekiko-agoendako\n",
"2 0 bero-rrekiko-agoentzat\n",
"2 0 bero-rrekiko-agoez\n",
"2 0 bero-rrekiko-agogatik\n",
"2 0 bero-rrekiko-agok\n",
"2 0 bero-rrekiko-agoko\n",
"2 0 bero-rrekiko-agooi\n",
"2 0 bero-rrekiko-agook\n",
"2 0 bero-rrekiko-agookin\n",
"2 0 bero-rrekiko-agoon\n",
"2 0 bero-rrekiko-agoontzat\n",
"2 0 bero-rrekiko-agootaz\n",
"2 0 bero-rrekiko-agooz\n",
"2 0 bero-rrekiko-agora\n",
"2 0 bero-rrekiko-agoraino\n",
"2 0 bero-rrekiko-agorantz\n",
"2 0 bero-rrekiko-agorat\n",
"2 0 bero-rrekiko-agorekin\n",
"2 0 bero-rrekiko-agoren\n",
"2 0 bero-rrekiko-agorendako\n",
"2 0 bero-rrekiko-agorentzat\n",
"2 0 bero-rrekiko-agori\n",
"2 0 bero-rrekiko-agorik\n",
"2 0 bero-rrekiko-agotik\n",
"2 0 bero-rrekiko-agotzat\n",
"2 0 bero-rrekiko-agoz\n",
"2 0 bero-rrekiko-ak\n",
"2 0 bero-rrekiko-an\n",
"2 0 bero-rrekiko-arekiko\n",
"2 0 bero-rrekiko-arekin\n",
"2 0 bero-rrekiko-aren\n",
"2 0 bero-rrekiko-arena\n",
"2 0 bero-rrekiko-arenak\n",
"2 0 bero-rrekiko-arenarekin\n",
"2 0 bero-rrekiko-arenarendako\n",
"2 0 bero-rrekiko-arenarentzat\n",
"2 0 bero-rrekiko-arenari\n",
"2 0 bero-rrekiko-arenaz\n",
"2 0 bero-rrekiko-arendako\n",
"2 0 bero-rrekiko-arenean\n",
"2 0 bero-rrekiko-arenei\n",
"2 0 bero-rrekiko-arenek\n",
"2 0 bero-rrekiko-arenekin\n",
"2 0 bero-rrekiko-arenendako\n",
"2 0 bero-rrekiko-arenentzat\n",
"2 0 bero-rrekiko-arenera\n",
"2 0 bero-rrekiko-areneraino\n",
"2 0 bero-rrekiko-arenerantz\n",
"2 0 bero-rrekiko-arenerat\n",
"2 0 bero-rrekiko-arenetan\n",
"2 0 bero-rrekiko-arenetara\n",
"2 0 bero-rrekiko-arenetaraino\n",
"2 0 bero-rrekiko-arenetarantz\n",
"2 0 bero-rrekiko-arenetarat\n",
"2 0 bero-rrekiko-arenetarik\n",
"2 0 bero-rrekiko-arenetatik\n",
"2 0 bero-rrekiko-arenetik\n",
"2 0 bero-rrekiko-arenez\n",
"2 0 bero-rrekiko-arengan\n",
"2 0 bero-rrekiko-arengana\n",
"2 0 bero-rrekiko-arenganaino\n",
"2 0 bero-rrekiko-arenganantz\n",
"2 0 bero-rrekiko-arengandik\n",
"2 0 bero-rrekiko-arengatik\n",
"2 0 bero-rrekiko-areni\n",
"2 0 bero-rrekiko-arenik\n",
"2 0 bero-rrekiko-arenoi\n",
"2 0 bero-rrekiko-arenok\n",
"2 0 bero-rrekiko-arenokin\n",
"2 0 bero-rrekiko-arenontzat\n",
"2 0 bero-rrekiko-arenotaz\n",
"2 0 bero-rrekiko-arenoz\n",
"2 0 bero-rrekiko-arentzako\n",
"2 0 bero-rrekiko-arentzat\n",
"2 0 bero-rrekiko-ari\n",
"2 0 bero-rrekiko-az\n",
"2 0 bero-rrekiko-ei\n",
"2 0 bero-rrekiko-ek\n",
"2 0 bero-rrekiko-ekiko\n",
"2 0 bero-rrekiko-ekin\n",
"2 0 bero-rrekiko-en\n",
"2 0 bero-rrekiko-ena\n",
"2 0 bero-rrekiko-enak\n",
"2 0 bero-rrekiko-enarekin\n",
"2 0 bero-rrekiko-enarendako\n",
"2 0 bero-rrekiko-enarentzat\n",
"2 0 bero-rrekiko-enari\n",
"2 0 bero-rrekiko-enaz\n",
"2 0 bero-rrekiko-endako\n",
"2 0 bero-rrekiko-enean\n",
"2 0 bero-rrekiko-enei\n",
"2 0 bero-rrekiko-enek\n",
"2 0 bero-rrekiko-enekin\n",
"2 0 bero-rrekiko-enendako\n",
"2 0 bero-rrekiko-enentzat\n",
"2 0 bero-rrekiko-enera\n",
"2 0 bero-rrekiko-eneraino\n",
"2 0 bero-rrekiko-enerantz\n",
"2 0 bero-rrekiko-enerat\n",
"2 0 bero-rrekiko-enetan\n",
"2 0 bero-rrekiko-enetara\n",
"2 0 bero-rrekiko-enetaraino\n",
"2 0 bero-rrekiko-enetarantz\n",
"2 0 bero-rrekiko-enetarat\n",
"2 0 bero-rrekiko-enetarik\n",
"2 0 bero-rrekiko-enetatik\n",
"2 0 bero-rrekiko-enetik\n",
"2 0 bero-rrekiko-enez\n",
"2 0 bero-rrekiko-engan\n",
"2 0 bero-rrekiko-engana\n",
"2 0 bero-rrekiko-enganaino\n",
"2 0 bero-rrekiko-enganantz\n",
"2 0 bero-rrekiko-engandik\n",
"2 0 bero-rrekiko-engatik\n",
"2 0 bero-rrekiko-eni\n",
"2 0 bero-rrekiko-enik\n",
"2 0 bero-rrekiko-enoi\n",
"2 0 bero-rrekiko-enok\n",
"2 0 bero-rrekiko-enokin\n",
"2 0 bero-rrekiko-enontzat\n",
"2 0 bero-rrekiko-enotaz\n",
"2 0 bero-rrekiko-enoz\n",
"2 0 bero-rrekiko-entzako\n",
"2 0 bero-rrekiko-entzat\n",
"2 0 bero-rrekiko-etako\n",
"2 0 bero-rrekiko-etakoa\n",
"2 0 bero-rrekiko-etakoago\n",
"2 0 bero-rrekiko-etakoak\n",
"2 0 bero-rrekiko-etakoan\n",
"2 0 bero-rrekiko-etakoarekin\n",
"2 0 bero-rrekiko-etakoarendako\n",
"2 0 bero-rrekiko-etakoarentzat\n",
"2 0 bero-rrekiko-etakoari\n",
"2 0 bero-rrekiko-etakoaz\n",
"2 0 bero-rrekiko-etakoegi\n",
"2 0 bero-rrekiko-etakoei\n",
"2 0 bero-rrekiko-etakoek\n",
"2 0 bero-rrekiko-etakoekin\n",
"2 0 bero-rrekiko-etakoendako\n",
"2 0 bero-rrekiko-etakoentzat\n",
"2 0 bero-rrekiko-etakoetan\n",
"2 0 bero-rrekiko-etakoetara\n",
"2 0 bero-rrekiko-etakoetaraino\n",
"2 0 bero-rrekiko-etakoetarantz\n",
"2 0 bero-rrekiko-etakoetarat\n",
"2 0 bero-rrekiko-etakoetarik\n",
"2 0 bero-rrekiko-etakoetatik\n",
"2 0 bero-rrekiko-etakoez\n",
"2 0 bero-rrekiko-etakogatik\n",
"2 0 bero-rrekiko-etakok\n",
"2 0 bero-rrekiko-etakooi\n",
"2 0 bero-rrekiko-etakook\n",
"2 0 bero-rrekiko-etakookin\n",
"2 0 bero-rrekiko-etakoontzat\n",
"2 0 bero-rrekiko-etakootaz\n",
"2 0 bero-rrekiko-etakooz\n",
"2 0 bero-rrekiko-etakora\n",
"2 0 bero-rrekiko-etakoraino\n",
"2 0 bero-rrekiko-etakorantz\n",
"2 0 bero-rrekiko-etakorat\n",
"2 0 bero-rrekiko-etakorekin\n",
"2 0 bero-rrekiko-etakorendako\n",
"2 0 bero-rrekiko-etakorentzat\n",
"2 0 bero-rrekiko-etakori\n",
"2 0 bero-rrekiko-etakorik\n",
"2 0 bero-rrekiko-etakotan\n",
"2 0 bero-rrekiko-etakotara\n",
"2 0 bero-rrekiko-etakotaraino\n",
"2 0 bero-rrekiko-etakotarantz\n",
"2 0 bero-rrekiko-etakotarat\n",
"2 0 bero-rrekiko-etakotarik\n",
"2 0 bero-rrekiko-etakotatik\n",
"2 0 bero-rrekiko-etakotik\n",
"2 0 bero-rrekiko-etakotzat\n",
"2 0 bero-rrekiko-etakoz\n",
"2 0 bero-rrekiko-etan\n",
"2 0 bero-rrekiko-etara\n",
"2 0 bero-rrekiko-etarago\n",
"2 0 bero-rrekiko-etaraino\n",
"2 0 bero-rrekiko-etarainoko\n",
"2 0 bero-rrekiko-etarako\n",
"2 0 bero-rrekiko-etarantz\n",
"2 0 bero-rrekiko-etarantzago\n",
"2 0 bero-rrekiko-etarantzegi\n",
"2 0 bero-rrekiko-etaranzko\n",
"2 0 bero-rrekiko-etarat\n",
"2 0 bero-rrekiko-etarik\n",
"2 0 bero-rrekiko-etariko\n",
"2 0 bero-rrekiko-etatik\n",
"2 0 bero-rrekiko-etatiko\n",
"2 0 bero-rrekiko-ez\n",
"2 0 bero-rrekiko-gatik\n",
"2 0 bero-rrekiko-gatiko\n",
"2 0 bero-rrekiko-k\n",
"2 0 bero-rrekiko-ko\n",
"2 0 bero-rrekiko-koa\n",
"2 0 bero-rrekiko-koago\n",
"2 0 bero-rrekiko-koak\n",
"2 0 bero-rrekiko-koan\n",
"2 0 bero-rrekiko-koarekin\n",
"2 0 bero-rrekiko-koarendako\n",
"2 0 bero-rrekiko-koarentzat\n",
"2 0 bero-rrekiko-koari\n",
"2 0 bero-rrekiko-koaz\n",
"2 0 bero-rrekiko-koegi\n",
"2 0 bero-rrekiko-koei\n",
"2 0 bero-rrekiko-koek\n",
"2 0 bero-rrekiko-koekin\n",
"2 0 bero-rrekiko-koendako\n",
"2 0 bero-rrekiko-koentzat\n",
"2 0 bero-rrekiko-koetan\n",
"2 0 bero-rrekiko-koetara\n",
"2 0 bero-rrekiko-koetaraino\n",
"2 0 bero-rrekiko-koetarantz\n",
"2 0 bero-rrekiko-koetarat\n",
"2 0 bero-rrekiko-koetarik\n",
"2 0 bero-rrekiko-koetatik\n",
"2 0 bero-rrekiko-koez\n",
"2 0 bero-rrekiko-kogatik\n",
"2 0 bero-rrekiko-kok\n",
"2 0 bero-rrekiko-kooi\n",
"2 0 bero-rrekiko-kook\n",
"2 0 bero-rrekiko-kookin\n",
"2 0 bero-rrekiko-koontzat\n",
"2 0 bero-rrekiko-kootaz\n",
"2 0 bero-rrekiko-kooz\n",
"2 0 bero-rrekiko-kora\n",
"2 0 bero-rrekiko-koraino\n",
"2 0 bero-rrekiko-korantz\n",
"2 0 bero-rrekiko-korat\n",
"2 0 bero-rrekiko-korekin\n",
"2 0 bero-rrekiko-korendako\n",
"2 0 bero-rrekiko-korentzat\n",
"2 0 bero-rrekiko-kori\n",
"2 0 bero-rrekiko-korik\n",
"2 0 bero-rrekiko-kotan\n",
"2 0 bero-rrekiko-kotara\n",
"2 0 bero-rrekiko-kotaraino\n",
"2 0 bero-rrekiko-kotarantz\n",
"2 0 bero-rrekiko-kotarat\n",
"2 0 bero-rrekiko-kotarik\n",
"2 0 bero-rrekiko-kotatik\n",
"2 0 bero-rrekiko-kotik\n",
"2 0 bero-rrekiko-kotzat\n",
"2 0 bero-rrekiko-koz\n",
"2 0 bero-rrekiko-oi\n",
"2 0 bero-rrekiko-ok\n",
"2 0 bero-rrekiko-okiko\n",
"2 0 bero-rrekiko-okin\n",
"2 0 bero-rrekiko-on\n",
"2 0 bero-rrekiko-ona\n",
"2 0 bero-rrekiko-onak\n",
"2 0 bero-rrekiko-onarekin\n",
"2 0 bero-rrekiko-onarendako\n",
"2 0 bero-rrekiko-onarentzat\n",
"2 0 bero-rrekiko-onari\n",
"2 0 bero-rrekiko-onaz\n",
"2 0 bero-rrekiko-onean\n",
"2 0 bero-rrekiko-onei\n",
"2 0 bero-rrekiko-onek\n",
"2 0 bero-rrekiko-onekin\n",
"2 0 bero-rrekiko-onendako\n",
"2 0 bero-rrekiko-onentzat\n",
"2 0 bero-rrekiko-onera\n",
"2 0 bero-rrekiko-oneraino\n",
"2 0 bero-rrekiko-onerantz\n",
"2 0 bero-rrekiko-onerat\n",
"2 0 bero-rrekiko-onetik\n",
"2 0 bero-rrekiko-onez\n",
"2 0 bero-rrekiko-ongan\n",
"2 0 bero-rrekiko-ongana\n",
"2 0 bero-rrekiko-onganaino\n",
"2 0 bero-rrekiko-onganantz\n",
"2 0 bero-rrekiko-ongandik\n",
"2 0 bero-rrekiko-ongatik\n",
"2 0 bero-rrekiko-oni\n",
"2 0 bero-rrekiko-onik\n",
"2 0 bero-rrekiko-onoi\n",
"2 0 bero-rrekiko-onok\n",
"2 0 bero-rrekiko-onokin\n",
"2 0 bero-rrekiko-onontzat\n",
"2 0 bero-rrekiko-onotaz\n",
"2 0 bero-rrekiko-onoz\n",
"2 0 bero-rrekiko-ontzako\n",
"2 0 bero-rrekiko-ontzat\n",
"2 0 bero-rrekiko-otako\n",
"2 0 bero-rrekiko-otan\n",
"2 0 bero-rrekiko-otara\n",
"2 0 bero-rrekiko-otaraino\n",
"2 0 bero-rrekiko-otarantz\n",
"2 0 bero-rrekiko-otarat\n",
"2 0 bero-rrekiko-otarik\n",
"2 0 bero-rrekiko-otatik\n",
"2 0 bero-rrekiko-otaz\n",
"2 0 bero-rrekiko-oz\n",
"2 0 bero-rrekiko-ra\n",
"2 0 bero-rrekiko-rago\n",
"2 0 bero-rrekiko-raino\n",
"2 0 bero-rrekiko-rainoko\n",
"2 0 bero-rrekiko-rako\n",
"2 0 bero-rrekiko-rantz\n",
"2 0 bero-rrekiko-rantzago\n",
"2 0 bero-rrekiko-rantzegi\n",
"2 0 bero-rrekiko-ranzko\n",
"2 0 bero-rrekiko-rat\n",
"2 0 bero-rrekiko-regatik\n",
"2 0 bero-rrekiko-rekiko\n",
"2 0 bero-rrekiko-rekin\n",
"2 0 bero-rrekiko-ren\n",
"2 0 bero-rrekiko-rena\n",
"2 0 bero-rrekiko-renak\n",
"2 0 bero-rrekiko-renarekin\n",
"2 0 bero-rrekiko-renarendako\n",
"2 0 bero-rrekiko-renarentzat\n",
"2 0 bero-rrekiko-renari\n",
"2 0 bero-rrekiko-renaz\n",
"2 0 bero-rrekiko-rendako\n",
"2 0 bero-rrekiko-renean\n",
"2 0 bero-rrekiko-renei\n",
"2 0 bero-rrekiko-renek\n",
"2 0 bero-rrekiko-renekin\n",
"2 0 bero-rrekiko-renendako\n",
"2 0 bero-rrekiko-renentzat\n",
"2 0 bero-rrekiko-renera\n",
"2 0 bero-rrekiko-reneraino\n",
"2 0 bero-rrekiko-renerantz\n",
"2 0 bero-rrekiko-renerat\n",
"2 0 bero-rrekiko-renetan\n",
"2 0 bero-rrekiko-renetara\n",
"2 0 bero-rrekiko-renetaraino\n",
"2 0 bero-rrekiko-renetarantz\n",
"2 0 bero-rrekiko-renetarat\n",
"2 0 bero-rrekiko-renetarik\n",
"2 0 bero-rrekiko-renetatik\n",
"2 0 bero-rrekiko-renetik\n",
"2 0 bero-rrekiko-renez\n",
"2 0 bero-rrekiko-rengan\n",
"2 0 bero-rrekiko-rengana\n",
"2 0 bero-rrekiko-renganaino\n",
"2 0 bero-rrekiko-renganantz\n",
"2 0 bero-rrekiko-rengandik\n",
"2 0 bero-rrekiko-rengatik\n",
"2 0 bero-rrekiko-reni\n",
"2 0 bero-rrekiko-renik\n",
"2 0 bero-rrekiko-renoi\n",
"2 0 bero-rrekiko-renok\n",
"2 0 bero-rrekiko-renokin\n",
"2 0 bero-rrekiko-renontzat\n",
"2 0 bero-rrekiko-renotaz\n",
"2 0 bero-rrekiko-renoz\n",
"2 0 bero-rrekiko-rentzako\n",
"2 0 bero-rrekiko-rentzat\n",
"2 0 bero-rrekiko-ri\n",
"2 0 bero-rrekiko-rik\n",
"2 0 bero-rrekiko-tako\n",
"2 0 bero-rrekiko-takoa\n",
"2 0 bero-rrekiko-takoago\n",
"2 0 bero-rrekiko-takoak\n",
"2 0 bero-rrekiko-takoan\n",
"2 0 bero-rrekiko-takoarekin\n",
"2 0 bero-rrekiko-takoarendako\n",
"2 0 bero-rrekiko-takoarentzat\n",
"2 0 bero-rrekiko-takoari\n",
"2 0 bero-rrekiko-takoaz\n",
"2 0 bero-rrekiko-takoegi\n",
"2 0 bero-rrekiko-takoei\n",
"2 0 bero-rrekiko-takoek\n",
"2 0 bero-rrekiko-takoekin\n",
"2 0 bero-rrekiko-takoendako\n",
"2 0 bero-rrekiko-takoentzat\n",
"2 0 bero-rrekiko-takoetan\n",
"2 0 bero-rrekiko-takoetara\n",
"2 0 bero-rrekiko-takoetaraino\n",
"2 0 bero-rrekiko-takoetarantz\n",
"2 0 bero-rrekiko-takoetarat\n",
"2 0 bero-rrekiko-takoetarik\n",
"2 0 bero-rrekiko-takoetatik\n",
"2 0 bero-rrekiko-takoez\n",
"2 0 bero-rrekiko-takogatik\n",
"2 0 bero-rrekiko-takok\n",
"2 0 bero-rrekiko-takooi\n",
"2 0 bero-rrekiko-takook\n",
"2 0 bero-rrekiko-takookin\n",
"2 0 bero-rrekiko-takoontzat\n",
"2 0 bero-rrekiko-takootaz\n",
"2 0 bero-rrekiko-takooz\n",
"2 0 bero-rrekiko-takora\n",
"2 0 bero-rrekiko-takoraino\n",
"2 0 bero-rrekiko-takorantz\n",
"2 0 bero-rrekiko-takorat\n",
"2 0 bero-rrekiko-takorekin\n",
"2 0 bero-rrekiko-takorendako\n",
"2 0 bero-rrekiko-takorentzat\n",
"2 0 bero-rrekiko-takori\n",
"2 0 bero-rrekiko-takorik\n",
"2 0 bero-rrekiko-takotan\n",
"2 0 bero-rrekiko-takotara\n",
"2 0 bero-rrekiko-takotaraino\n",
"2 0 bero-rrekiko-takotarantz\n",
"2 0 bero-rrekiko-takotarat\n",
"2 0 bero-rrekiko-takotarik\n",
"2 0 bero-rrekiko-takotatik\n",
"2 0 bero-rrekiko-takotik\n",
"2 0 bero-rrekiko-takotzat\n",
"2 0 bero-rrekiko-takoz\n",
"2 0 bero-rrekiko-tan\n",
"2 0 bero-rrekiko-tara\n",
"2 0 bero-rrekiko-tarago\n",
"2 0 bero-rrekiko-taraino\n",
"2 0 bero-rrekiko-tarainoko\n",
"2 0 bero-rrekiko-tarako\n",
"2 0 bero-rrekiko-tarantz\n",
"2 0 bero-rrekiko-tarantzago\n",
"2 0 bero-rrekiko-tarantzegi\n",
"2 0 bero-rrekiko-taranzko\n",
"2 0 bero-rrekiko-tarat\n",
"2 0 bero-rrekiko-tarik\n",
"2 0 bero-rrekiko-tariko\n",
"2 0 bero-rrekiko-tatik\n",
"2 0 bero-rrekiko-tatiko\n",
"2 0 bero-rrekiko-tik\n",
"2 0 bero-rrekiko-tiko\n",
"2 0 bero-rrekiko-txo\n",
"2 0 bero-rrekiko-txoa\n",
"2 0 bero-rrekiko-txoak\n",
"2 0 bero-rrekiko-txoan\n",
"2 0 bero-rrekiko-txoarekin\n",
"2 0 bero-rrekiko-txoaren\n",
"2 0 bero-rrekiko-txoarendako\n",
"2 0 bero-rrekiko-txoarentzat\n",
"2 0 bero-rrekiko-txoari\n",
"2 0 bero-rrekiko-txoaz\n",
"2 0 bero-rrekiko-txoei\n",
"2 0 bero-rrekiko-txoek\n",
"2 0 bero-rrekiko-txoekin\n",
"2 0 bero-rrekiko-txoen\n",
"2 0 bero-rrekiko-txoendako\n",
"2 0 bero-rrekiko-txoentzat\n",
"2 0 bero-rrekiko-txoez\n",
"2 0 bero-rrekiko-txogatik\n",
"2 0 bero-rrekiko-txok\n",
"2 0 bero-rrekiko-txoko\n",
"2 0 bero-rrekiko-txooi\n",
"2 0 bero-rrekiko-txook\n",
"2 0 bero-rrekiko-txookin\n",
"2 0 bero-rrekiko-txoon\n",
"2 0 bero-rrekiko-txoontzat\n",
"2 0 bero-rrekiko-txootaz\n",
"2 0 bero-rrekiko-txooz\n",
"2 0 bero-rrekiko-txora\n",
"2 0 bero-rrekiko-txoraino\n",
"2 0 bero-rrekiko-txorantz\n",
"2 0 bero-rrekiko-txorat\n",
"2 0 bero-rrekiko-txorekin\n",
"2 0 bero-rrekiko-txoren\n",
"2 0 bero-rrekiko-txorendako\n",
"2 0 bero-rrekiko-txorentzat\n",
"2 0 bero-rrekiko-txori\n",
"2 0 bero-rrekiko-txorik\n",
"2 0 bero-rrekiko-txotik\n",
"2 0 bero-rrekiko-txotzat\n",
"2 0 bero-rrekiko-txoz\n",
"2 0 bero-rrekiko-tzar\n",
"2 0 bero-rrekiko-tzargatik\n",
"2 0 bero-rrekiko-tzarra\n",
"2 0 bero-rrekiko-tzarrak\n",
"2 0 bero-rrekiko-tzarrarekin\n",
"2 0 bero-rrekiko-tzarraren\n",
"2 0 bero-rrekiko-tzarrarendako\n",
"2 0 bero-rrekiko-tzarrarentzat\n",
"2 0 bero-rrekiko-tzarrari\n",
"2 0 bero-rrekiko-tzarraz\n",
"2 0 bero-rrekiko-tzarrean\n",
"2 0 bero-rrekiko-tzarrei\n",
"2 0 bero-rrekiko-tzarrek\n",
"2 0 bero-rrekiko-tzarrekin\n",
"2 0 bero-rrekiko-tzarreko\n",
"2 0 bero-rrekiko-tzarren\n",
"2 0 bero-rrekiko-tzarrendako\n",
"2 0 bero-rrekiko-tzarrentzat\n",
"2 0 bero-rrekiko-tzarrera\n",
"2 0 bero-rrekiko-tzarreraino\n",
"2 0 bero-rrekiko-tzarrerantz\n",
"2 0 bero-rrekiko-tzarrerat\n",
"2 0 bero-rrekiko-tzarretik\n",
"2 0 bero-rrekiko-tzarrez\n",
"2 0 bero-rrekiko-tzarri\n",
"2 0 bero-rrekiko-tzarrik\n",
"2 0 bero-rrekiko-tzarroi\n",
"2 0 bero-rrekiko-tzarrok\n",
"2 0 bero-rrekiko-tzarrokin\n",
"2 0 bero-rrekiko-tzarron\n",
"2 0 bero-rrekiko-tzarrontzat\n",
"2 0 bero-rrekiko-tzarrotaz\n",
"2 0 bero-rrekiko-tzarroz\n",
"2 0 bero-rrekiko-tzartzat\n",
"2 0 bero-rrekiko-tzat\n",
"2 0 bero-rrekiko-z\n",
"2 0 bero-rrekiko-zko\n",
". . rlengths[('243', 1, 0)] -> (520, 9356)\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rrekin\n",
"1 0 bero-rren\n",
"apply_rule('238', 1, 0)\n",
"2 0 bero-rren-a\n",
"2 0 bero-rren-agan\n",
"2 0 bero-rren-agana\n",
"2 0 bero-rren-aganaino\n",
"2 0 bero-rren-aganantz\n",
"2 0 bero-rren-agandik\n",
"2 0 bero-rren-agatik\n",
"2 0 bero-rren-ago\n",
"2 0 bero-rren-agoa\n",
"2 0 bero-rren-agoak\n",
"2 0 bero-rren-agoan\n",
"2 0 bero-rren-agoarekin\n",
"2 0 bero-rren-agoaren\n",
"2 0 bero-rren-agoarendako\n",
"2 0 bero-rren-agoarentzat\n",
"2 0 bero-rren-agoari\n",
"2 0 bero-rren-agoaz\n",
"2 0 bero-rren-agoei\n",
"2 0 bero-rren-agoek\n",
"2 0 bero-rren-agoekin\n",
"2 0 bero-rren-agoen\n",
"2 0 bero-rren-agoendako\n",
"2 0 bero-rren-agoentzat\n",
"2 0 bero-rren-agoez\n",
"2 0 bero-rren-agogatik\n",
"2 0 bero-rren-agok\n",
"2 0 bero-rren-agoko\n",
"2 0 bero-rren-agooi\n",
"2 0 bero-rren-agook\n",
"2 0 bero-rren-agookin\n",
"2 0 bero-rren-agoon\n",
"2 0 bero-rren-agoontzat\n",
"2 0 bero-rren-agootaz\n",
"2 0 bero-rren-agooz\n",
"2 0 bero-rren-agora\n",
"2 0 bero-rren-agoraino\n",
"2 0 bero-rren-agorantz\n",
"2 0 bero-rren-agorat\n",
"2 0 bero-rren-agorekin\n",
"2 0 bero-rren-agoren\n",
"2 0 bero-rren-agorendako\n",
"2 0 bero-rren-agorentzat\n",
"2 0 bero-rren-agori\n",
"2 0 bero-rren-agorik\n",
"2 0 bero-rren-agotik\n",
"2 0 bero-rren-agotzat\n",
"2 0 bero-rren-agoz\n",
"2 0 bero-rren-ak\n",
"2 0 bero-rren-arekiko\n",
"2 0 bero-rren-arekin\n",
"2 0 bero-rren-aren\n",
"2 0 bero-rren-arena\n",
"2 0 bero-rren-arenak\n",
"2 0 bero-rren-arenarekin\n",
"2 0 bero-rren-arenarendako\n",
"2 0 bero-rren-arenarentzat\n",
"2 0 bero-rren-arenari\n",
"2 0 bero-rren-arenaz\n",
"2 0 bero-rren-arendako\n",
"2 0 bero-rren-arenean\n",
"2 0 bero-rren-arenei\n",
"2 0 bero-rren-arenek\n",
"2 0 bero-rren-arenekin\n",
"2 0 bero-rren-arenendako\n",
"2 0 bero-rren-arenentzat\n",
"2 0 bero-rren-arenera\n",
"2 0 bero-rren-areneraino\n",
"2 0 bero-rren-arenerantz\n",
"2 0 bero-rren-arenerat\n",
"2 0 bero-rren-arenetan\n",
"2 0 bero-rren-arenetara\n",
"2 0 bero-rren-arenetaraino\n",
"2 0 bero-rren-arenetarantz\n",
"2 0 bero-rren-arenetarat\n",
"2 0 bero-rren-arenetarik\n",
"2 0 bero-rren-arenetatik\n",
"2 0 bero-rren-arenetik\n",
"2 0 bero-rren-arenez\n",
"2 0 bero-rren-arengan\n",
"2 0 bero-rren-arengana\n",
"2 0 bero-rren-arenganaino\n",
"2 0 bero-rren-arenganantz\n",
"2 0 bero-rren-arengandik\n",
"2 0 bero-rren-arengatik\n",
"2 0 bero-rren-areni\n",
"2 0 bero-rren-arenik\n",
"2 0 bero-rren-arenoi\n",
"2 0 bero-rren-arenok\n",
"2 0 bero-rren-arenokin\n",
"2 0 bero-rren-arenontzat\n",
"2 0 bero-rren-arenotaz\n",
"2 0 bero-rren-arenoz\n",
"2 0 bero-rren-arentzako\n",
"2 0 bero-rren-arentzat\n",
"2 0 bero-rren-ari\n",
"2 0 bero-rren-az\n",
"2 0 bero-rren-ean\n",
"2 0 bero-rren-egatik\n",
"2 0 bero-rren-ei\n",
"2 0 bero-rren-ek\n",
"2 0 bero-rren-ekiko\n",
"2 0 bero-rren-ekin\n",
"2 0 bero-rren-eko\n",
"2 0 bero-rren-ekoa\n",
"2 0 bero-rren-ekoago\n",
"2 0 bero-rren-ekoak\n",
"2 0 bero-rren-ekoan\n",
"2 0 bero-rren-ekoarekin\n",
"2 0 bero-rren-ekoarendako\n",
"2 0 bero-rren-ekoarentzat\n",
"2 0 bero-rren-ekoari\n",
"2 0 bero-rren-ekoaz\n",
"2 0 bero-rren-ekoegi\n",
"2 0 bero-rren-ekoei\n",
"2 0 bero-rren-ekoek\n",
"2 0 bero-rren-ekoekin\n",
"2 0 bero-rren-ekoendako\n",
"2 0 bero-rren-ekoentzat\n",
"2 0 bero-rren-ekoetan\n",
"2 0 bero-rren-ekoetara\n",
"2 0 bero-rren-ekoetaraino\n",
"2 0 bero-rren-ekoetarantz\n",
"2 0 bero-rren-ekoetarat\n",
"2 0 bero-rren-ekoetarik\n",
"2 0 bero-rren-ekoetatik\n",
"2 0 bero-rren-ekoez\n",
"2 0 bero-rren-ekogatik\n",
"2 0 bero-rren-ekok\n",
"2 0 bero-rren-ekooi\n",
"2 0 bero-rren-ekook\n",
"2 0 bero-rren-ekookin\n",
"2 0 bero-rren-ekoontzat\n",
"2 0 bero-rren-ekootaz\n",
"2 0 bero-rren-ekooz\n",
"2 0 bero-rren-ekora\n",
"2 0 bero-rren-ekoraino\n",
"2 0 bero-rren-ekorantz\n",
"2 0 bero-rren-ekorat\n",
"2 0 bero-rren-ekorekin\n",
"2 0 bero-rren-ekorendako\n",
"2 0 bero-rren-ekorentzat\n",
"2 0 bero-rren-ekori\n",
"2 0 bero-rren-ekorik\n",
"2 0 bero-rren-ekotan\n",
"2 0 bero-rren-ekotara\n",
"2 0 bero-rren-ekotaraino\n",
"2 0 bero-rren-ekotarantz\n",
"2 0 bero-rren-ekotarat\n",
"2 0 bero-rren-ekotarik\n",
"2 0 bero-rren-ekotatik\n",
"2 0 bero-rren-ekotik\n",
"2 0 bero-rren-ekotzat\n",
"2 0 bero-rren-ekoz\n",
"2 0 bero-rren-en\n",
"2 0 bero-rren-ena\n",
"2 0 bero-rren-enak\n",
"2 0 bero-rren-enarekin\n",
"2 0 bero-rren-enarendako\n",
"2 0 bero-rren-enarentzat\n",
"2 0 bero-rren-enari\n",
"2 0 bero-rren-enaz\n",
"2 0 bero-rren-endako\n",
"2 0 bero-rren-enean\n",
"2 0 bero-rren-enei\n",
"2 0 bero-rren-enek\n",
"2 0 bero-rren-enekin\n",
"2 0 bero-rren-enendako\n",
"2 0 bero-rren-enentzat\n",
"2 0 bero-rren-enera\n",
"2 0 bero-rren-eneraino\n",
"2 0 bero-rren-enerantz\n",
"2 0 bero-rren-enerat\n",
"2 0 bero-rren-enetan\n",
"2 0 bero-rren-enetara\n",
"2 0 bero-rren-enetaraino\n",
"2 0 bero-rren-enetarantz\n",
"2 0 bero-rren-enetarat\n",
"2 0 bero-rren-enetarik\n",
"2 0 bero-rren-enetatik\n",
"2 0 bero-rren-enetik\n",
"2 0 bero-rren-enez\n",
"2 0 bero-rren-engan\n",
"2 0 bero-rren-engana\n",
"2 0 bero-rren-enganaino\n",
"2 0 bero-rren-enganantz\n",
"2 0 bero-rren-engandik\n",
"2 0 bero-rren-engatik\n",
"2 0 bero-rren-eni\n",
"2 0 bero-rren-enik\n",
"2 0 bero-rren-enoi\n",
"2 0 bero-rren-enok\n",
"2 0 bero-rren-enokin\n",
"2 0 bero-rren-enontzat\n",
"2 0 bero-rren-enotaz\n",
"2 0 bero-rren-enoz\n",
"2 0 bero-rren-entzako\n",
"2 0 bero-rren-entzat\n",
"2 0 bero-rren-era\n",
"2 0 bero-rren-erago\n",
"2 0 bero-rren-eraino\n",
"2 0 bero-rren-erainoko\n",
"2 0 bero-rren-erako\n",
"2 0 bero-rren-erantz\n",
"2 0 bero-rren-erantzago\n",
"2 0 bero-rren-erantzegi\n",
"2 0 bero-rren-eranzko\n",
"2 0 bero-rren-erat\n",
"2 0 bero-rren-etako\n",
"2 0 bero-rren-etakoa\n",
"2 0 bero-rren-etakoago\n",
"2 0 bero-rren-etakoak\n",
"2 0 bero-rren-etakoan\n",
"2 0 bero-rren-etakoarekin\n",
"2 0 bero-rren-etakoarendako\n",
"2 0 bero-rren-etakoarentzat\n",
"2 0 bero-rren-etakoari\n",
"2 0 bero-rren-etakoaz\n",
"2 0 bero-rren-etakoegi\n",
"2 0 bero-rren-etakoei\n",
"2 0 bero-rren-etakoek\n",
"2 0 bero-rren-etakoekin\n",
"2 0 bero-rren-etakoendako\n",
"2 0 bero-rren-etakoentzat\n",
"2 0 bero-rren-etakoetan\n",
"2 0 bero-rren-etakoetara\n",
"2 0 bero-rren-etakoetaraino\n",
"2 0 bero-rren-etakoetarantz\n",
"2 0 bero-rren-etakoetarat\n",
"2 0 bero-rren-etakoetarik\n",
"2 0 bero-rren-etakoetatik\n",
"2 0 bero-rren-etakoez\n",
"2 0 bero-rren-etakogatik\n",
"2 0 bero-rren-etakok\n",
"2 0 bero-rren-etakooi\n",
"2 0 bero-rren-etakook\n",
"2 0 bero-rren-etakookin\n",
"2 0 bero-rren-etakoontzat\n",
"2 0 bero-rren-etakootaz\n",
"2 0 bero-rren-etakooz\n",
"2 0 bero-rren-etakora\n",
"2 0 bero-rren-etakoraino\n",
"2 0 bero-rren-etakorantz\n",
"2 0 bero-rren-etakorat\n",
"2 0 bero-rren-etakorekin\n",
"2 0 bero-rren-etakorendako\n",
"2 0 bero-rren-etakorentzat\n",
"2 0 bero-rren-etakori\n",
"2 0 bero-rren-etakorik\n",
"2 0 bero-rren-etakotan\n",
"2 0 bero-rren-etakotara\n",
"2 0 bero-rren-etakotaraino\n",
"2 0 bero-rren-etakotarantz\n",
"2 0 bero-rren-etakotarat\n",
"2 0 bero-rren-etakotarik\n",
"2 0 bero-rren-etakotatik\n",
"2 0 bero-rren-etakotik\n",
"2 0 bero-rren-etakotzat\n",
"2 0 bero-rren-etakoz\n",
"2 0 bero-rren-etan\n",
"2 0 bero-rren-etara\n",
"2 0 bero-rren-etarago\n",
"2 0 bero-rren-etaraino\n",
"2 0 bero-rren-etarainoko\n",
"2 0 bero-rren-etarako\n",
"2 0 bero-rren-etarantz\n",
"2 0 bero-rren-etarantzago\n",
"2 0 bero-rren-etarantzegi\n",
"2 0 bero-rren-etaranzko\n",
"2 0 bero-rren-etarat\n",
"2 0 bero-rren-etarik\n",
"2 0 bero-rren-etariko\n",
"2 0 bero-rren-etatik\n",
"2 0 bero-rren-etatiko\n",
"2 0 bero-rren-etik\n",
"2 0 bero-rren-etiko\n",
"2 0 bero-rren-ez\n",
"2 0 bero-rren-ezko\n",
"2 0 bero-rren-gatik\n",
"2 0 bero-rren-gatiko\n",
"2 0 bero-rren-i\n",
"2 0 bero-rren-ik\n",
"2 0 bero-rren-oi\n",
"2 0 bero-rren-ok\n",
"2 0 bero-rren-okiko\n",
"2 0 bero-rren-okin\n",
"2 0 bero-rren-on\n",
"2 0 bero-rren-ona\n",
"2 0 bero-rren-onak\n",
"2 0 bero-rren-onarekin\n",
"2 0 bero-rren-onarendako\n",
"2 0 bero-rren-onarentzat\n",
"2 0 bero-rren-onari\n",
"2 0 bero-rren-onaz\n",
"2 0 bero-rren-onean\n",
"2 0 bero-rren-onei\n",
"2 0 bero-rren-onek\n",
"2 0 bero-rren-onekin\n",
"2 0 bero-rren-onendako\n",
"2 0 bero-rren-onentzat\n",
"2 0 bero-rren-onera\n",
"2 0 bero-rren-oneraino\n",
"2 0 bero-rren-onerantz\n",
"2 0 bero-rren-onerat\n",
"2 0 bero-rren-onetik\n",
"2 0 bero-rren-onez\n",
"2 0 bero-rren-ongan\n",
"2 0 bero-rren-ongana\n",
"2 0 bero-rren-onganaino\n",
"2 0 bero-rren-onganantz\n",
"2 0 bero-rren-ongandik\n",
"2 0 bero-rren-ongatik\n",
"2 0 bero-rren-oni\n",
"2 0 bero-rren-onik\n",
"2 0 bero-rren-onoi\n",
"2 0 bero-rren-onok\n",
"2 0 bero-rren-onokin\n",
"2 0 bero-rren-onontzat\n",
"2 0 bero-rren-onotaz\n",
"2 0 bero-rren-onoz\n",
"2 0 bero-rren-ontzako\n",
"2 0 bero-rren-ontzat\n",
"2 0 bero-rren-otako\n",
"2 0 bero-rren-otan\n",
"2 0 bero-rren-otara\n",
"2 0 bero-rren-otaraino\n",
"2 0 bero-rren-otarantz\n",
"2 0 bero-rren-otarat\n",
"2 0 bero-rren-otarik\n",
"2 0 bero-rren-otatik\n",
"2 0 bero-rren-otaz\n",
"2 0 bero-rren-oz\n",
"2 0 bero-rren-txo\n",
"2 0 bero-rren-txoa\n",
"2 0 bero-rren-txoak\n",
"2 0 bero-rren-txoan\n",
"2 0 bero-rren-txoarekin\n",
"2 0 bero-rren-txoaren\n",
"2 0 bero-rren-txoarendako\n",
"2 0 bero-rren-txoarentzat\n",
"2 0 bero-rren-txoari\n",
"2 0 bero-rren-txoaz\n",
"2 0 bero-rren-txoei\n",
"2 0 bero-rren-txoek\n",
"2 0 bero-rren-txoekin\n",
"2 0 bero-rren-txoen\n",
"2 0 bero-rren-txoendako\n",
"2 0 bero-rren-txoentzat\n",
"2 0 bero-rren-txoez\n",
"2 0 bero-rren-txogatik\n",
"2 0 bero-rren-txok\n",
"2 0 bero-rren-txoko\n",
"2 0 bero-rren-txooi\n",
"2 0 bero-rren-txook\n",
"2 0 bero-rren-txookin\n",
"2 0 bero-rren-txoon\n",
"2 0 bero-rren-txoontzat\n",
"2 0 bero-rren-txootaz\n",
"2 0 bero-rren-txooz\n",
"2 0 bero-rren-txora\n",
"2 0 bero-rren-txoraino\n",
"2 0 bero-rren-txorantz\n",
"2 0 bero-rren-txorat\n",
"2 0 bero-rren-txorekin\n",
"2 0 bero-rren-txoren\n",
"2 0 bero-rren-txorendako\n",
"2 0 bero-rren-txorentzat\n",
"2 0 bero-rren-txori\n",
"2 0 bero-rren-txorik\n",
"2 0 bero-rren-txotik\n",
"2 0 bero-rren-txotzat\n",
"2 0 bero-rren-txoz\n",
"2 0 bero-rren-tzar\n",
"2 0 bero-rren-tzargatik\n",
"2 0 bero-rren-tzarra\n",
"2 0 bero-rren-tzarrak\n",
"2 0 bero-rren-tzarrarekin\n",
"2 0 bero-rren-tzarraren\n",
"2 0 bero-rren-tzarrarendako\n",
"2 0 bero-rren-tzarrarentzat\n",
"2 0 bero-rren-tzarrari\n",
"2 0 bero-rren-tzarraz\n",
"2 0 bero-rren-tzarrean\n",
"2 0 bero-rren-tzarrei\n",
"2 0 bero-rren-tzarrek\n",
"2 0 bero-rren-tzarrekin\n",
"2 0 bero-rren-tzarreko\n",
"2 0 bero-rren-tzarren\n",
"2 0 bero-rren-tzarrendako\n",
"2 0 bero-rren-tzarrentzat\n",
"2 0 bero-rren-tzarrera\n",
"2 0 bero-rren-tzarreraino\n",
"2 0 bero-rren-tzarrerantz\n",
"2 0 bero-rren-tzarrerat\n",
"2 0 bero-rren-tzarretik\n",
"2 0 bero-rren-tzarrez\n",
"2 0 bero-rren-tzarri\n",
"2 0 bero-rren-tzarrik\n",
"2 0 bero-rren-tzarroi\n",
"2 0 bero-rren-tzarrok\n",
"2 0 bero-rren-tzarrokin\n",
"2 0 bero-rren-tzarron\n",
"2 0 bero-rren-tzarrontzat\n",
"2 0 bero-rren-tzarrotaz\n",
"2 0 bero-rren-tzarroz\n",
"2 0 bero-rren-tzartzat\n",
"2 0 bero-rren-tzat\n",
". . rlengths[('238', 1, 0)] -> (406, 6099)\n",
"/238 -> (406, 6099)\n",
"1 0 bero-rrena\n",
"1 0 bero-rrenak\n",
"1 0 bero-rrenarekin\n",
"1 0 bero-rrenarendako\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rrenarentzat\n",
"1 0 bero-rrenari\n",
"1 0 bero-rrenaz\n",
"1 0 bero-rrendako\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rrenean\n",
"1 0 bero-rrenei\n",
"1 0 bero-rrenek\n",
"1 0 bero-rrenekin\n",
"1 0 bero-rrenendako\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rrenentzat\n",
"1 0 bero-rrenera\n",
"1 0 bero-rreneraino\n",
"1 0 bero-rrenerantz\n",
"1 0 bero-rrenerat\n",
"1 0 bero-rrenetan\n",
"1 0 bero-rrenetara\n",
"1 0 bero-rrenetaraino\n",
"1 0 bero-rrenetarantz\n",
"1 0 bero-rrenetarat\n",
"1 0 bero-rrenetarik\n",
"1 0 bero-rrenetatik\n",
"1 0 bero-rrenetik\n",
"1 0 bero-rrenez\n",
"1 0 bero-rrengan\n",
"1 0 bero-rrengana\n",
"1 0 bero-rrenganaino\n",
"1 0 bero-rrenganantz\n",
"1 0 bero-rrengandik\n",
"1 0 bero-rrengatik\n",
"1 0 bero-rreni\n",
"1 0 bero-rrenik\n",
"1 0 bero-rrenoi\n",
"1 0 bero-rrenok\n",
"1 0 bero-rrenokin\n",
"1 0 bero-rrenontzat\n",
"1 0 bero-rrenotaz\n",
"1 0 bero-rrenoz\n",
"1 0 bero-rrentzako\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rrentzat\n",
"1 0 bero-rretako\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretakoa\n",
"1 0 bero-rretakoago\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretakoak\n",
"1 0 bero-rretakoan\n",
"1 0 bero-rretakoarekin\n",
"1 0 bero-rretakoarendako\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretakoarentzat\n",
"1 0 bero-rretakoari\n",
"1 0 bero-rretakoaz\n",
"1 0 bero-rretakoegi\n",
"1 0 bero-rretakoei\n",
"1 0 bero-rretakoek\n",
"1 0 bero-rretakoekin\n",
"1 0 bero-rretakoendako\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretakoentzat\n",
"1 0 bero-rretakoetan\n",
"1 0 bero-rretakoetara\n",
"1 0 bero-rretakoetaraino\n",
"1 0 bero-rretakoetarantz\n",
"1 0 bero-rretakoetarat\n",
"1 0 bero-rretakoetarik\n",
"1 0 bero-rretakoetatik\n",
"1 0 bero-rretakoez\n",
"1 0 bero-rretakogatik\n",
"1 0 bero-rretakok\n",
"1 0 bero-rretakooi\n",
"1 0 bero-rretakook\n",
"1 0 bero-rretakookin\n",
"1 0 bero-rretakoontzat\n",
"1 0 bero-rretakootaz\n",
"1 0 bero-rretakooz\n",
"1 0 bero-rretakora\n",
"1 0 bero-rretakoraino\n",
"1 0 bero-rretakorantz\n",
"1 0 bero-rretakorat\n",
"1 0 bero-rretakorekin\n",
"1 0 bero-rretakorendako\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretakorentzat\n",
"1 0 bero-rretakori\n",
"1 0 bero-rretakorik\n",
"1 0 bero-rretakotan\n",
"1 0 bero-rretakotara\n",
"1 0 bero-rretakotaraino\n",
"1 0 bero-rretakotarantz\n",
"1 0 bero-rretakotarat\n",
"1 0 bero-rretakotarik\n",
"1 0 bero-rretakotatik\n",
"1 0 bero-rretakotik\n",
"1 0 bero-rretakotzat\n",
"1 0 bero-rretakoz\n",
"1 0 bero-rretan\n",
"1 0 bero-rretara\n",
"1 0 bero-rretarago\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretaraino\n",
"1 0 bero-rretarainoko\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretarako\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretarantz\n",
"1 0 bero-rretarantzago\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretarantzegi\n",
"1 0 bero-rretaranzko\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretarat\n",
"1 0 bero-rretarik\n",
"1 0 bero-rretariko\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretatik\n",
"1 0 bero-rretatiko\n",
"/243 -> (520, 9356)\n",
"1 0 bero-rretaz\n",
"1 0 bero-rrez\n",
"1 0 bero-rrezaz\n",
". . rlengths[('358', 0, 0)] -> (9363, 166736)\n"
]
},
{
"data": {
"text/plain": [
"(9363, 166736)"
]
},
"execution_count": 271,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# take the first .dic entry that uses rule 358 and keep only its stem, as a 1-tuple\n",
"entries_358 = [entry for entry in dic if '358' in entry[1:]]\n",
"word = entries_358[0][:1]\n",
"print('0 0 {}'.format('-'.join(word)))\n",
"# disp=True makes apply_rule print each constructed form as it goes\n",
"apply_rule('358', word=word, disp=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's use `apply_rule` to estimate how big the dictionary would be if we expanded every rule for every entry:"
]
},
{
"cell_type": "code",
"execution_count": 460,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T00:56:22.124226Z",
"start_time": "2017-11-17T00:56:21.279549Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5,757,832,836 words, 123.9 Gb\n"
]
}
],
"source": [
"# Walk every .dic entry, counting the stem itself plus every form generated by\n",
"# each of its rule codes. apply_rule returns (number of forms, characters those\n",
"# forms add on top of the stem), so the stem length is added once per form.\n",
"nwords = nchars = 0\n",
"for lineparts in dic:\n",
"    word = lineparts[0]\n",
"    l_word = len(word)\n",
"    # the bare stem is a word in its own right\n",
"    nwords, nchars = nwords + 1, nchars + l_word\n",
"    for rc in lineparts[1:]:\n",
"        to_add = apply_rule(rc, word=(word,))\n",
"        nwords += to_add[0]\n",
"        nchars += l_word * to_add[0] + to_add[1]\n",
"\n",
"# 2**30 == 1024**3; the size label treats one character as one byte\n",
"print('{:,} words, {:.1f} Gb'.format(nwords, nchars / 2**30))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"quite big 😲"
]
},
{
"cell_type": "code",
"execution_count": 511,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T12:53:41.065118Z",
"start_time": "2017-11-17T12:53:41.053504Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"~ 43,219 constructed words per .dic entry\n"
]
}
],
"source": [
"wpe = nwords / len(dic)\n",
"print('~ {:,.0f} constructed words per .dic entry'.format(wpe))"
]
},
{
"cell_type": "code",
"execution_count": 494,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T01:04:35.485601Z",
"start_time": "2017-11-17T01:04:35.469838Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"~ 23.1 chars per constructed word\n"
]
}
],
"source": [
"avg_wlength = nchars / nwords\n",
"print('~ {:.3g} chars per constructed word'.format(avg_wlength))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"let's see which rules chain (i.e. have been applied to words which already have an affix):"
]
},
{
"cell_type": "code",
"execution_count": 309,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-16T20:40:08.205170Z",
"start_time": "2017-11-16T20:40:08.174143Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{('238', 1, 0): 406, ('243', 1, 0): 520}"
]
},
"execution_count": 309,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"{rh: nw for rh, (nw, chars) in rlengths.items() if (rh[1] or rh[2]) and nw}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"so, only rules 238 and 243 can be applied to words which already have an affix."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## look for similar rules"
]
},
{
"cell_type": "code",
"execution_count": 381,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T00:06:53.980527Z",
"start_time": "2017-11-17T00:06:53.934844Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"def hashrule(rcode):\n",
" \"\"\"return a string representation of a rule, with sorted variants\"\"\"\n",
" return '\\n'.join(' '.join(rk[2:]) for rk in sorted(rules.get(str(rcode), {}).keys()))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"let's check for similar rules, using difflib. This will take quite some time:"
]
},
{
"cell_type": "code",
"execution_count": 504,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T12:46:24.398956Z",
"start_time": "2017-11-17T11:51:12.363263Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n",
" 18: 100%\n",
" 29: 99.9507%\n",
" 27: 99.9507%\n",
"2\n",
" 28: 99.9194%\n",
" 13: 99.6229%\n",
" 22: 99.6227%\n",
"3\n",
"4\n",
" 8: 99.6655%\n",
"5\n",
" 9: 99.7228%\n",
"6\n",
" 10: 99.7228%\n",
"7\n",
" 47: 99.9676%\n",
" 48: 99.9073%\n",
"8\n",
"9\n",
"10\n",
"11\n",
" 22: 100%\n",
" 21: 100%\n",
" 13: 99.9549%\n",
"12\n",
"13\n",
" 22: 99.9549%\n",
" 21: 99.9549%\n",
" 11: 99.9549%\n",
"14\n",
"15\n",
"16\n",
"17\n",
"18\n",
" 1: 100%\n",
" 29: 99.9507%\n",
" 27: 99.9507%\n",
"19\n",
" 68: 99.2916%\n",
" 66: 99.2916%\n",
" 65: 99.2916%\n",
"20\n",
"21\n",
" 22: 100%\n",
" 11: 100%\n",
" 13: 99.9549%\n",
"22\n",
" 21: 100%\n",
" 11: 100%\n",
" 13: 99.9549%\n",
"23\n",
" 29: 100%\n",
" 27: 100%\n",
" 18: 99.9507%\n",
"24\n",
" 13: 99.6497%\n",
" 22: 99.6496%\n",
" 21: 99.6496%\n",
"25\n",
" 30: 99.8205%\n",
"26\n",
"27\n",
" 29: 100%\n",
" 23: 100%\n",
" 18: 99.9507%\n",
"28\n",
" 2: 99.9194%\n",
" 13: 99.7035%\n",
" 22: 99.7033%\n",
"29\n",
" 27: 100%\n",
" 23: 100%\n",
" 18: 99.9507%\n",
"30\n",
" 25: 99.8205%\n",
"31\n",
"32\n",
"33\n",
"34\n",
" 35: 100%\n",
" 49: 99.5953%\n",
"35\n",
" 34: 100%\n",
" 49: 99.5953%\n",
"36\n",
"37\n",
"38\n",
" 36: 99.3461%\n",
"39\n",
" 37: 99.4115%\n",
"40\n",
"41\n",
" 345: 85.2934%\n",
"42\n",
"43\n",
"44\n",
"45\n",
"46\n",
"47\n",
" 7: 99.9676%\n",
" 48: 99.9397%\n",
"48\n",
" 47: 99.9397%\n",
" 7: 99.9073%\n",
"49\n",
"50\n",
"51\n",
" 355: 100%\n",
" 354: 100%\n",
"53\n",
" 55: 100%\n",
" 54: 100%\n",
"54\n",
" 55: 100%\n",
" 53: 100%\n",
"55\n",
" 54: 100%\n",
" 53: 100%\n",
"56\n",
"57\n",
" 327: 90.8714%\n",
" 326: 90.8714%\n",
" 325: 90.8714%\n",
"58\n",
" 60: 90.3279%\n",
" 227: 83.6715%\n",
"59\n",
" 64: 91.3816%\n",
" 69: 89.6601%\n",
" 67: 89.6601%\n",
"60\n",
" 227: 94.491%\n",
" 58: 90.3279%\n",
" 233: 85.0395%\n",
"61\n",
"62\n",
" 69: 90.7178%\n",
" 67: 90.7178%\n",
" 63: 90.7178%\n",
"63\n",
" 69: 100%\n",
" 67: 100%\n",
" 231: 94.5827%\n",
"64\n",
" 59: 91.3816%\n",
" 69: 88.8471%\n",
" 67: 88.8471%\n",
"65\n",
" 68: 100%\n",
" 66: 100%\n",
" 19: 99.2916%\n",
"66\n",
" 68: 100%\n",
" 65: 100%\n",
" 19: 99.2916%\n",
"67\n",
" 69: 100%\n",
" 63: 100%\n",
" 231: 94.5827%\n",
"68\n",
" 66: 100%\n",
" 65: 100%\n",
" 19: 99.2916%\n",
"69\n",
" 67: 100%\n",
" 63: 100%\n",
" 231: 94.5827%\n",
"70\n",
" 69: 88.2988%\n",
" 67: 88.2988%\n",
" 63: 88.2988%\n",
"71\n",
" 59: 87.7655%\n",
" 246: 85.2364%\n",
" 64: 83.1837%\n",
"72\n",
" 69: 88.2988%\n",
" 67: 88.2988%\n",
" 63: 88.2988%\n",
"73\n",
"75\n",
" 78: 96%\n",
" 77: 96%\n",
" 76: 96%\n",
"76\n",
" 78: 100%\n",
" 77: 100%\n",
" 75: 96%\n",
"77\n",
" 78: 100%\n",
" 76: 100%\n",
" 75: 96%\n",
"78\n",
" 77: 100%\n",
" 76: 100%\n",
" 75: 96%\n",
"79\n",
" 263: 92.6471%\n",
" 262: 92.6471%\n",
" 261: 92.6471%\n",
"80\n",
"81\n",
" 92: 95.5882%\n",
" 91: 95.5882%\n",
" 89: 95.5882%\n",
"82\n",
" 92: 85.9375%\n",
" 91: 85.9375%\n",
" 89: 85.9375%\n",
"83\n",
" 90: 99.3007%\n",
" 92: 95.5882%\n",
" 91: 95.5882%\n",
"84\n",
"85\n",
" 92: 100%\n",
" 91: 100%\n",
" 89: 100%\n",
"86\n",
" 92: 100%\n",
" 91: 100%\n",
" 89: 100%\n",
"87\n",
" 92: 100%\n",
" 91: 100%\n",
" 89: 100%\n",
"88\n",
" 92: 100%\n",
" 91: 100%\n",
" 89: 100%\n",
"89\n",
" 92: 100%\n",
" 91: 100%\n",
" 88: 100%\n",
"90\n",
" 83: 99.3007%\n",
" 92: 94.8905%\n",
" 91: 94.8905%\n",
"91\n",
" 92: 100%\n",
" 89: 100%\n",
" 88: 100%\n",
"92\n",
" 91: 100%\n",
" 89: 100%\n",
" 88: 100%\n",
"93\n",
" 263: 99.4924%\n",
" 262: 99.4924%\n",
" 261: 99.4924%\n",
"94\n",
" 263: 99.4083%\n",
" 262: 99.4083%\n",
" 261: 99.4083%\n",
"95\n",
" 265: 94.7036%\n",
"96\n",
" 185: 98.1937%\n",
" 350: 97.0473%\n",
" 184: 97.046%\n",
"97\n",
" 357: 100%\n",
"98\n",
"99\n",
" 224: 96.5574%\n",
"100\n",
"101\n",
" 107: 90.9091%\n",
" 106: 90.9091%\n",
" 104: 90.9091%\n",
"102\n",
"103\n",
" 109: 95.2381%\n",
" 108: 95.2381%\n",
" 105: 95.2381%\n",
"104\n",
" 107: 100%\n",
" 106: 100%\n",
" 101: 90.9091%\n",
"105\n",
" 109: 100%\n",
" 108: 100%\n",
" 103: 95.2381%\n",
"106\n",
" 107: 100%\n",
" 104: 100%\n",
" 101: 90.9091%\n",
"107\n",
" 106: 100%\n",
" 104: 100%\n",
" 101: 90.9091%\n",
"108\n",
" 109: 100%\n",
" 105: 100%\n",
" 103: 95.2381%\n",
"109\n",
" 108: 100%\n",
" 105: 100%\n",
" 103: 95.2381%\n",
"110\n",
" 111: 100%\n",
"111\n",
" 110: 100%\n",
"112\n",
" 135: 90.1322%\n",
" 132: 82.3011%\n",
" 130: 82.3011%\n",
"113\n",
" 114: 93.7613%\n",
"114\n",
" 113: 93.7613%\n",
" 137: 84.8217%\n",
" 129: 80.9109%\n",
"115\n",
" 140: 93.5969%\n",
"117\n",
" 120: 100%\n",
" 344: 95.9811%\n",
" 343: 95.9811%\n",
"119\n",
"120\n",
" 117: 100%\n",
" 344: 95.9811%\n",
" 343: 95.9811%\n",
"121\n",
" 154: 99.5544%\n",
" 307: 96.6601%\n",
" 311: 96.1842%\n",
"122\n",
" 129: 100%\n",
" 128: 100%\n",
" 137: 88.8496%\n",
"123\n",
" 186: 82.6018%\n",
"124\n",
"125\n",
" 132: 99.782%\n",
" 130: 99.782%\n",
" 126: 99.782%\n",
"126\n",
" 132: 100%\n",
" 130: 100%\n",
" 125: 99.782%\n",
"127\n",
" 134: 99.7543%\n",
" 133: 99.7543%\n",
" 132: 99.7408%\n",
"128\n",
" 129: 100%\n",
" 122: 100%\n",
" 137: 88.8496%\n",
"129\n",
" 128: 100%\n",
" 122: 100%\n",
" 137: 88.8496%\n",
"130\n",
" 132: 100%\n",
" 126: 100%\n",
" 125: 99.782%\n",
"131\n",
" 129: 99.347%\n",
" 128: 99.347%\n",
" 122: 99.347%\n",
"132\n",
" 130: 100%\n",
" 126: 100%\n",
" 125: 99.782%\n",
"133\n",
" 127: 99.7543%\n",
" 134: 99.7271%\n",
" 132: 99.6319%\n",
"134\n",
" 127: 99.7543%\n",
" 133: 99.7271%\n",
" 132: 99.6319%\n",
"135\n",
" 112: 90.1322%\n",
"136\n",
" 114: 92.4336%\n",
" 113: 87.3561%\n",
" 137: 81.6458%\n",
"137\n",
"138\n",
"139\n",
" 141: 99.6754%\n",
"140\n",
"141\n",
" 139: 99.6754%\n",
"142\n",
"143\n",
"144\n",
" 338: 88.8314%\n",
" 307: 81.1396%\n",
" 311: 80.8924%\n",
"145\n",
"146\n",
" 307: 80.9308%\n",
" 311: 80.5698%\n",
" 153: 80.5698%\n",
"147\n",
" 146: 80.2244%\n",
"148\n",
" 258: 85.921%\n",
" 257: 85.921%\n",
" 254: 85.921%\n",
"149\n",
" 93: 87.881%\n",
" 263: 87.8267%\n",
" 262: 87.8267%\n",
"150\n",
" 311: 100%\n",
" 153: 100%\n",
" 152: 100%\n",
"151\n",
" 341: 92.3166%\n",
"152\n",
" 311: 100%\n",
" 153: 100%\n",
" 150: 100%\n",
"153\n",
" 311: 100%\n",
" 152: 100%\n",
" 150: 100%\n",
"154\n",
" 121: 99.5544%\n",
" 307: 96.8421%\n",
" 311: 96.6292%\n",
"155\n",
" 161: 98.8095%\n",
" 162: 98.8077%\n",
" 167: 97.619%\n",
"156\n",
" 166: 95.6036%\n",
" 165: 95.6036%\n",
" 164: 95.6036%\n",
"157\n",
" 166: 100%\n",
" 165: 100%\n",
" 164: 100%\n",
"158\n",
" 156: 94.3711%\n",
" 166: 93.2692%\n",
" 165: 93.2692%\n",
"159\n",
" 166: 100%\n",
" 165: 100%\n",
" 164: 100%\n",
"160\n",
" 162: 97.7909%\n",
" 167: 97.6471%\n",
" 170: 97.2182%\n",
"161\n",
" 162: 99.1163%\n",
" 155: 98.8095%\n",
" 167: 98.0882%\n",
"162\n",
" 167: 98.0854%\n",
" 160: 97.7909%\n",
" 170: 97.3607%\n",
"163\n",
" 166: 100%\n",
" 165: 100%\n",
" 164: 100%\n",
"164\n",
" 166: 100%\n",
" 165: 100%\n",
" 163: 100%\n",
"165\n",
" 166: 100%\n",
" 164: 100%\n",
" 163: 100%\n",
"166\n",
" 165: 100%\n",
" 164: 100%\n",
" 163: 100%\n",
"167\n",
" 162: 98.0854%\n",
" 160: 97.6471%\n",
" 170: 97.2182%\n",
"168\n",
" 162: 97.3607%\n",
" 167: 97.2182%\n",
" 160: 97.2182%\n",
"169\n",
" 162: 97.3607%\n",
" 167: 97.2182%\n",
" 160: 97.2182%\n",
"170\n",
" 162: 97.3607%\n",
" 167: 97.2182%\n",
" 160: 97.2182%\n",
"171\n",
" 173: 100%\n",
"172\n",
" 175: 100%\n",
" 178: 99.5902%\n",
" 190: 87.696%\n",
"173\n",
" 171: 100%\n",
"174\n",
" 181: 100%\n",
" 177: 100%\n",
" 176: 100%\n",
"175\n",
" 172: 100%\n",
" 178: 99.5902%\n",
" 190: 87.696%\n",
"176\n",
" 181: 100%\n",
" 177: 100%\n",
" 174: 100%\n",
"177\n",
" 181: 100%\n",
" 176: 100%\n",
" 174: 100%\n",
"178\n",
" 175: 99.5902%\n",
" 172: 99.5902%\n",
" 190: 87.5452%\n",
"179\n",
" 219: 98.2355%\n",
" 222: 95.614%\n",
" 205: 83.2604%\n",
"180\n",
" 226: 97.0741%\n",
" 221: 97.0741%\n",
"181\n",
" 177: 100%\n",
" 176: 100%\n",
" 174: 100%\n",
"182\n",
" 175: 89.7852%\n",
" 172: 89.7852%\n",
" 178: 89.7065%\n",
"183\n",
" 184: 100%\n",
"184\n",
" 183: 100%\n",
"185\n",
" 181: 95.502%\n",
" 177: 95.502%\n",
" 176: 95.502%\n",
"186\n",
"187\n",
"188\n",
"189\n",
" 192: 83.8299%\n",
"190\n",
" 175: 87.696%\n",
" 172: 87.696%\n",
" 178: 87.5452%\n",
"191\n",
"192\n",
"193\n",
" 205: 100%\n",
" 204: 100%\n",
" 203: 100%\n",
"194\n",
" 205: 100%\n",
" 204: 100%\n",
" 203: 100%\n",
"195\n",
"196\n",
"197\n",
" 205: 100%\n",
" 204: 100%\n",
" 203: 100%\n",
"198\n",
" 205: 100%\n",
" 204: 100%\n",
" 203: 100%\n",
"199\n",
" 205: 100%\n",
" 204: 100%\n",
" 203: 100%\n",
"200\n",
" 205: 100%\n",
" 204: 100%\n",
" 203: 100%\n",
"201\n",
" 205: 100%\n",
" 204: 100%\n",
" 203: 100%\n",
"202\n",
" 205: 100%\n",
" 204: 100%\n",
" 203: 100%\n",
"203\n",
" 205: 100%\n",
" 204: 100%\n",
" 202: 100%\n",
"204\n",
" 205: 100%\n",
" 203: 100%\n",
" 202: 100%\n",
"205\n",
" 204: 100%\n",
" 203: 100%\n",
" 202: 100%\n",
"206\n",
"207\n",
" 208: 100%\n",
"208\n",
" 207: 100%\n",
"209\n",
"210\n",
" 209: 99.2492%\n",
"211\n",
"213\n",
"214\n",
"215\n",
"216\n",
"217\n",
" 226: 93.5162%\n",
" 221: 93.5162%\n",
" 180: 92.3786%\n",
"218\n",
" 219: 99.9675%\n",
" 222: 97.141%\n",
" 205: 84.0532%\n",
"219\n",
" 222: 97.1719%\n",
" 205: 84.1318%\n",
" 204: 84.1318%\n",
"220\n",
" 225: 100%\n",
"221\n",
" 226: 100%\n",
"222\n",
" 219: 97.1719%\n",
" 205: 82.0944%\n",
" 204: 82.0944%\n",
"223\n",
" 330: 93.6906%\n",
" 333: 91.4768%\n",
" 155: 88.6345%\n",
"224\n",
" 99: 96.5574%\n",
"225\n",
" 220: 100%\n",
"226\n",
" 221: 100%\n",
"227\n",
" 233: 87.2215%\n",
" 320: 86.5241%\n",
"228\n",
"229\n",
" 230: 100%\n",
" 232: 86.6614%\n",
" 231: 84.7613%\n",
"230\n",
" 229: 100%\n",
" 232: 86.6614%\n",
" 231: 84.7613%\n",
"231\n",
" 228: 91.7589%\n",
" 328: 86.7244%\n",
" 323: 86.7244%\n",
"232\n",
" 328: 86.4274%\n",
" 323: 86.4274%\n",
" 244: 86.4274%\n",
"233\n",
" 320: 99.3742%\n",
"234\n",
" 321: 100%\n",
"235\n",
"236\n",
"237\n",
" 322: 100%\n",
" 328: 89.9406%\n",
" 323: 89.9406%\n",
"238\n",
" 328: 100%\n",
" 323: 100%\n",
" 244: 100%\n",
"239\n",
" 324: 100%\n",
" 328: 87.785%\n",
" 323: 87.785%\n",
"240\n",
" 327: 100%\n",
" 326: 100%\n",
" 325: 100%\n",
"241\n",
" 327: 86.9756%\n",
" 326: 86.9756%\n",
" 325: 86.9756%\n",
"242\n",
" 327: 100%\n",
" 326: 100%\n",
" 325: 100%\n",
"243\n",
" 327: 100%\n",
" 326: 100%\n",
" 325: 100%\n",
"244\n",
" 328: 100%\n",
" 323: 100%\n",
" 238: 100%\n",
"245\n",
" 328: 87.3012%\n",
" 323: 87.3012%\n",
" 322: 87.3012%\n",
"246\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" 324: 81.7121%\n",
" 239: 81.7121%\n",
" 328: 80.3563%\n",
"247\n",
" 329: 100%\n",
" 328: 87.3012%\n",
" 323: 87.3012%\n",
"248\n",
" 328: 90.1288%\n",
" 323: 90.1288%\n",
" 244: 90.1288%\n",
"249\n",
" 305: 88.6889%\n",
"250\n",
" 263: 100%\n",
" 262: 100%\n",
" 261: 100%\n",
"251\n",
" 252: 99.4553%\n",
" 310: 89.1667%\n",
" 306: 88.674%\n",
"252\n",
" 251: 99.4553%\n",
" 310: 89.1835%\n",
" 306: 88.6884%\n",
"253\n",
" 263: 100%\n",
" 262: 100%\n",
" 261: 100%\n",
"254\n",
" 258: 100%\n",
" 257: 100%\n",
" 148: 85.921%\n",
"255\n",
" 263: 100%\n",
" 262: 100%\n",
" 261: 100%\n",
"256\n",
"257\n",
" 258: 100%\n",
" 254: 100%\n",
" 148: 85.921%\n",
"258\n",
" 257: 100%\n",
" 254: 100%\n",
" 148: 85.921%\n",
"259\n",
"260\n",
" 263: 100%\n",
" 262: 100%\n",
" 261: 100%\n",
"261\n",
" 263: 100%\n",
" 262: 100%\n",
" 260: 100%\n",
"262\n",
" 263: 100%\n",
" 261: 100%\n",
" 260: 100%\n",
"263\n",
" 262: 100%\n",
" 261: 100%\n",
" 260: 100%\n",
"264\n",
"265\n",
" 94: 94.7284%\n",
" 95: 94.7036%\n",
" 93: 94.6443%\n",
"266\n",
"267\n",
"268\n",
"269\n",
"270\n",
"271\n",
" 279: 95.3333%\n",
" 280: 91.9692%\n",
" 278: 91.939%\n",
"272\n",
" 277: 100%\n",
" 274: 100%\n",
"273\n",
" 276: 100%\n",
" 275: 100%\n",
"274\n",
" 277: 100%\n",
" 272: 100%\n",
"275\n",
" 276: 100%\n",
" 273: 100%\n",
"276\n",
" 275: 100%\n",
" 273: 100%\n",
"277\n",
" 274: 100%\n",
" 272: 100%\n",
"278\n",
" 279: 96.347%\n",
" 280: 95.1412%\n",
" 271: 91.939%\n",
"279\n",
" 280: 96.4245%\n",
" 278: 96.347%\n",
" 271: 95.3333%\n",
"280\n",
" 279: 96.4245%\n",
" 278: 95.1412%\n",
" 271: 91.9692%\n",
"281\n",
"282\n",
" 284: 100%\n",
" 283: 100%\n",
"283\n",
" 284: 100%\n",
" 282: 100%\n",
"284\n",
" 283: 100%\n",
" 282: 100%\n",
"285\n",
" 287: 100%\n",
" 286: 100%\n",
"286\n",
" 287: 100%\n",
" 285: 100%\n",
"287\n",
" 286: 100%\n",
" 285: 100%\n",
"288\n",
"289\n",
"290\n",
" 300: 96.6316%\n",
" 298: 96.6316%\n",
" 297: 96.6316%\n",
"291\n",
"292\n",
" 296: 88.4672%\n",
" 301: 87.9971%\n",
"293\n",
" 300: 100%\n",
" 298: 100%\n",
" 297: 100%\n",
"294\n",
" 304: 99.0602%\n",
"295\n",
"296\n",
" 301: 96.5442%\n",
" 292: 88.4672%\n",
"297\n",
" 300: 100%\n",
" 298: 100%\n",
" 293: 100%\n",
"298\n",
" 300: 100%\n",
" 297: 100%\n",
" 293: 100%\n",
"299\n",
"300\n",
" 298: 100%\n",
" 297: 100%\n",
" 293: 100%\n",
"301\n",
" 296: 96.5442%\n",
" 292: 87.9971%\n",
"302\n",
"303\n",
"304\n",
" 294: 99.0602%\n",
"305\n",
" 249: 88.6889%\n",
"306\n",
" 310: 99.4987%\n",
" 252: 88.6884%\n",
" 251: 88.674%\n",
"307\n",
" 311: 99.5235%\n",
" 153: 99.5235%\n",
" 152: 99.5235%\n",
"308\n",
" 259: 83.0272%\n",
"309\n",
" 311: 95.769%\n",
" 153: 95.769%\n",
" 152: 95.769%\n",
"310\n",
" 306: 99.4987%\n",
" 252: 89.1835%\n",
" 251: 89.1667%\n",
"311\n",
" 153: 100%\n",
" 152: 100%\n",
" 150: 100%\n",
"312\n",
"313\n",
"318\n",
" 327: 93.407%\n",
" 326: 93.407%\n",
" 325: 93.407%\n",
"319\n",
" 327: 100%\n",
" 326: 100%\n",
" 325: 100%\n",
"320\n",
"321\n",
" 234: 100%\n",
"322\n",
" 237: 100%\n",
" 328: 89.9406%\n",
" 323: 89.9406%\n",
"323\n",
" 328: 100%\n",
" 244: 100%\n",
" 238: 100%\n",
"324\n",
" 239: 100%\n",
" 328: 87.785%\n",
" 323: 87.785%\n",
"325\n",
" 327: 100%\n",
" 326: 100%\n",
" 319: 100%\n",
"326\n",
" 327: 100%\n",
" 325: 100%\n",
" 319: 100%\n",
"327\n",
" 326: 100%\n",
" 325: 100%\n",
" 319: 100%\n",
"328\n",
" 323: 100%\n",
" 244: 100%\n",
" 238: 100%\n",
"329\n",
" 247: 100%\n",
" 328: 87.3012%\n",
" 323: 87.3012%\n",
"330\n",
" 223: 93.6906%\n",
" 333: 91.8301%\n",
" 162: 90.2821%\n",
"331\n",
" 335: 100%\n",
" 334: 100%\n",
" 332: 100%\n",
"332\n",
" 335: 100%\n",
" 334: 100%\n",
" 331: 100%\n",
"333\n",
" 330: 91.8301%\n",
" 223: 91.4768%\n",
" 162: 86.5031%\n",
"334\n",
" 335: 100%\n",
" 332: 100%\n",
" 331: 100%\n",
"335\n",
" 334: 100%\n",
" 332: 100%\n",
" 331: 100%\n",
"338\n",
" 144: 88.8314%\n",
" 265: 83.4816%\n",
"339\n",
"340\n",
" 344: 100%\n",
" 343: 100%\n",
" 342: 100%\n",
"341\n",
" 151: 92.3166%\n",
"342\n",
" 344: 100%\n",
" 343: 100%\n",
" 340: 100%\n",
"343\n",
" 344: 100%\n",
" 342: 100%\n",
" 340: 100%\n",
"344\n",
" 343: 100%\n",
" 342: 100%\n",
" 340: 100%\n",
"345\n",
"346\n",
" 97: 89.6939%\n",
" 357: 89.6939%\n",
"347\n",
" 358: 98.5922%\n",
"348\n",
" 182: 96.149%\n",
" 175: 92.3989%\n",
" 172: 92.3989%\n",
"349\n",
"350\n",
" 351: 99.7158%\n",
" 184: 95.4506%\n",
" 183: 95.4506%\n",
"351\n",
" 350: 99.7158%\n",
" 184: 95.1673%\n",
" 183: 95.1673%\n",
"352\n",
"353\n",
" 356: 100%\n",
"354\n",
" 51: 100%\n",
" 355: 100%\n",
"355\n",
" 51: 100%\n",
" 354: 100%\n",
"356\n",
" 353: 100%\n",
"357\n",
" 97: 100%\n",
"358\n",
" 347: 98.5922%\n",
"1000\n",
" 1001: 100%\n",
" 1003: 92.3077%\n",
" 1002: 85.7143%\n",
"1001\n",
" 1000: 100%\n",
" 1003: 92.3077%\n",
" 1002: 85.7143%\n",
"1002\n",
" 1003: 93.3333%\n",
" 1001: 85.7143%\n",
" 1000: 85.7143%\n",
"1003\n",
" 1002: 93.3333%\n",
" 1001: 92.3077%\n",
" 1000: 92.3077%\n",
"1004\n",
" 1005: 80%\n",
"1005\n",
" 1003: 85.7143%\n",
" 1004: 80%\n",
" 1002: 80%\n"
]
}
],
"source": [
"import difflib\n",
"from heapq import nlargest\n",
"\n",
"rcodes = [str(rc) for rc in sorted([int(rc) for rc in rules.keys()])]\n",
"hashed_rules = {rc: hashrule(rc) for rc in rcodes}\n",
"seqm = difflib.SequenceMatcher()\n",
"similarities = {}\n",
"\n",
"n, cutoff = 3, 0.8\n",
"for rcode2 in rcodes:\n",
" print(rcode2)\n",
" sims = []\n",
" seqm.set_seq2(hashed_rules[rcode2])\n",
" for rcode1 in [rc for rc in rcodes if rc != rcode2]:\n",
" seqm.set_seq1(hashed_rules[rcode1])\n",
" if (seqm.real_quick_ratio() >= cutoff and \n",
" seqm.quick_ratio() >= cutoff and \n",
" seqm.ratio() >= cutoff):\n",
" sims.append((seqm.ratio(), rcode1))\n",
" sims = similarities[rcode2] = nlargest(n, sims)\n",
" for sim in sims:\n",
" print(' {}: {:g}%'.format(sim[1], 100 * sim[0]))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So, rule 358 (the last) is 'similar' to rule 347. Just how similar are they?"
]
},
{
"cell_type": "code",
"execution_count": 509,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T12:52:24.636564Z",
"start_time": "2017-11-17T12:52:24.517046Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ratio: 98.59%\n",
"\u001b[31m--- 347\n",
"\u001b[0m\n",
"\u001b[32m+++ 358\n",
"\u001b[0m\n",
"\u001b[36m@@ -1,4 +1,3 @@\n",
"\u001b[0m\n",
"\u001b[31m-0 xe .\u001b[0m\n",
" rri rregatik rri\n",
" rri rrek rri\n",
" rri rrekiko/243 rri\n",
"\u001b[36m@@ -97,21 +96,18 @@\n",
"\u001b[0m\n",
" rri rretakotatik rri\n",
" rri rretakotik rri\n",
" rri rretakotzat rri\n",
"\u001b[31m-rri rretakoxe rri\u001b[0m\n",
" rri rretakoz rri\n",
" rri rretan rri\n",
" rri rretara rri\n",
" rri rretarago/243 rri\n",
" rri rretaraino rri\n",
" rri rretarainoko/243 rri\n",
"\u001b[31m-rri rretarainoxe rri\u001b[0m\n",
" rri rretarako/243 rri\n",
" rri rretarantz rri\n",
" rri rretarantzago/243 rri\n",
" rri rretarantzegi rri\n",
" rri rretaranzko/243 rri\n",
" rri rretarat rri\n",
"\u001b[31m-rri rretaraxe rri\u001b[0m\n",
" rri rretarik rri\n",
" rri rretariko/243 rri\n",
" rri rretatik rri\n"
]
}
],
"source": [
"import difflib\n",
"\n",
"\n",
"def colorize_diff(diff_lines):\n",
" return [{'+': '\\x1b[32m', '-': '\\x1b[31m', '@': '\\x1b[36m'}.get(l[0], '') + l +\n",
" ('\\x1b[0m' if l[0] in '+-@' else '') for l in diff_lines]\n",
"\n",
"def compare_rules(rc1, rc2):\n",
" rc1, rc2 = str(rc1), str(rc2)\n",
" h1, h2 = hashrule(rc1), hashrule(rc2)\n",
" ratio = 100*difflib.SequenceMatcher(a=h1, b=h2).ratio()\n",
" return '\\n'.join(['ratio: {:.2f}%'.format(ratio)] + colorize_diff(\n",
" difflib.unified_diff(\n",
" h1.splitlines(), h2.splitlines(), fromfile=rc1, tofile=rc2)))\n",
"\n",
"print(compare_rules('347', '358'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"so essentially, 358 is just 347, plus an extra 4 lines.\n",
"\n",
"It seems a bit pointless to keep all the duplicates, especially since 358 applies to only a single `.dic` entry"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"what about rules `13` & `22`?"
]
},
{
"cell_type": "code",
"execution_count": 513,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T13:12:41.981636Z",
"start_time": "2017-11-17T13:12:41.952435Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(743, 742)"
]
},
"execution_count": 513,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(rules['13']), len(rules['22'])"
]
},
{
"cell_type": "code",
"execution_count": 510,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-17T12:52:39.444677Z",
"start_time": "2017-11-17T12:52:39.290168Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ratio: 99.95%\n",
"\u001b[31m--- 13\n",
"\u001b[0m\n",
"\u001b[32m+++ 22\n",
"\u001b[0m\n",
"\u001b[36m@@ -1,4 +1,3 @@\n",
"\u001b[0m\n",
"\u001b[31m-0 etara .\u001b[0m\n",
" 0 tu .\n",
" 0 tua .\n",
" 0 tuagan .\n"
]
}
],
"source": [
"print(compare_rules('13', '22'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"again, pretty similar 😒"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:jup]",
"language": "python",
"name": "conda-env-jup-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"toc_cell": false,
"toc_position": {},
"toc_section_display": "block",
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment