Skip to content

Instantly share code, notes, and snippets.

@y-mitsui
Created June 11, 2018 11:45
Show Gist options
  • Select an option

  • Save y-mitsui/6e04fd5acc00b45950f781c67e571ba0 to your computer and use it in GitHub Desktop.

Select an option

Save y-mitsui/6e04fd5acc00b45950f781c67e571ba0 to your computer and use it in GitHub Desktop.
String Kernel in Python
"""
String Kernel
Reference: "5.構造化データに対するカーネル" 福水健次
http://www.ism.ac.jp/~fukumizu/ISM_lecture_2010/Kernel_5_structured.pdf
"""
import numpy as np
def subSequenceJustLength(result, str, cur_str, length, depth=0):
if depth == length:
result.append(cur_str)
return
for i in range(len(str)):
cur_str += str[i]
subSequenceJustLength(result, str[i + 1:], cur_str, length, depth + 1)
cur_str = cur_str[:-1]
def getsubSequence(str):
result = []
for i in range(len(str) + 1):
subSequenceJustLength(result, str, "", i)
return result
def unique(items):
li_uniq = []
for x in items:
if x not in li_uniq:
li_uniq.append(x)
return li_uniq
def getCountSeq(seq):
count_seq = {}
for x in seq:
if x not in count_seq:
count_seq[x] = 0
count_seq[x] += 1
return count_seq
def stringKernel(str_x, str_y):
seq_x = getsubSequence(str_x)
seq_y = getsubSequence(str_y)
seq_all = unique(seq_x + seq_y)
count_seq_x = getCountSeq(seq_x)
count_seq_y = getCountSeq(seq_y)
similar = 0.
for sub_seq in seq_all:
if sub_seq in count_seq_x and sub_seq in count_seq_y:
x_count = count_seq_x[sub_seq]
y_count = count_seq_y[sub_seq]
similar += x_count * y_count
return similar
def gramMatrix(sample, kernel):
n_sample = len(sample)
gram = np.zeros((n_sample, n_sample))
for i in range(n_sample):
for j in range(i, n_sample):
gram[i, j] = kernel(sample[i], sample[j])
gram[j, i] = gram[i, j]
return gram
sample = ["今日のご飯はカレーだ", "今日の修行は打ち水だ", "今日はご飯のカレーだった"]
print(gramMatrix(sample, stringKernel))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment