z0z0r4/dat.py

## dat.py
import random

class DAT:
    default_size: int = 4096
    data_set_size = 126
    base: list[int]
    check: list[int]

    def __init__(self):
        self.base = [0] * self.default_size
        self.check = [0] * self.default_size
        self.base[1] = 1

    def extend(self, size: int):
        self.base += [0] * (size - len(self.base))
        self.check += [0] * (size - len(self.check))

    def search(self, s: str) -> bool:
        p = 1
        for c in s:
            t = abs(self.base[p]) + ord(c)
            if t >= len(self.check):
                return False
            if self.check[t] != p:
                return False
            p = t
        else:
            # check if it's a valid end of word
            return self.base[p] < 0

    def find_empty(self, index: int, children: list[int]) -> int:
        while True:
            index += 1
            if index >= len(self.check):
                self.extend(len(self.base) * 2)
            if self.check[index] == 0:
                for child in children:
                    while index + child >= len(self.check):
                        self.extend(len(self.base) * 2)
                    if self.check[index + child] != 0:
                        break
                else:
                    return index

    def relocate(self, parent: int, children: list[int]):
        old_base = abs(self.base[parent])
        sign = -1 if self.base[parent] < 0 else 1
        new_base = self.find_empty(old_base, children)

        for child in children:
            old_pos = old_base + child
            if old_pos >= len(self.check) or self.check[old_pos] != parent:
                continue

            new_pos = new_base + child
            while new_pos >= len(self.check):
                self.extend(len(self.base) * 2)

            self.base[new_pos] = self.base[old_pos]
            self.check[new_pos] = parent

            child_start = abs(self.base[old_pos])
            child_end = min(child_start + self.data_set_size, len(self.check))
            for i in range(child_start, child_end):
                if self.check[i] == old_pos:
                    self.check[i] = new_pos

            self.check[old_pos] = 0
            self.base[old_pos] = 0

        self.base[parent] = sign * new_base
        return new_base

    def add(self, data: str):
        p = 1
        for c in data:
            code = ord(c)
            next_index = abs(self.base[p]) + code
            while next_index >= len(self.base):
                self.extend(len(self.base) * 2)

            if self.check[next_index] == 0:
                self.base[next_index] = self.find_empty(next_index, [0])
                self.check[next_index] = p
                p = next_index
            elif self.check[next_index] != p:
                # conflict
                children = []
                for i in range(min(self.data_set_size, len(self.check) - abs(self.base[p]))):
                    if self.check[abs(self.base[p]) + i] == p:
                        children.append(i)
                children.append(code)
                self.relocate(p, children=children)

                next_index = abs(self.base[p]) + code
                while next_index >= len(self.base):
                    self.extend(len(self.base) * 2)
                self.base[next_index] = self.find_empty(next_index, [0])
                self.check[next_index] = p
                p = next_index
            else:  # already exist
                p = next_index
        self.base[p] *= -1  # mark end of word

    def print_base_and_check(self):
        # ignore empty value
        for i in range(len(self.base)):
            if self.base[i] != 0 or self.check[i] != 0:
                print(f"Index: {i}, Base: {self.base[i]}, Check: {self.check[i]}")

if __name__ == "__main__":
    dat = DAT()
    # https://www.mit.edu/~ecprice/wordlist.10000
    with open("wordlist.10000.txt", "r", encoding="utf-8") as f:
        words = [line[:-1] for line in f.readlines()]
    random.seed(42)
    random.shuffle(words)
    for word in words:
        dat.add(word)

    # dat.print_base_and_check()
    # print("===" * 10)
    for word in words:
        # print(f"Searching for '{word}': {dat.search(word)}")
        assert dat.search(word) == True, f"Failed to find '{word}'"

## opt_dat.py
import random

class DAT:
    default_size: int = 4096
    data_set_size = 126
    base: list[int]
    check: list[int]
    free_head: int
    free_tail: int

    def __init__(self):
        self.base = [0] * self.default_size
        self.check = [0] * self.default_size
        self.base[1] = 1
        self.free_head = -1
        self.free_tail = -1
        self._init_free_list(2, self.default_size)

    def _init_free_list(self, start: int, end: int):
        if start >= end:
            return
        self.free_head = start
        self.free_tail = end - 1
        for i in range(start, end):
            prev = i - 1 if i > start else -1
            nxt = i + 1 if i + 1 < end else -1
            self.base[i] = prev
            self.check[i] = -(nxt + 1)

    def _is_free(self, index: int) -> bool:
        return self.check[index] < 0

    def _free_next(self, index: int) -> int:
        return -self.check[index] - 1

    def _set_free_links(self, index: int, prev: int, nxt: int):
        self.base[index] = prev
        self.check[index] = -(nxt + 1)

    def _remove_free(self, index: int):
        prev = self.base[index]
        nxt = self._free_next(index)

        if prev != -1:
            self.check[prev] = -(nxt + 1)
        else:
            self.free_head = nxt

        if nxt != -1:
            self.base[nxt] = prev
        else:
            self.free_tail = prev

        self.base[index] = 0
        self.check[index] = 0

    def _insert_free_head(self, index: int):
        old_head = self.free_head
        self._set_free_links(index, -1, old_head)
        if old_head != -1:
            self.base[old_head] = index
        else:
            self.free_tail = index
        self.free_head = index

    def _allocate_slot(self, index: int):
        if not self._is_free(index):
            raise Exception(f"Index {index} is not free")
        self._remove_free(index)

    def extend(self, size: int):
        old_size = len(self.base)
        if size <= old_size:
            return

        self.base += [0] * (size - old_size)
        self.check += [0] * (size - old_size)

        start = max(2, old_size)
        end = size
        if start >= end:
            return

        for i in range(start, end):
            prev = i - 1 if i > start else self.free_tail
            nxt = i + 1 if i + 1 < end else -1
            self.base[i] = prev
            self.check[i] = -(nxt + 1)

        if self.free_tail != -1:
            self.check[self.free_tail] = -(start + 1)
        else:
            self.free_head = start
        self.free_tail = end - 1

    def search(self, s: str) -> bool:
        p = 1
        for c in s:
            t = abs(self.base[p]) + ord(c)
            if t >= len(self.check):
                return False
            if self.check[t] != p:
                return False
            p = t
        else:
            # check if it's a valid end of word
            return self.base[p] < 0

    def find_empty(self, children: list[int]) -> int:
        offsets = sorted(set(children))
        max_offset = offsets[-1] if offsets else 0

        while True:
            while max_offset >= len(self.check):
                self.extend(len(self.base) * 2)

            cur = self.free_head
            while cur != -1:
                for child in offsets:
                    candidate = cur + child
                    if candidate >= len(self.check) or not self._is_free(candidate):
                        break
                else:
                    return cur
                cur = self._free_next(cur)

            self.extend(len(self.base) * 2)

    def relocate(self, parent: int, children: list[int]):
        old_base = abs(self.base[parent])
        sign = -1 if self.base[parent] < 0 else 1
        unique_children = sorted(set(children))
        new_base = self.find_empty(unique_children)

        for child in unique_children:
            old_pos = old_base + child
            if old_pos >= len(self.check) or self.check[old_pos] != parent:
                continue

            new_pos = new_base + child
            while new_pos >= len(self.check):
                self.extend(len(self.base) * 2)

            moved_base = self.base[old_pos]
            self._allocate_slot(new_pos)
            self.base[new_pos] = moved_base
            self.check[new_pos] = parent

            child_start = abs(moved_base)
            child_end = min(child_start + self.data_set_size, len(self.check))
            for i in range(child_start, child_end):
                if self.check[i] == old_pos:
                    self.check[i] = new_pos

            self._insert_free_head(old_pos)

        self.base[parent] = sign * new_base
        return new_base

    def add(self, data: str):
        p = 1
        for c in data:
            code = ord(c)
            next_index = abs(self.base[p]) + code
            while next_index >= len(self.base):
                self.extend(len(self.base) * 2)

            if self._is_free(next_index):
                self._allocate_slot(next_index)
                self.base[next_index] = self.find_empty([0])
                self.check[next_index] = p
                p = next_index
            elif self.check[next_index] != p:
                # conflict
                children = []
                for i in range(min(self.data_set_size, len(self.check) - abs(self.base[p]))):
                    if self.check[abs(self.base[p]) + i] == p:
                        children.append(i)
                children.append(code)
                self.relocate(p, children=children)

                next_index = abs(self.base[p]) + code
                while next_index >= len(self.base):
                    self.extend(len(self.base) * 2)
                self._allocate_slot(next_index)
                self.base[next_index] = self.find_empty([0])
                self.check[next_index] = p
                p = next_index
            else:  # already exist
                p = next_index
        self.base[p] *= -1  # mark end of word

    def print_base_and_check(self):
        # ignore empty value
        for i in range(len(self.base)):
            if not self._is_free(i) and (self.base[i] != 0 or self.check[i] != 0):
                print(f"Index: {i}, Base: {self.base[i]}, Check: {self.check[i]}")

if __name__ == "__main__":
    dat = DAT()
    # https://www.mit.edu/~ecprice/wordlist.10000
    with open("wordlist.10000.txt", "r", encoding="utf-8") as f:
        words = [line[:-1] for line in f.readlines()]
    random.seed(42)
    random.shuffle(words)
    for word in words:
        dat.add(word)

    # dat.print_base_and_check()
    # print("===" * 10)
    for word in words:
        # print(f"Searching for '{word}': {dat.search(word)}")
        assert dat.search(word) == True, f"Failed to find '{word}'"
	import random

	class DAT:
	default_size: int = 4096
	data_set_size = 126
	base: list[int]
	check: list[int]

	def __init__(self):
	self.base = [0] * self.default_size
	self.check = [0] * self.default_size
	self.base[1] = 1

	def extend(self, size: int):
	self.base += [0] * (size - len(self.base))
	self.check += [0] * (size - len(self.check))

	def search(self, s: str) -> bool:
	p = 1
	for c in s:
	t = abs(self.base[p]) + ord(c)
	if t >= len(self.check):
	return False
	if self.check[t] != p:
	return False
	p = t
	else:
	# check if it's a valid end of word
	return self.base[p] < 0

	def find_empty(self, index: int, children: list[int]) -> int:
	while True:
	index += 1
	if index >= len(self.check):
	self.extend(len(self.base) * 2)
	if self.check[index] == 0:
	for child in children:
	while index + child >= len(self.check):
	self.extend(len(self.base) * 2)
	if self.check[index + child] != 0:
	break
	else:
	return index

	def relocate(self, parent: int, children: list[int]):
	old_base = abs(self.base[parent])
	sign = -1 if self.base[parent] < 0 else 1
	new_base = self.find_empty(old_base, children)

	for child in children:
	old_pos = old_base + child
	if old_pos >= len(self.check) or self.check[old_pos] != parent:
	continue

	new_pos = new_base + child
	while new_pos >= len(self.check):
	self.extend(len(self.base) * 2)

	self.base[new_pos] = self.base[old_pos]
	self.check[new_pos] = parent

	child_start = abs(self.base[old_pos])
	child_end = min(child_start + self.data_set_size, len(self.check))
	for i in range(child_start, child_end):
	if self.check[i] == old_pos:
	self.check[i] = new_pos

	self.check[old_pos] = 0
	self.base[old_pos] = 0

	self.base[parent] = sign * new_base
	return new_base

	def add(self, data: str):
	p = 1
	for c in data:
	code = ord(c)
	next_index = abs(self.base[p]) + code
	while next_index >= len(self.base):
	self.extend(len(self.base) * 2)

	if self.check[next_index] == 0:
	self.base[next_index] = self.find_empty(next_index, [0])
	self.check[next_index] = p
	p = next_index
	elif self.check[next_index] != p:
	# conflict
	children = []
	for i in range(min(self.data_set_size, len(self.check) - abs(self.base[p]))):
	if self.check[abs(self.base[p]) + i] == p:
	children.append(i)
	children.append(code)
	self.relocate(p, children=children)

	next_index = abs(self.base[p]) + code
	while next_index >= len(self.base):
	self.extend(len(self.base) * 2)
	self.base[next_index] = self.find_empty(next_index, [0])
	self.check[next_index] = p
	p = next_index
	else: # already exist
	p = next_index
	self.base[p] *= -1 # mark end of word

	def print_base_and_check(self):
	# ignore empty value
	for i in range(len(self.base)):
	if self.base[i] != 0 or self.check[i] != 0:
	print(f"Index: {i}, Base: {self.base[i]}, Check: {self.check[i]}")

	if __name__ == "__main__":
	dat = DAT()
	# https://www.mit.edu/~ecprice/wordlist.10000
	with open("wordlist.10000.txt", "r", encoding="utf-8") as f:
	words = [line[:-1] for line in f.readlines()]
	random.seed(42)
	random.shuffle(words)
	for word in words:
	dat.add(word)

	# dat.print_base_and_check()
	# print("===" * 10)
	for word in words:
	# print(f"Searching for '{word}': {dat.search(word)}")
	assert dat.search(word) == True, f"Failed to find '{word}'"
No results found