Yuichi Tateno (secon) hotchpotch

## qdrant_qat_report.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                hotchpotch
                / qdrant_qat_report.md
            
            
              Created
              February 17, 2026 01:17
            
              
                Investigation of the QAT-related implementation in Qdrant
              
          
    Qdrant Quantization Research Report (int8 / binary)


Repository: qdrant/qdrant
Commit (SHA1): bdd4bb5180f4a4fb378dd3dedf5c307e8a8b74e5
Scope: Implementation-level research on scalar int8 and binary quantization, with search-performance tuning guidance for float32 / float16 / uint8 vector storage.

1. Executive Summary

Qdrant implements two different approximate-vector compression families relevant to this request:

  
## pdf2ja.py
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.12,<3.13"
# dependencies = [
#   "plamo-translate-cli",
#   "pdf2zh-next",
# ]
# ///
# NOTE: PDFMathTranslate-next is published on PyPI as pdf2zh-next.
# License: MIT

## nano_beir_ja_eval_cli.py
#!/usr/bin/env python3
"""Run NanoBEIR-ja evaluation for a SentenceTransformer model (CLI).

Usage example (ndcg@10 only by default):
  uv run scripts/nano_beir_ja_eval_cli.py \
    --model-path cl-nagoya/ruri-v3-30m \
    --batch-size 512 --autocast-dtype bf16 \
    --output output/nano_beir_ja_eval_ruri-v3-30m.json

Use --all-metrics to emit the full metric set.

## pdf2translate.py
#!/usr/bin/env python3

"""Translate recent PDFs in Downloads using pdf2zh_next via uvx.

Defaults:
- Looks for PDFs downloaded within the last day.
- Saves outputs to ~/Downloads/pdf2translated/{filename}.translated.pdf
- Skips files that already have a translated output.
- Uses pdf2zh_next with Google Translate, Japanese output, no watermark, and
  alternating bilingual pages.

## cross_encoder_to_onnx_pr.py
from sentence_transformers import CrossEncoder, export_dynamic_quantized_onnx_model, export_optimized_onnx_model

# モデル名の定義
MODEL_NAME = "hotchpotch/japanese-reranker-xsmall-v2"

# 基本モデルの読み込み（CPUを使用、ONNXバックエンド）
model = CrossEncoder(MODEL_NAME, device="cpu", backend="onnx")

# 1. 基本モデル (model.onnx)
# Hubにプッシュして、必要に応じてPRを作成

## query-crafter-japanese-example.py
"""
query-crafter-japanese のサンプルコード。
実際に大量に処理するときは、vllm などを利用することで、高速処理が可能
"""

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "hotchpotch/query-crafter-japanese-Qwen3-1.7B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

## xlm_roberta_embeddings_convert.rb
from transformers import AutoModel, AutoTokenizer
import torch
from tqdm import tqdm
import numpy as np


def adapt_model_to_new_tokenizer(model_name, new_tokenizer_name):
    # 元のモデルとトークナイザーをロード
    original_model = AutoModel.from_pretrained(model_name)
    original_tokenizer = AutoTokenizer.from_pretrained(model_name)

## spm_train_jp_tokenizer_xlm_roberta.py
# %%
from datasets import load_dataset

dataset = load_dataset("hpprc/jawiki-paragraphs", split="train")

# %%
len(dataset)

# %%
# head N

## onnx_to_fp16.py
"""
This script converts an ONNX model to float16 precision using the onnxruntime transformers package.
It takes an input ONNX model file as a mandatory argument. The output file name is optional; if not provided,
the script generates the output file name by appending "_fp16" to the base name of the input file.
"""

import argparse
import onnx
from onnxruntime.transformers.float16 import convert_float_to_float16
import os

## bench_gpu_sift1m_ivf_hnsw.py
# base: https://github.com/facebookresearch/faiss/blob/main/benchs/bench_gpu_sift1m.py
# base code License: MIT License
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import time
	#!/usr/bin/env python3
	# /// script
	# requires-python = ">=3.12,<3.13"
	# dependencies = [
	# "plamo-translate-cli",
	# "pdf2zh-next",
	# ]
	# ///
	# NOTE: PDFMathTranslate-next is published on PyPI as pdf2zh-next.
	# License: MIT
	#!/usr/bin/env python3
	"""Run NanoBEIR-ja evaluation for a SentenceTransformer model (CLI).

	Usage example (ndcg@10 only by default):
	uv run scripts/nano_beir_ja_eval_cli.py \
	--model-path cl-nagoya/ruri-v3-30m \
	--batch-size 512 --autocast-dtype bf16 \
	--output output/nano_beir_ja_eval_ruri-v3-30m.json

	Use --all-metrics to emit the full metric set.
	#!/usr/bin/env python3

	"""Translate recent PDFs in Downloads using pdf2zh_next via uvx.

	Defaults:
	- Looks for PDFs downloaded within the last day.
	- Saves outputs to ~/Downloads/pdf2translated/{filename}.translated.pdf
	- Skips files that already have a translated output.
	- Uses pdf2zh_next with Google Translate, Japanese output, no watermark, and
	alternating bilingual pages.
	from sentence_transformers import CrossEncoder, export_dynamic_quantized_onnx_model, export_optimized_onnx_model

	# モデル名の定義
	MODEL_NAME = "hotchpotch/japanese-reranker-xsmall-v2"

	# 基本モデルの読み込み（CPUを使用、ONNXバックエンド）
	model = CrossEncoder(MODEL_NAME, device="cpu", backend="onnx")

	# 1. 基本モデル (model.onnx)
	# Hubにプッシュして、必要に応じてPRを作成
	"""
	query-crafter-japanese のサンプルコード。
	実際に大量に処理するときは、vllm などを利用することで、高速処理が可能
	"""

	from transformers import AutoModelForCausalLM, AutoTokenizer

	model_name = "hotchpotch/query-crafter-japanese-Qwen3-1.7B"

	tokenizer = AutoTokenizer.from_pretrained(model_name)
	from transformers import AutoModel, AutoTokenizer
	import torch
	from tqdm import tqdm
	import numpy as np


	def adapt_model_to_new_tokenizer(model_name, new_tokenizer_name):
	# 元のモデルとトークナイザーをロード
	original_model = AutoModel.from_pretrained(model_name)
	original_tokenizer = AutoTokenizer.from_pretrained(model_name)
	# %%
	from datasets import load_dataset

	dataset = load_dataset("hpprc/jawiki-paragraphs", split="train")

	# %%
	len(dataset)

	# %%
	# head N
	"""
	This script converts an ONNX model to float16 precision using the onnxruntime transformers package.
	It takes an input ONNX model file as a mandatory argument. The output file name is optional; if not provided,
	the script generates the output file name by appending "_fp16" to the base name of the input file.
	"""

	import argparse
	import onnx
	from onnxruntime.transformers.float16 import convert_float_to_float16
	import os
	# base: https://github.com/facebookresearch/faiss/blob/main/benchs/bench_gpu_sift1m.py
	# base code License: MIT License
	#
	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import os
	import time