David Mezzetti davidmezzetti

## txtai-paper.py
from txtai import Agent

# Tool names, in order:
#   websearch - runs a websearch using default engine
#   webview   - loads a web page
tools = ["websearch", "webview"]

# Model path/id for the LLM
model = "Qwen/Qwen3-4B-Instruct-2507"

## gguf-vectors.py
import numpy as np
from txtai.ann import ANNFactory

# Index 10M vectors using llama.cpp style quants
ann = None
for _ in range(1000):
  # Generate batch of vectors
  batch = np.random.rand(10000, 768).astype(np.float32)

  if not ann:

## rag-quickstart.py
# RAG Quick Start
# An easy way to get started with RAG using YOUR data
#
# For a complete application see this: https://github.com/neuml/rag
#
# TxtAI has 70+ example notebooks covering everything the framework provides
# Examples: https://neuml.github.io/txtai/examples
#
# Install TxtAI
# pip install txtai[pipeline-data]

## txtai-audio.py
import soundfile as sf

from txtai.pipeline import TextToAudio

# Build the text-to-audio pipeline (Note that model is CC-BY-NC)
tta = TextToAudio("facebook/musicgen-stereo-medium")

# Run the pipeline, the result unpacks into audio data and a rate
prompt = "Happy 80s rock and synth for a fun startup company"
speech, rate = tta(prompt)

# Save the transposed audio data to a WAV file
sf.write("out.wav", speech.T, rate)

## txtai-tts.py
import soundfile as sf

from txtai.pipeline import TextToSpeech

# Build pipeline
tts = TextToSpeech("neuml/kokoro-int8-onnx")

# Generate speech
speech, rate = tts(
    """Have you ever considered having a snooty British accent?

## txtai-textractor-image.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                davidmezzetti
                / txtai-textractor-image.md
            
            
              Created
              November 13, 2025 14:48
            
          
    Image to parse


from txtai.pipeline import Textractor

# Docling backend, with a custom user-agent header handed to the pipeline
request_headers = {"user-agent": "Mozilla/5.0"}
textractor = Textractor(backend="docling", headers=request_headers)

# Image to parse
image_url = "https://miro.medium.com/v2/resize:fit:720/format:webp/1*HHPVwIrcxYcLRvjDpwLQyQ.png"
textractor(image_url)

  
## webrag.py
from smolagents import WebSearchTool
from txtai import LLM

def webrag(query):
    """
    Builds a prompt that asks for an answer to query using only the supplied context.

    The context is whatever search(query) returns, interpolated into the prompt text.

    NOTE(review): this snippet appears truncated. The prompt is built but not used or
    returned in the visible code, and search is not defined in the visible code;
    presumably it wraps the imported WebSearchTool - confirm against the full source.

    Args:
        query: question to answer, also passed to search
    """

    prompt = f"""
Answer the following question using ONLY the context below.

Query: {query}
Context: {search(query)}
"""

## txtai-textractor.py
from txtai.pipeline import Textractor

# Textractor pipeline using the docling backend with section splitting enabled
textractor = Textractor(backend="docling", sections=True)

# BERT Paper
paper_url = "https://arxiv.org/pdf/1810.04805"
textractor(paper_url)

# PDF converted to Markdown, split on Markdown sections
# ['## BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding...

## text-classify.py
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from txtai.pipeline import HFTrainer

def metrics(pred):
    """
    Computes accuracy for a set of predictions.

    Args:
        pred: object exposing label_ids (expected labels) and predictions
              (scores; argmax is taken over the last axis to pick a label)

    Returns:
        dict with a single "accuracy" entry
    """

    expected = pred.label_ids

    # Predicted label is the index of the highest score along the last axis
    predicted = pred.predictions.argmax(-1)

    return {"accuracy": accuracy_score(expected, predicted)}

## web2gguf.py
from txtai import Embeddings
from txtai.pipeline import Textractor

# Page to extract and index
url = "https://github.com/neuml/txtai"

# Textractor configured with the semantic chunker
textractor = Textractor(chunker="semantic")

# Embeddings index using the ggml backend with q4_0 quantization
embeddings = Embeddings(backend="ggml", ggml={"quantize": "q4_0"})

# Index each extracted chunk as a (url, chunk) pair, then save to the "gguf" path
embeddings.index((url, x) for x in textractor(url))
embeddings.save("gguf")
	from txtai import Agent

	# Define tools
	tools = [
	"websearch", # Runs a websearch using default engine
	"webview", # Loads a web page
	]

	# Define LLM
	model = "Qwen/Qwen3-4B-Instruct-2507"
	import numpy as np
	from txtai.ann import ANNFactory

	# Index 10M vectors using llama.cpp style quants
	ann = None
	for _ in range(1000):
	# Generate batch of vectors
	batch = np.random.rand(10000, 768).astype(np.float32)

	if not ann:
	# RAG Quick Start
	# Easy to use way to get started with RAG using YOUR data
	#
	# For a complete application see this: https://github.com/neuml/rag
	#
	# TxtAI has 70+ example notebooks covering everything the framework provides
	# Examples: https://neuml.github.io/txtai/examples
	#
	# Install TxtAI
	# pip install txtai[pipeline-data]
	import soundfile as sf

	from txtai.pipeline import TextToAudio

	# Create and run pipeline (Note that model is CC-BY-NC)
	tta = TextToAudio("facebook/musicgen-stereo-medium")
	speech, rate = tta("Happy 80s rock and synth for a fun startup company")

	# Write to file
	sf.write("out.wav", speech.T, rate)
	import soundfile as sf

	from txtai.pipeline import TextToSpeech

	# Build pipeline
	tts = TextToSpeech("neuml/kokoro-int8-onnx")

	# Generate speech
	speech, rate = tts(
	"""Have you ever considered having a snooty British accent?
	from smolagents import WebSearchTool
	from txtai import LLM

	def webrag(query):
	prompt = f"""
	Answer the following question using ONLY the context below.

	Query: {query}
	Context: {search(query)}
	"""
	from txtai.pipeline import Textractor

	# Docling backend, split text by sections
	textractor = Textractor(sections=True, backend="docling")

	# BERT Paper
	textractor("https://arxiv.org/pdf/1810.04805")

	# PDF converted to Markdown, split on Markdown sections
	# ['## BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding...
	from datasets import load_dataset
	from sklearn.metrics import accuracy_score
	from transformers import AutoModelForSequenceClassification, AutoTokenizer
	from txtai.pipeline import HFTrainer

	def metrics(pred):
		"""
		Computes accuracy for a set of predictions.

		Args:
			pred: object exposing label_ids (expected labels) and predictions
			      (scores; argmax is taken over the last axis to pick a label)

		Returns:
			dict with a single "accuracy" entry
		"""

		labels, preds = pred.label_ids, pred.predictions.argmax(-1)

		# Calculate accuracy
		return {"accuracy": accuracy_score(labels, preds)}
	from txtai import Embeddings
	from txtai.pipeline import Textractor

	# Page to extract and index
	url = "https://github.com/neuml/txtai"

	# Textractor configured with the semantic chunker
	textractor = Textractor(chunker="semantic")

	# Embeddings index using the ggml backend with q4_0 quantization
	embeddings = Embeddings(backend="ggml", ggml={"quantize": "q4_0"})

	# Index each extracted chunk as a (url, chunk) pair, then save to the "gguf" path
	embeddings.index((url, x) for x in textractor(url))
	embeddings.save("gguf")