davidmezzetti/txtai-textractor.py

## txtai-textractor.py
from txtai.pipeline import Textractor

# Create a Textractor pipeline that uses the Docling backend and
# returns the extracted text split into sections
textractor = Textractor(
    backend="docling",
    sections=True,
)

# Example 1: the BERT paper, fetched as a PDF from arXiv
textractor("https://arxiv.org/pdf/1810.04805")

# The PDF is converted to Markdown and split on Markdown sections.
# Sample output (truncated):
# ['## BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding...
#  '## Abstract\nWe introduce a new language representation model called BERT...
#  "## 1 Introduction\nLanguage model pre-training has been shown to be effective...
#  '## 2 Related Work\nThere is a long history of pre-training general language representations...
#  ...
# ]

# Example 2: a website (the txtai GitHub repository page)
textractor("https://github.com/neuml/txtai")

# The HTML is converted to Markdown and split by sections.
# Sample output (truncated):
# ['**GitHub - neuml/txtai: 💡 All-in-one open-source AI framework for semantic search...
#  '**All-in-one AI framework** \ntxtai is an all-in-one AI framework for semantic search...
#  '## Why txtai?\nNew vector databases, LLM frameworks and everything in between are sprouting...
#  '## Use Cases\nThe following sections introduce common txtai use cases. A comprehensive set of...'
#  ...
# ]
	from txtai.pipeline import Textractor

	# Set up text extraction with the Docling backend; results are split by section
	textractor = Textractor(backend="docling", sections=True)

	# Input 1: BERT paper (PDF on arXiv)
	textractor("https://arxiv.org/pdf/1810.04805")

	# PDF is converted to Markdown and split on Markdown sections.
	# Example result (truncated):
	# ['## BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding...
	#  '## Abstract\nWe introduce a new language representation model called BERT...
	#  "## 1 Introduction\nLanguage model pre-training has been shown to be effective...
	#  '## 2 Related Work\nThere is a long history of pre-training general language representations...
	#  ...
	# ]

	# Input 2: website (txtai GitHub repository page)
	textractor("https://github.com/neuml/txtai")

	# HTML is converted to Markdown and split by sections.
	# Example result (truncated):
	# ['**GitHub - neuml/txtai: 💡 All-in-one open-source AI framework for semantic search...
	#  '**All-in-one AI framework** \ntxtai is an all-in-one AI framework for semantic search...
	#  '## Why txtai?\nNew vector databases, LLM frameworks and everything in between are sprouting...
	#  '## Use Cases\nThe following sections introduce common txtai use cases. A comprehensive set of...'
	#  ...
	# ]
No results found