-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvector_search.py
More file actions
126 lines (104 loc) · 5.77 KB
/
vector_search.py
File metadata and controls
126 lines (104 loc) · 5.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""Vector Search Example — semantic similarity search with embeddings.
Demonstrates vector search using laurus's built-in CandleBert embedder:
- Basic vector search (semantic similarity)
- Filtered vector search (with lexical filters)
The embedder is registered in the schema and laurus automatically converts
text to vectors at index and query time — no external embedding library needed.
Run with:
maturin develop --features embeddings-candle
python examples/vector_search.py
"""
from __future__ import annotations
import laurus
# ---------------------------------------------------------------------------
# Embedder configuration
# ---------------------------------------------------------------------------
# Name under which main() registers the embedder on the schema; the vector
# field is bound to the embedder through this same name.
_EMBEDDER_NAME = "bert"
# Model identifier placed in the "model" entry of the embedder config.
_EMBEDDER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Used as the `dimension` of the vector field declared in main().
_DIM = 384 # dimension for all-MiniLM-L6-v2
# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
# Each entry is a (doc_id, title, text, page, category) tuple, in the order
# main() unpacks it. Several entries share one doc_id: each is a separate
# chunk of that document, distinguished by its page number.
CHUNKS = [
("django_guide", "Django Web Development", "Django follows the model-template-view architecture pattern for clean separation of concerns.", 1, "framework"),
("django_guide", "Django Web Development", "Django ORM maps Python classes to database tables with migrations for schema management.", 2, "framework"),
("django_guide", "Django Web Development", "Django middleware processes requests and responses through a chain of pluggable components.", 3, "framework"),
("flask_guide", "Flask Microservices", "Flask provides lightweight routing and Jinja2 templating for building APIs and web apps.", 1, "framework"),
("flask_guide", "Flask Microservices", "Flask extensions like Flask-SQLAlchemy and Flask-Migrate add database support to Flask projects.", 2, "framework"),
("numpy_docs", "NumPy Fundamentals", "NumPy arrays provide vectorized operations that are much faster than Python loops for numerical computing.", 1, "scientific"),
("numpy_docs", "NumPy Fundamentals", "Broadcasting in NumPy allows arithmetic operations on arrays of different shapes without copying data.", 2, "scientific"),
("pytest_book", "Testing with pytest", "pytest fixtures provide reusable setup and teardown logic for test functions with dependency injection.", 1, "testing"),
("pytest_book", "Testing with pytest", "pytest parametrize decorator runs the same test with different input datasets automatically.", 2, "testing"),
]
def main() -> None:
    """Run the vector search demo from schema setup through three queries.

    Builds a schema with a registered embedder, indexes every entry of
    ``CHUNKS``, then prints the results of one plain vector search and two
    vector searches that are each combined with a filter query.
    """
    rule = "=" * 60
    vector_field = "text_vec"

    print("=== Laurus Vector Search Example ===\n")
    print(f"Embedder: {_EMBEDDER_MODEL} (dim={_DIM})\n")

    # --- Schema -------------------------------------------------------------
    schema = laurus.Schema()
    embedder_config = {"type": "candle_bert", "model": _EMBEDDER_MODEL}
    schema.add_embedder(_EMBEDDER_NAME, embedder_config)
    for text_field in ("title", "text", "category"):
        schema.add_text_field(text_field)
    schema.add_integer_field("page")
    schema.add_flat_field(
        vector_field,
        dimension=_DIM,
        distance="cosine",
        embedder=_EMBEDDER_NAME,
    )
    schema.set_default_fields(["text"])
    index = laurus.Index(schema=schema)

    # --- Index --------------------------------------------------------------
    # The vector field receives the raw chunk text; laurus turns it into a
    # vector through the embedder registered above.
    print("--- Indexing chunked documents ---\n")
    for chunk_id, title, body, page, category in CHUNKS:
        fields = {
            "title": title,
            "text": body,
            "category": category,
            "page": page,
            vector_field: body,
        }
        index.add_document(chunk_id, fields)
    index.commit()
    print(f"Indexed {len(CHUNKS)} chunks.\n")

    # --- [A] Basic vector search --------------------------------------------
    print(rule)
    print("[A] Basic Vector Search: 'database ORM queries'")
    print(rule)
    orm_query = laurus.VectorTextQuery(vector_field, "database ORM queries")
    basic_hits = index.search(orm_query, limit=3)
    _print_results(basic_hits)

    # --- [B] Vector search restricted by a category term --------------------
    print("\n" + rule)
    print("[B] Filtered Vector Search: 'database ORM queries' + category='testing'")
    print(rule)
    category_request = laurus.SearchRequest(
        vector_query=laurus.VectorTextQuery(vector_field, "database ORM queries"),
        filter_query=laurus.TermQuery("category", "testing"),
        limit=3,
    )
    category_hits = index.search(category_request)
    _print_results(category_hits)

    # --- [C] Vector search restricted by a numeric range (page 1 only) ------
    print("\n" + rule)
    print("[C] Filtered Vector Search: 'web server HTTP' + page=1")
    print(rule)
    page_request = laurus.SearchRequest(
        vector_query=laurus.VectorTextQuery(vector_field, "web server HTTP"),
        filter_query=laurus.NumericRangeQuery("page", min=1, max=1),
        limit=3,
    )
    page_hits = index.search(page_request)
    _print_results(page_hits)

    print("\nVector search example completed!")
def _print_results(results: list) -> None:
    """Print one summary line per search hit, or a placeholder if there are none.

    Args:
        results: Search hits. Each hit is read for its ``id``, ``score`` and
            ``document`` attributes; a falsy ``document`` is treated as an
            empty mapping.
    """
    if results:
        for hit in results:
            fields = hit.document or {}
            # repr() first, then pad the id to 8 columns and cut the text
            # preview to 60 characters (quotes included).
            id_column = format(repr(hit.id), "8s")
            score_column = format(hit.score, ".4f")
            text_column = format(repr(fields.get("text", "")), ".60s")
            print(f" id={id_column} score={score_column} text={text_column}")
    else:
        print(" (no results)")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()