Semantic Search Examples

Practical examples of using semantic search with GrafitoDB in real-world scenarios.

Academic Paper Search

Build a semantic search system for academic papers with citation networks.

from grafito import GrafitoDatabase
from grafito.embedding_functions import SentenceTransformerEmbeddingFunction
from grafito.indexers import HNSWlibIndexer

# Open an in-memory database and load the sentence-embedding model
db = GrafitoDatabase(':memory:')
embedder = SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2", normalize_embeddings=True
)

# Register the "papers_vec" index; the indexer is handed the embedder above
# and is configured with the cosine metric
indexer = HNSWlibIndexer(options={"metric": "cosine"}, embedding_function=embedder)
db.create_vector_index("papers_vec", indexer=indexer)

# Sample papers
papers = [
    {
        "title": "Attention Is All You Need",
        "abstract": "We propose a new simple network architecture, the Transformer...",
        "year": 2017,
        "authors": ["Vaswani", "Shazeer", "Parmar"]
    },
    {
        "title": "BERT: Pre-training of Deep Bidirectional Transformers",
        "abstract": "We introduce a new language representation model called BERT...",
        "year": 2018,
        "authors": ["Devlin", "Chang", "Lee", "Toutanova"]
    },
]

# One Paper node per entry, keyed by title so the author pass can find it again
paper_nodes = {}
for entry in papers:
    title, abstract = entry["title"], entry["abstract"]
    paper_node = db.create_node(
        labels=["Paper"],
        properties={"title": title, "abstract": abstract, "year": entry["year"]},
    )
    paper_nodes[title] = paper_node

    # The embedded text is the title and abstract joined into one string
    embedding = embedder([f"{title}. {abstract}"])[0]
    db.upsert_embedding(paper_node.id, embedding, index="papers_vec")

# Link every paper to its authors with a paper -> author AUTHORED_BY edge.
# A fresh Author node is created for each (paper, author) pair, so an author
# appearing on several papers is not deduplicated here.
for entry in papers:
    source_id = paper_nodes[entry["title"]].id
    for name in entry["authors"]:
        author_node = db.create_node(labels=["Author"], properties={"name": name})
        db.create_relationship(source_id, author_node.id, "AUTHORED_BY")

# Run a natural-language query against the paper index
results = db.semantic_search("transformer architecture for NLP", k=10, index="papers_vec")

# Walk from each hit to its authors in the graph
for hit in results:
    matched = hit["node"]
    print(f"\nPaper: {matched.properties['title']} (score: {hit['score']:.3f})")

    # Follow the outgoing AUTHORED_BY edges of the paper
    author_nodes = db.get_neighbors(matched.id, direction="outgoing", rel_type="AUTHORED_BY")
    names = [n.properties['name'] for n in author_nodes]
    print(f"Authors: {', '.join(names)}")

    # Citations could be followed the same way (no CITES edges are created in this example)
    # citations = db.get_neighbors(matched.id, direction="outgoing", rel_type="CITES")

E-commerce Product Search

Semantic product search with related product recommendations.

# Products with descriptions
products = [
    {
        "name": "UltraBook Pro 15",
        "description": "High-performance laptop with 16GB RAM, 512GB SSD, Intel i7 processor",
        "category": "Electronics",
        "price": 1299
    },
    {
        "name": "BaristaMaster Coffee Maker",
        "description": "Automatic drip coffee maker with programmable timer and thermal carafe",
        "category": "Kitchen",
        "price": 199
    },
    {
        "name": "ErgoChair Plus",
        "description": "Ergonomic office chair with lumbar support and adjustable armrests",
        "category": "Furniture",
        "price": 449
    },
]

# Store each product dict as a Product node and index its description.
# NOTE(review): "products_vec" is not created in this snippet — presumably it
# has to be registered with db.create_vector_index beforehand; confirm.
for item in products:
    product_node = db.create_node(labels=["Product"], properties=item)
    description_vector = embedder([item["description"]])[0]
    db.upsert_embedding(product_node.id, description_vector, index="products_vec")

# Natural language search, passing a Product label filter
query = "machine for brewing coffee automatically"
results = db.semantic_search(query, k=5, index="products_vec", labels=["Product"])

# Print name, price, score and the first 80 characters of the description
for hit in results:
    props = hit["node"].properties
    print(f"{props['name']}: ${props['price']}")
    print(f"  Score: {hit['score']:.3f}")
    print(f"  Description: {props['description'][:80]}...")

RAG System with Graph Context

Build a Retrieval-Augmented Generation system with rich graph context.

def _neighbor_names(db, node_id, rel_type):
    """Return the ``name`` of each ``rel_type`` neighbor as a string, skipping neighbors that have none."""
    neighbors = db.get_neighbors(node_id, rel_type=rel_type) or ()
    names = (neighbor.properties.get('name') for neighbor in neighbors)
    # str.join raises TypeError on None / non-string items, so filter and convert
    return [str(name) for name in names if name is not None]


def rag_query(db, user_question: str, embedder, llm_complete, k: int = 5):
    """
    Answer question using graph-enhanced RAG.

    Args:
        db: GrafitoDB instance
        user_question: The user's question
        embedder: Embedding function; called with a one-element list of texts
        llm_complete: Function to call LLM with prompt
        k: Number of documents to retrieve

    Returns:
        Whatever ``llm_complete`` returns for the assembled prompt.

    Neighbors reached via MENTIONS / FROM_SOURCE that have no ``name``
    property are left out of the context instead of breaking the prompt
    assembly; a section with no usable names is omitted entirely.
    """
    # 1. Semantic search for relevant documents
    # NOTE(review): other examples in this file pass the raw query text to
    # semantic_search; here a precomputed vector is passed — confirm both forms
    # are accepted.
    query_vector = embedder([user_question])[0]
    results = db.semantic_search(query_vector, k=k, index="docs_vec")

    # 2. Gather graph context: document text first, then its named neighbors
    neighbor_sections = (
        ("MENTIONS", "Related entities"),
        ("FROM_SOURCE", "Sources"),
    )
    context_parts = []
    for result in results:
        doc = result["node"]
        context_parts.append(f"Document: {doc.properties['text']}")

        for rel_type, heading in neighbor_sections:
            names = _neighbor_names(db, doc.id, rel_type)
            if names:
                context_parts.append(f"{heading}: {', '.join(names)}")

    # 3. Build prompt with rich context
    context = "\n\n".join(context_parts)
    prompt = f"""Context from knowledge graph:
{context}

Question: {user_question}

Answer:"""

    # 4. Send to LLM
    return llm_complete(prompt)

# Usage
# response = rag_query(db, "What are the main benefits of graph databases?", embedder, call_llm)

Healthcare: Symptom-Disease Matching

Match patient symptoms to diseases using semantic similarity.

# Create disease nodes with symptom descriptions
diseases = [
    {
        "name": "Influenza",
        "symptoms": "fever, cough, sore throat, body aches, fatigue, chills",
        "severity": "moderate",
        "contagious": True
    },
    {
        "name": "COVID-19",
        "symptoms": "fever, dry cough, fatigue, loss of taste or smell, shortness of breath",
        "severity": "high",
        "contagious": True
    },
    {
        "name": "Common Cold",
        "symptoms": "runny nose, sneezing, mild cough, sore throat, congestion",
        "severity": "mild",
        "contagious": True
    },
]

# Index each disease by its symptom text, then attach its treatments.
# NOTE(review): "diseases_vec" is not created in this snippet — presumably it
# has to be registered with db.create_vector_index beforehand; confirm.
for record in diseases:
    disease_node = db.create_node(labels=["Disease"], properties=record)
    symptom_vector = embedder([record["symptoms"]])[0]
    db.upsert_embedding(disease_node.id, symptom_vector, index="diseases_vec")

    # get_treatments_for is a placeholder: supply your own lookup function
    for treatment_name in get_treatments_for(record["name"]):
        treatment_node = db.create_node(
            labels=["Treatment"], properties={"name": treatment_name}
        )
        db.create_relationship(disease_node.id, treatment_node.id, "TREATED_BY")

# Free-text description of what the patient reports
patient_symptoms = "I have a high temperature, dry cough, and can't taste food"
results = db.semantic_search(patient_symptoms, k=3, index="diseases_vec")

# List the candidates in result order, numbered from 1, with their treatments
print("Possible diagnoses:")
for rank, hit in enumerate(results, start=1):
    candidate = hit["node"]
    print(f"\n{rank}. {candidate.properties['name']} (score: {hit['score']:.3f})")
    print(f"   Severity: {candidate.properties['severity']}")

    # Treatments are the TREATED_BY neighbors of the disease node
    prescribed = db.get_neighbors(candidate.id, rel_type="TREATED_BY")
    if not prescribed:
        continue
    print(f"   Treatments: {', '.join(t.properties['name'] for t in prescribed)}")

Document Management with Semantic + Graph

Enterprise document management combining semantic search with organizational structure.

# One Department node per name, looked up by name when linking documents
departments = ["Engineering", "Sales", "HR", "Legal"]
dept_nodes = {
    name: db.create_node(labels=["Department"], properties={"name": name})
    for name in departments
}

# Create documents linked to departments
documents = [
    {
        "title": "Q4 Engineering Roadmap",
        "content": "Detailed plans for Q4 including microservices migration...",
        "department": "Engineering",
        "confidentiality": "internal"
    },
    {
        "title": "Sales Playbook 2024",
        "content": "Strategies for enterprise sales and customer retention...",
        "department": "Sales",
        "confidentiality": "confidential"
    },
]

# NOTE(review): "documents_vec" is not created in this snippet — presumably it
# has to be registered with db.create_vector_index beforehand; confirm.
for entry in documents:
    # The department is expressed as an edge, not stored as a node property
    doc_node = db.create_node(
        labels=["Document"],
        properties={key: entry[key] for key in ("title", "content", "confidentiality")},
    )

    # Document -> Department edge
    owning_dept = dept_nodes[entry["department"]]
    db.create_relationship(doc_node.id, owning_dept.id, "BELONGS_TO")

    # Only the content is embedded
    content_vector = embedder([entry["content"]])[0]
    db.upsert_embedding(doc_node.id, content_vector, index="documents_vec")

# Search with a Document label filter and a confidentiality property filter
query = "microservices architecture migration"
results = db.semantic_search(
    query,
    k=10,
    index="documents_vec",
    labels=["Document"],
    properties={"confidentiality": "internal"},
)

# Navigate organizational context
for hit in results:
    doc = hit["node"]
    print(f"\nDocument: {doc.properties['title']}")
    print(f"Score: {hit['score']:.3f}")

    # Departments are the BELONGS_TO neighbors of the document
    for dept in db.get_neighbors(doc.id, rel_type="BELONGS_TO"):
        print(f"Department: {dept.properties['name']}")

        # Up to three other nodes with an incoming BELONGS_TO edge to this department
        members = db.get_neighbors(dept.id, direction="incoming", rel_type="BELONGS_TO")
        siblings = [member for member in members if member.id != doc.id][:3]
        if not siblings:
            continue
        print("Related documents in same dept:")
        for sibling in siblings:
            print(f"  - {sibling.properties['title']}")

Social Network Content Discovery

Find relevant content in social networks using semantic search.

# Create users and posts
users = [
    {"username": "alice_dev", "interests": ["python", "machine learning"]},
    {"username": "bob_data", "interests": ["data science", "sql"]},
]

# User nodes keyed by username so posts can be linked to their author
user_nodes = {}
for profile in users:
    user_nodes[profile["username"]] = db.create_node(labels=["User"], properties=profile)

# Create posts
posts = [
    {
        "author": "alice_dev",
        "content": "Just discovered this amazing graph database called GrafitoDB!",
        "tags": ["databases", "graphs"]
    },
    {
        "author": "bob_data",
        "content": "Working on a new ML pipeline for recommendation systems...",
        "tags": ["machine learning", "recommendations"]
    },
]

# NOTE(review): "posts_vec" is not created in this snippet — presumably it
# has to be registered with db.create_vector_index beforehand; confirm.
for entry in posts:
    # The author is expressed as an edge, not stored on the Post node
    post_node = db.create_node(
        labels=["Post"],
        properties={"content": entry["content"], "tags": entry["tags"]},
    )

    # User -> Post edge
    author_node = user_nodes[entry["author"]]
    db.create_relationship(author_node.id, post_node.id, "AUTHORED")

    # Index the post by its content
    content_vector = embedder([entry["content"]])[0]
    db.upsert_embedding(post_node.id, content_vector, index="posts_vec")

# Search for content
query = "graph databases for recommendations"
results = db.semantic_search(query, k=10, index="posts_vec")

# Show each hit together with its author and a sample of the author's other posts
for hit in results:
    post = hit["node"]
    print(f"\nPost: {post.properties['content'][:60]}...")
    print(f"Score: {hit['score']:.3f}")

    # AUTHORED edges run user -> post, so the author is an incoming neighbor
    for writer in db.get_neighbors(post.id, direction="incoming", rel_type="AUTHORED"):
        print(f"Author: @{writer.properties['username']}")

        # At most two of the writer's AUTHORED neighbors, excluding the current post
        authored = db.get_neighbors(writer.id, rel_type="AUTHORED")
        others = [candidate for candidate in authored if candidate.id != post.id][:2]
        if not others:
            continue
        print("Other posts by this user:")
        for other in others:
            print(f"  - {other.properties['content'][:40]}...")

Code Search with Semantic Understanding

Search code repositories semantically.

# Code snippets
snippets = [
    {
        "filename": "database.py",
        "language": "python",
        "code": "def create_node(self, labels, properties):\n    '''Create a new node in the graph'''\n    ...",
        "description": "Create a new node with labels and properties"
    },
    {
        "filename": "query.py",
        "language": "python",
        "code": "def semantic_search(self, vector, k=10):\n    '''Find similar vectors'''\n    ...",
        "description": "Perform semantic search using vector similarity"
    },
]

# NOTE(review): "code_vec" is not created in this snippet — presumably it
# has to be registered with db.create_vector_index beforehand; confirm.
for entry in snippets:
    # The capitalized language name (e.g. "Python") is used as a second label
    language_label = entry["language"].capitalize()
    snippet_node = db.create_node(
        labels=["CodeSnippet", language_label], properties=entry
    )

    # The embedded text is the description followed by the code itself
    combined = f"{entry['description']}. {entry['code']}"
    code_vector = embedder([combined])[0]
    db.upsert_embedding(snippet_node.id, code_vector, index="code_vec")

# Search code semantically
query = "how to create nodes in the graph"
results = db.semantic_search(query, k=5, index="code_vec")

# Print file, score, description and the first 200 characters of code per hit
for hit in results:
    props = hit["node"].properties
    print(f"\nFile: {props['filename']}")
    print(f"Score: {hit['score']:.3f}")
    print(f"Description: {props['description']}")
    print(f"Code:\n{props['code'][:200]}...")

Performance Tips for Production

When deploying semantic search at scale:

# 1. Use batch operations for indexing
# `items` is a placeholder for your own records; each one is read via
# item["text"] and item["id"].
batch_size = 1000
vectors = []
node_ids = []

# NOTE(review): embedder takes a list of texts, so a whole batch could
# presumably be embedded in a single call as well — confirm before changing.
for item in items:
    vectors.append(embedder([item["text"]])[0])
    node_ids.append(item["id"])

    # Flush a full batch in one call, then start over with fresh lists
    # (rebinding rather than clearing, so the lists just handed to the
    # database are not mutated afterwards)
    if len(vectors) >= batch_size:
        db.upsert_embeddings(node_ids, vectors, index="my_index")
        vectors = []
        node_ids = []

# Insert remaining (the final, partially filled batch)
if vectors:
    db.upsert_embeddings(node_ids, vectors, index="my_index")

# 2. Enable persistence for faster restarts
# NOTE(review): "index_path" is presumably where the index file is stored — confirm.
persistent_options = {
    "metric": "cosine",
    "index_path": "/data/vector_index.faiss",
}
db.create_vector_index(
    name="production_index",
    dim=384,
    backend="faiss",
    method="hnsw",
    options=persistent_options,
)

# 3. Use reranking for critical applications
rerank_settings = {
    "rerank": True,
    "candidate_multiplier": 5,  # Fetch 50, return top 10
}
results = db.semantic_search(query, k=10, index="production_index", **rerank_settings)

# 4. Cache query embeddings
from functools import lru_cache

@lru_cache(maxsize=1000)
def get_cached_embedding(query_text):
    """Return the embedding of ``query_text`` as a tuple.

    Results are memoised per distinct ``query_text`` (up to 1000 entries), so
    the embedder is only called for texts not currently in the cache.
    """
    vector = embedder([query_text])[0]
    return tuple(vector)

# 5. Monitor performance
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump when the
# system clock is adjusted.
start = time.perf_counter()
results = db.semantic_search(query, k=10, index="my_index")
latency_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

print(f"Search latency: {latency_ms:.2f}ms")