Query Augmentation#
The query the user types is rarely the best query for retrieval. Fix it before it hits your vector database.
Why Augment Queries?#
Users ask vague, short, or jargon-heavy questions. Your retrieval system works best with precise, information-rich queries.
| User Query | What They Mean | Better Retrieval Query |
|---|---|---|
| “How does it work?” | (no context) | Depends on the document! |
| “Python error in loop” | IndexError in for loop | “Python IndexError list index out of range for loop” |
| “Fast RAG” | Low-latency RAG pipelines | “Techniques to reduce latency in retrieval-augmented generation” |
Technique 1: HyDE — Hypothetical Document Embeddings#
Idea: Instead of embedding the query, generate a hypothetical answer document, then embed that.
Why? A generated answer is semantically closer to the actual documents in your corpus than a short query is.
graph LR A[User Query] --> B[LLM: Generate Hypothetical Answer] B --> C[Embed Hypothetical Answer] C --> D[Vector Search] D --> E[Real Documents] E --> F[LLM: Real Answer]
from openai import OpenAI
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
client = OpenAI()
def hyde_retrieve(query: str, vectorstore, k: int = 5) -> list:
"""
HyDE: Generate a hypothetical document, then retrieve by its embedding.
"""
# Step 1: Generate a hypothetical answer
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "system",
"content": (
"Generate a detailed, factual passage that would appear in "
"a technical document and directly answer the question. "
"Write 2-3 paragraphs as if you are the document."
)
},
{"role": "user", "content": query}
],
max_tokens=300,
temperature=0,
)
hypothetical_doc = response.choices[0].message.content
print(f"Hypothetical doc preview: {hypothetical_doc[:150]}...")
# Step 2: Retrieve using the hypothetical document as the query
results = vectorstore.similarity_search(hypothetical_doc, k=k)
return results
# Usage
vectorstore = Chroma.from_texts(
texts=[
"RAG pipelines retrieve context before generating answers.",
"Vector databases store high-dimensional embeddings for similarity search.",
"Chunking strategies determine how documents are split for indexing.",
],
embedding=OpenAIEmbeddings()
)
results = hyde_retrieve(
"Explain how retrieval-augmented generation reduces hallucination",
vectorstore
)
for r in results:
print(f"• {r.page_content}")When HyDE shines: Factual Q&A domains (medical, legal, technical documentation).
When to skip: Creative or open-ended queries where a hypothetical is hard to generate.
Technique 2: Query Rewriting#
Use an LLM to rephrase the query for better retrieval — fixing typos, expanding acronyms, adding domain context.
def rewrite_query(original_query: str, domain: str = "") -> str:
"""
Rewrite a user query for better retrieval.
"""
domain_context = f"The search is over documents about: {domain}. " if domain else ""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "system",
"content": (
f"{domain_context}Rewrite the following user query to be more "
"specific and suitable for document retrieval. "
"Keep it concise but complete. Return only the rewritten query."
)
},
{"role": "user", "content": original_query}
],
temperature=0,
)
return response.choices[0].message.content.strip()
# Examples
queries = [
"explain rag",
"why my vectors slow",
"HNSW vs IVF which better?",
]
for q in queries:
rewritten = rewrite_query(q, domain="vector databases and RAG systems")
print(f"Original: {q}")
print(f"Rewritten: {rewritten}")
print()Technique 3: Multi-Query Expansion#
Generate multiple alternative phrasings of the query and retrieve for each. Merge results with RRF.
from typing import List
def multi_query_expand(query: str, n: int = 3) -> List[str]:
"""
Generate N alternative queries for the same information need.
"""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "system",
"content": (
f"Generate {n} different ways to ask the following question "
"for use in document retrieval. Each should approach the "
"information need from a different angle. "
f"Return exactly {n} queries, one per line."
)
},
{"role": "user", "content": query}
],
temperature=0.7,
)
lines = response.choices[0].message.content.strip().split("\n")
# Clean up numbered lists if any
queries = [l.lstrip("0123456789.-) ").strip() for l in lines if l.strip()]
return queries[:n]
def multi_query_retrieve(query: str, vectorstore, k: int = 5) -> list:
"""Retrieve using multiple query variations, deduplicate results."""
all_queries = [query] + multi_query_expand(query, n=3)
seen_content = set()
all_results = []
for q in all_queries:
results = vectorstore.similarity_search(q, k=k)
for r in results:
if r.page_content not in seen_content:
seen_content.add(r.page_content)
all_results.append(r)
return all_results[:k]
# Example
variants = multi_query_expand("How does vector similarity work in RAG?")
for i, v in enumerate(variants, 1):
print(f"{i}. {v}")Technique 4: Step-Back Prompting#
For complex, specific questions — first retrieve general background, then answer the specific question.
def step_back_retrieve(specific_query: str, vectorstore, k: int = 5) -> dict:
"""
Step-Back: Generate a more general version of the query,
retrieve for both, then combine.
"""
# Generate the step-back (more general) query
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "system",
"content": (
"Given a specific question, generate a more general, "
"abstract version that retrieves background knowledge. "
"Return only the general question."
)
},
{"role": "user", "content": specific_query}
],
temperature=0,
)
general_query = response.choices[0].message.content.strip()
# Retrieve for both
specific_results = vectorstore.similarity_search(specific_query, k=k)
general_results = vectorstore.similarity_search(general_query, k=k)
return {
"specific_query": specific_query,
"general_query": general_query,
"specific_docs": specific_results,
"general_docs": general_results,
"combined_context": specific_results + general_results,
}
# Example
result = step_back_retrieve(
"Why does HNSW use logarithmic connections at higher layers?"
)
print(f"Specific: {result['specific_query']}")
print(f"General: {result['general_query']}")Combining Everything: Augmented Retriever Class#
from enum import Enum
class AugmentStrategy(Enum):
NONE = "none"
REWRITE = "rewrite"
HYDE = "hyde"
MULTI_QUERY = "multi_query"
STEP_BACK = "step_back"
class AugmentedRetriever:
def __init__(self, vectorstore, strategy: AugmentStrategy = AugmentStrategy.REWRITE):
self.vectorstore = vectorstore
self.strategy = strategy
def retrieve(self, query: str, k: int = 5) -> list:
if self.strategy == AugmentStrategy.NONE:
return self.vectorstore.similarity_search(query, k=k)
elif self.strategy == AugmentStrategy.REWRITE:
improved = rewrite_query(query)
return self.vectorstore.similarity_search(improved, k=k)
elif self.strategy == AugmentStrategy.HYDE:
return hyde_retrieve(query, self.vectorstore, k=k)
elif self.strategy == AugmentStrategy.MULTI_QUERY:
return multi_query_retrieve(query, self.vectorstore, k=k)
elif self.strategy == AugmentStrategy.STEP_BACK:
result = step_back_retrieve(query, self.vectorstore, k=k)
return result["combined_context"]When to Use What#
| Strategy | Best For | Cost |
|---|---|---|
| No augmentation | Simple, precise queries | Cheapest |
| Query rewriting | Vague / ambiguous user input | Low |
| HyDE | Technical Q&A with clear answers | Medium |
| Multi-query | Broad topics needing coverage | Medium |
| Step-back | Specific questions needing background | Medium |