Contextual Retrieval#
The problem: chunks lose context when extracted. The fix: prepend a custom context sentence to each chunk before embedding.
The Core Idea#
Anthropic published this technique in September 2024. The insight is deceptively simple:
Before embedding a chunk, prepend a short context sentence that describes where the chunk comes from and what it’s about — generated by a fast LLM using the full document as input.
BEFORE (standard chunk):
"The revenue was $1.2B, up 15% YoY."
AFTER (contextual chunk):
"This chunk is from the Q3 2024 earnings report for Acme Corp, discussing
financial performance. The revenue was $1.2B, up 15% YoY."
The embedding now captures the context. The chunk is self-sufficient.
Why This Works#
Without context, the embedding of “The revenue was $1.2B” looks like a million other revenue sentences. With context, it’s uniquely positioned in the embedding space as “Acme Corp Q3 2024 revenue.”
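Anthropic reported a 49% reduction in retrieval failures on their benchmarks when contextual embeddings are combined with contextual BM25 (35% with contextual embeddings alone).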
Implementation#
uv add anthropic langchain-anthropic langchain chromadb
import anthropic
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from typing import Optional
client = anthropic.Anthropic()
def generate_chunk_context(
    full_document: str,
    chunk: str,
    model: str = "claude-haiku-4-5",
) -> str:
    """
    Use Claude to write a short piece of context for a chunk, given the
    full document the chunk was taken from.

    The full document is sent as its own content block marked with
    ``cache_control`` so that repeated calls for chunks of the same
    document can reuse the cached document prefix (a major cost saving
    when processing many chunks from one document).

    Args:
        full_document: Complete text of the document the chunk came from.
        chunk: The chunk text to situate within the document.
        model: Anthropic model name used to generate the context.

    Returns:
        The text of the first content block of the response, with
        surrounding whitespace stripped.
    """
    # Prompt caching is generally available on the regular Messages API, so
    # no beta namespace or beta header is needed for cache_control to apply.
    # NOTE(review): the API only caches prompts above a model-specific
    # minimum length, so short documents are processed uncached — confirm
    # the threshold for the chosen model in the prompt caching docs.
    response = client.messages.create(
        model=model,
        max_tokens=200,
        system="You are a document analyzer. Given a document and a chunk from it, "
        "write 1-2 sentences of context that describe what the chunk is about "
        "and where it fits in the document. Be concise and specific.",
        messages=[
            {
                "role": "user",
                "content": [
                    # The document comes first and carries the cache marker:
                    # it is the part that is identical across all chunks.
                    {
                        "type": "text",
                        "text": f"<document>\n{full_document}\n</document>",
                        "cache_control": {"type": "ephemeral"},
                    },
                    # The chunk-specific instruction follows the cached prefix.
                    {
                        "type": "text",
                        "text": (
                            f"<chunk>\n{chunk}\n</chunk>\n\n"
                            "Write a brief context sentence for this chunk. "
                            "Return ONLY the context sentence, nothing else."
                        ),
                    },
                ],
            }
        ],
    )
    return response.content[0].text.strip()
def contextual_chunk(
    full_document: str,
    chunk: str,
) -> str:
    """
    Return the chunk with its generated context placed in front of it.

    The context is produced by ``generate_chunk_context`` from the full
    document and the chunk, and is separated from the chunk text by one
    blank line.
    """
    return f"{generate_chunk_context(full_document, chunk)}\n\n{chunk}"
def build_contextual_vectorstore(
    documents: list[str],
    collection_name: str = "contextual_rag",
) -> Chroma:
    """
    Index documents into a Chroma vectorstore using contextual retrieval.

    Every document is split into chunks (500 characters, 50 overlap); each
    chunk gets a generated context prepended via ``contextual_chunk`` and
    the combined text is what gets embedded. The untouched chunk text and
    its document/chunk positions are kept in the metadata.

    Args:
        documents: Raw document texts to index.
        collection_name: Name of the Chroma collection to write to.

    Returns:
        The Chroma vectorstore holding the contextualized chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    enriched_docs = []

    for doc_idx, source_text in enumerate(documents):
        pieces = text_splitter.split_text(source_text)
        print(f"Document {doc_idx + 1}: {len(pieces)} chunks")

        for chunk_idx, piece in enumerate(pieces):
            print(f" Generating context for chunk {chunk_idx + 1}/{len(pieces)}...")
            metadata = {
                "doc_idx": doc_idx,
                "chunk_idx": chunk_idx,
                # Raw chunk without the generated context, kept for display.
                "original_chunk": piece,
            }
            enriched_text = contextual_chunk(full_document=source_text, chunk=piece)
            enriched_docs.append(
                Document(page_content=enriched_text, metadata=metadata)
            )

    print(f"\nBuilding vectorstore with {len(enriched_docs)} contextual chunks...")
    store = Chroma.from_documents(
        documents=enriched_docs,
        embedding=OpenAIEmbeddings(),
        collection_name=collection_name,
    )
    print("Done!")
    return store
# Usage
documents = [
"""
Acme Corporation Q3 2024 Earnings Report
Financial Performance:
Revenue reached $1.2 billion, representing 15% year-over-year growth.
Operating income was $240 million, with margins expanding 200 basis points.
Product Segment:
The cloud division grew 42% to $450 million, now representing 37.5% of revenue.
Hardware sales declined 8% to $320 million due to supply chain constraints.
Outlook:
Management raised full-year guidance to $4.8-5.0 billion in revenue.
Q4 expected revenue: $1.4-1.5 billion.
""",
]
vectorstore = build_contextual_vectorstore(documents)
# Query
results = vectorstore.similarity_search("What happened with cloud revenue?", k=3)
for r in results:
print(f"\n--- Result ---")
print(r.page_content[:300])Cost Optimization with Prompt Caching#
The key insight from Anthropic’s blog: when processing many chunks from the same document, the document gets cached after the first call. Subsequent chunks reuse the cache, saving up to 90% of token costs.
def batch_contextualize(full_document: str, chunks: list[str]) -> list[str]:
"""
Process all chunks from one document.
The first call caches the document; subsequent calls use cache.
This is ~10x cheaper than processing chunks separately.
"""
contextualized_chunks = []
for i, chunk in enumerate(chunks):
print(f"Chunk {i+1}/{len(chunks)}", end="\r")
contextualized = contextual_chunk(full_document, chunk)
contextualized_chunks.append(contextualized)
return contextualized_chunksCost estimate (claude-haiku-4-5):
- Without caching: ~$0.25 per 100-page PDF
- With caching: ~$0.03 per 100-page PDF (~90% savings)
Comparing Standard vs Contextual RAG#
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
# Build both vectorstores
standard_chunks = splitter.split_text(document)
standard_vs = Chroma.from_texts(
standard_chunks,
OpenAIEmbeddings(),
collection_name="standard_rag"
)
contextual_vs = build_contextual_vectorstore([document], "contextual_rag")
# Test query
query = "What is the cloud division revenue growth?"
standard_results = standard_vs.similarity_search(query, k=3)
contextual_results = contextual_vs.similarity_search(query, k=3)
print("=== Standard RAG ===")
for r in standard_results:
print(f" {r.page_content[:150]}")
print("\n=== Contextual RAG ===")
for r in contextual_results:
print(f" {r.page_content[:150]}")Contextual Retrieval + Hybrid + Reranking#
Anthropic’s full recommended stack:
class AnthropicRAGStack:
"""
Anthropic's full recommended RAG stack:
1. Contextual chunking (semantic context injection)
2. Hybrid search (dense + BM25)
3. Reranking (cross-encoder)
"""
def __init__(self, documents: list[str]):
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
# Build contextual vectorstore (dense)
self.vectorstore = build_contextual_vectorstore(documents)
# Build BM25 index (sparse)
self.all_chunks = [] # collect all chunks during indexing
self.bm25 = BM25Okapi([c.lower().split() for c in self.all_chunks])
# Reranker
self.reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
def retrieve(self, query: str, k: int = 5) -> list[str]:
# 1. Dense retrieval
dense = self.vectorstore.similarity_search(query, k=20)
# 2. Sparse retrieval (BM25)
scores = self.bm25.get_scores(query.lower().split())
sparse_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:20]
sparse = [self.all_chunks[i] for i in sparse_idx]
# 3. Merge candidates
candidates = list({d.page_content for d in dense} | set(sparse))
# 4. Rerank
pairs = [(query, c) for c in candidates]
rerank_scores = self.reranker.predict(pairs)
reranked = sorted(zip(candidates, rerank_scores), key=lambda x: x[1], reverse=True)
return [doc for doc, _ in reranked[:k]]Key Takeaways#
- Context is generated per-chunk — the LLM sees the full document to understand what the chunk is about
- Prompt caching is critical — without it, contextual retrieval is too expensive
- Simple but effective — Anthropic reported a 49% reduction in retrieval failures
- Combine with hybrid + reranking for best results