Retrieval-augmented generation solves the context window problem: instead of cramming your entire knowledge base into a prompt, you fetch only the relevant chunks at query time. Pinecone makes this fast at scale. This guide builds a working RAG chatbot from scratch — document ingestion, vector search, and a streaming FastAPI endpoint.
Table of Contents
- Architecture Overview
- Prerequisites
- Step 1: Document Ingestion
- Step 2: Retrieval
- Step 3: Generation with Claude
- Step 4: FastAPI Endpoint
- Common Failure Modes
- Chunk Strategy by Document Type
- Multi-Tenant Isolation with Namespaces
- Re-Ranking Retrieved Chunks
- Related Reading
Architecture Overview
Documents → Chunker → Embedder → Pinecone Index
↓
User Query → Embedder → Pinecone Search → Top-K Chunks
↓
System Prompt + Chunks + Query → LLM → Response
Three moving parts: ingestion pipeline, retrieval, and generation. Each has failure modes worth understanding.
Prerequisites
pip install pinecone openai anthropic fastapi uvicorn \
langchain-text-splitters tiktoken python-dotenv
# .env
PINECONE_API_KEY=your_pinecone_key
PINECONE_INDEX_NAME=rag-docs
OPENAI_API_KEY=your_openai_key # for embeddings
ANTHROPIC_API_KEY=your_anthropic_key # for generation
Step 1: Document Ingestion
# ingest.py
import os
import hashlib
from pathlib import Path
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
load_dotenv()

# Module-level clients shared by every function below.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

INDEX_NAME = os.environ["PINECONE_INDEX_NAME"]
EMBED_MODEL = "text-embedding-3-small"
# Index dimension; must match the output dimension of EMBED_MODEL, since the
# index stores vectors produced by that model.
EMBED_DIM = 1536
def get_or_create_index():
    """Return a handle to the Pinecone index, creating it on first use."""
    known = {idx.name for idx in pc.list_indexes()}
    if INDEX_NAME not in known:
        pc.create_index(
            name=INDEX_NAME,
            dimension=EMBED_DIM,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    return pc.Index(INDEX_NAME)
def chunk_document(text: str, source: str) -> list[dict]:
    """Split one document into overlapping chunks with stable per-chunk IDs.

    The ID hashes source, position, and a text prefix, so re-ingesting an
    unchanged document produces the same IDs (upserts overwrite in place).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=64,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    records = []
    for position, piece in enumerate(splitter.split_text(text)):
        fingerprint = hashlib.md5(
            f"{source}:{position}:{piece[:50]}".encode()
        ).hexdigest()
        records.append({
            "text": piece,
            "source": source,
            "chunk_index": position,
            "chunk_id": fingerprint,
        })
    return records
def embed_batch(texts: list[str]) -> list[list[float]]:
    """Embed a batch of texts in one API call; output order matches input."""
    result = oai.embeddings.create(model=EMBED_MODEL, input=texts)
    return [row.embedding for row in result.data]
def ingest_documents(docs: list[dict]):
    """Chunk, embed, and upsert a set of documents into the Pinecone index.

    docs: list of {"text": str, "source": str}
    """
    index = get_or_create_index()

    # Flatten every document into a single chunk list.
    all_chunks = []
    for doc in docs:
        all_chunks.extend(chunk_document(doc["text"], doc["source"]))

    # Embed in batches of 100 to stay within the embedding API's request size.
    batch_size = 100
    vectors = []
    for start in range(0, len(all_chunks), batch_size):
        batch = all_chunks[start:start + batch_size]
        embeddings = embed_batch([c["text"] for c in batch])
        for chunk, embedding in zip(batch, embeddings):
            vectors.append({
                "id": chunk["chunk_id"],
                "values": embedding,
                "metadata": {
                    "text": chunk["text"],
                    "source": chunk["source"],
                    "chunk_index": chunk["chunk_index"],
                },
            })

    # Upsert in batches of 100 as well.
    for start in range(0, len(vectors), 100):
        index.upsert(vectors=vectors[start:start + 100])
    print(f"Ingested {len(vectors)} chunks from {len(docs)} documents")
# Example usage
if __name__ == "__main__":
    doc_paths = ["docs/api-reference.md", "docs/deployment-guide.md"]
    sample_docs = [
        {"text": Path(p).read_text(), "source": Path(p).name}
        for p in doc_paths
    ]
    ingest_documents(sample_docs)
Step 2: Retrieval
# retrieval.py
import os
from pinecone import Pinecone
from openai import OpenAI
# Shared clients: Pinecone for vector search, OpenAI for query embeddings.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def retrieve_context(
    query: str,
    top_k: int = 5,
    score_threshold: float = 0.70
) -> list[dict]:
    """Embed the query, search Pinecone, and return relevant chunks.

    Matches below score_threshold are dropped; results are deduplicated by
    (source, chunk_index). Each returned dict has text, source, and score.
    """
    index = pc.Index(os.environ["PINECONE_INDEX_NAME"])

    # Embed the query with the same model used at ingestion time.
    query_vector = oai.embeddings.create(
        model="text-embedding-3-small",
        input=[query]
    ).data[0].embedding

    results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True
    )

    contexts = []
    seen = set()
    for match in results.matches:
        if match.score < score_threshold:
            continue  # below the relevance floor
        key = f"{match.metadata['source']}:{match.metadata['chunk_index']}"
        if key in seen:
            continue  # already returned this source chunk
        seen.add(key)
        contexts.append({
            "text": match.metadata["text"],
            "source": match.metadata["source"],
            "score": round(match.score, 3),
        })
    return contexts
def format_context_block(contexts: list[dict]) -> str:
    """Render retrieved chunks as a numbered, source-attributed text block.

    Returns a placeholder string when no chunks passed the relevance filter,
    so the prompt still tells the model there was nothing to ground on.
    """
    if not contexts:
        return "No relevant context found."
    rendered = [
        f"[{n}] Source: {ctx['source']} (relevance: {ctx['score']})\n{ctx['text']}"
        for n, ctx in enumerate(contexts, 1)
    ]
    return "\n\n---\n\n".join(rendered)
Step 3: Generation with Claude
# generation.py
import os
from anthropic import Anthropic
from retrieval import retrieve_context, format_context_block
# Reads ANTHROPIC_API_KEY from the environment.
client = Anthropic()

# Grounding rules for the model, passed via the `system` parameter on every
# request rather than repeated inside each user message.
SYSTEM_PROMPT = """You are a helpful assistant that answers questions based on
provided documentation. Follow these rules:
1. Answer only from the provided context. If the context doesn't contain the
answer, say "I don't have information about that in the documentation."
2. Cite your sources using [1], [2] notation when referencing specific content.
3. Be concise. Prefer bullet points for lists of steps.
4. If the user asks for code, provide complete, runnable examples."""
def generate_answer(
    query: str,
    conversation_history: list[dict] | None = None
) -> dict:
    """Answer a query with RAG: retrieve context, call Claude, return results.

    Args:
        query: The user's question for this turn.
        conversation_history: Prior messages in Anthropic message format.
            Not mutated; a copy is extended and returned.

    Returns:
        dict with keys:
            answer: the model's reply text
            sources: source filenames of the chunks that were retrieved
            context_used: number of chunks passed to the model
            history: updated message list to persist for the next turn
    """
    contexts = retrieve_context(query, top_k=5)
    context_block = format_context_block(contexts)

    # Copy so the caller's history list is never mutated in place
    # (the original appended directly to the argument).
    messages = list(conversation_history) if conversation_history else []
    messages.append({
        "role": "user",
        "content": f"""Context from documentation:
{context_block}
---
Question: {query}"""
    })

    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1024,
        system=SYSTEM_PROMPT,
        messages=messages
    )
    answer = response.content[0].text

    # Store the bare query (not the context-stuffed prompt) in history so
    # future turns don't re-send stale retrieved chunks.
    messages[-1] = {"role": "user", "content": query}
    messages.append({"role": "assistant", "content": answer})

    return {
        "answer": answer,
        "sources": [c["source"] for c in contexts],
        "context_used": len(contexts),
        "history": messages,
    }
Step 4: FastAPI Endpoint
# api.py
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional
import json
from generation import generate_answer
from anthropic import Anthropic
app = FastAPI(title="RAG Chatbot API")
# Reads ANTHROPIC_API_KEY from the environment.
client = Anthropic()

# In-memory session store mapping session_id -> message history.
# Process-local and unbounded: use Redis (or similar) in production.
sessions: dict[str, list[dict]] = {}
class ChatRequest(BaseModel):
    """One chat turn from a client."""
    # Client-chosen key used to continue a conversation across requests.
    session_id: str
    # The user's message for this turn.
    message: str
class ChatResponse(BaseModel):
    """Answer for one chat turn."""
    # The model's reply text.
    answer: str
    # Source filenames of the chunks retrieved for this answer.
    sources: list[str]
    # Echoed back so clients can correlate responses to sessions.
    session_id: str
@app.post("/chat", response_model=ChatResponse)
async def chat(req: ChatRequest):
    """Answer one chat turn for a session and persist trimmed history.

    Raises HTTPException(500) if generation fails, chaining the original
    error so the traceback survives in logs (the original lost the cause).
    """
    history = sessions.get(req.session_id, [])
    try:
        result = generate_answer(req.message, history)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    # Keep the last 20 messages = 10 user/assistant turns.
    sessions[req.session_id] = result["history"][-20:]
    return ChatResponse(
        answer=result["answer"],
        sources=result["sources"],
        session_id=req.session_id
    )
@app.delete("/sessions/{session_id}")
async def clear_session(session_id: str):
    """Drop a session's stored history; a no-op for unknown session IDs."""
    if session_id in sessions:
        del sessions[session_id]
    return {"cleared": session_id}
@app.get("/health")
async def health():
    """Liveness probe for load balancers and orchestrators."""
    payload = {"status": "ok"}
    return payload
# Run with: uvicorn api:app --reload
Common Failure Modes
Low retrieval relevance (score < 0.7 consistently)
Your chunks are too large. Drop chunk_size from 512 to 256. Large chunks dilute the embedding signal. Also check that your documents are prose — code-heavy docs need a different chunking strategy.
Hallucination despite good retrieval
The system prompt’s “answer only from context” instruction works ~90% of the time. For higher reliability, add a verification step:
def verify_grounded(answer: str, contexts: list[dict]) -> bool:
    """Cheap heuristic check that an answer is grounded in the contexts.

    Short answers (< 50 chars) pass automatically; otherwise at least one
    long word (> 6 chars) from the answer must appear in the context text.
    For production use a separate LLM call to verify instead.
    """
    if len(answer) < 50:
        return True
    corpus = " ".join(chunk["text"] for chunk in contexts)
    long_words = (w for w in answer.split() if len(w) > 6)
    return any(w in corpus for w in long_words)
Rate limits on ingestion
OpenAI’s embedding API allows 1M tokens/minute on tier 2. For large document sets, add exponential backoff:
import time
from tenacity import retry, stop_after_attempt, wait_exponential
# Retry wrapper for transient embedding failures (e.g. rate limits):
# up to 3 attempts with exponential backoff bounded between 1s and 10s.
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
def embed_batch_with_retry(texts: list[str]) -> list[list[float]]:
    return embed_batch(texts)
Chunk Strategy by Document Type
Not all documents chunk the same way. Fixed-size 512-character chunks (RecursiveCharacterTextSplitter measures length in characters by default, not tokens) work fine for prose documentation but break poorly for structured content.
API reference docs: Split by endpoint or method. A chunk should contain one complete endpoint description — mixing two endpoint descriptions into one chunk dilutes the embedding for both.
Markdown with headers: Use RecursiveCharacterTextSplitter with "\n## " as a high-priority separator. This keeps H2 sections together, which typically represent coherent concepts.
Code-heavy docs: Add a custom splitter that respects code fence boundaries. A chunk that cuts across a code block mid-function will embed poorly and retrieve inaccurately.
# Separator priority for code-heavy markdown: prefer breaking at code-fence
# ends and section headers before falling back to paragraph, sentence, and
# word boundaries, so chunks never cut a code block mid-function.
CODE_AWARE_SEPARATORS = [
    "\n```\n",  # code block end
    "\n## ",    # H2 section break
    "\n### ",   # H3 section break
    "\n\n",     # paragraph break
    "\n",
    ". ",
    " ",
    ""
]

# Smaller chunks (400/50) than the 512/64 used for prose docs above;
# keep_separator preserves the fence/header markers inside each chunk.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50,
    separators=CODE_AWARE_SEPARATORS,
    keep_separator=True
)
Multi-Tenant Isolation with Namespaces
If you’re serving multiple customers or projects from one Pinecone index, use namespace isolation rather than separate indexes. Namespaces are free and keep your index count manageable.
def ingest_for_tenant(docs: list[dict], tenant_id: str):
    """Chunk, embed, and upsert documents into a tenant-scoped namespace.

    The original snippet referenced an undefined `vectors` variable; this
    version does the full chunk/embed pipeline. The only difference from
    single-tenant ingestion is the `namespace` argument on upsert.

    docs: list of {"text": str, "source": str}
    tenant_id: namespace key isolating this tenant's vectors in the index.
    """
    index = get_or_create_index()

    all_chunks = []
    for doc in docs:
        all_chunks.extend(chunk_document(doc["text"], doc["source"]))

    vectors = []
    for start in range(0, len(all_chunks), 100):
        batch = all_chunks[start:start + 100]
        embeddings = embed_batch([c["text"] for c in batch])
        for chunk, embedding in zip(batch, embeddings):
            vectors.append({
                "id": chunk["chunk_id"],
                "values": embedding,
                "metadata": {
                    "text": chunk["text"],
                    "source": chunk["source"],
                    "chunk_index": chunk["chunk_index"],
                },
            })

    for start in range(0, len(vectors), 100):
        index.upsert(vectors=vectors[start:start + 100], namespace=tenant_id)
def retrieve_for_tenant(
    query: str,
    tenant_id: str,
    top_k: int = 5,
    score_threshold: float = 0.70
) -> list[dict]:
    """Search only within one tenant's namespace.

    The original called an undefined `embed_query`; the query embedding is
    inlined here with the same model used elsewhere. score_threshold is a
    parameter (default 0.70, matching the previous hard-coded value).
    """
    index = pc.Index(os.environ["PINECONE_INDEX_NAME"])

    query_embedding = oai.embeddings.create(
        model="text-embedding-3-small",
        input=[query]
    ).data[0].embedding

    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        namespace=tenant_id  # isolates results to this tenant
    )
    return [m.metadata for m in results.matches if m.score >= score_threshold]
This pattern lets you offer per-customer knowledge bases without provisioning separate indexes or worrying about data leakage between tenants.
Re-Ranking Retrieved Chunks
Cosine similarity ranking is fast but imperfect. A cross-encoder re-ranker improves relevance by scoring each candidate chunk against the full query — computationally heavier, but worth it for precision-sensitive use cases.
from sentence_transformers import CrossEncoder
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query: str, candidates: list[dict]) -> list[dict]:
    """Re-order candidate chunks by cross-encoder relevance, best first.

    Each candidate's "text" is scored jointly with the query; ties keep
    their original (Pinecone) order because the sort is stable.
    """
    scores = reranker.predict([(query, cand["text"]) for cand in candidates])
    order = sorted(range(len(candidates)), key=lambda i: scores[i], reverse=True)
    return [candidates[i] for i in order]
Run the initial Pinecone search with top_k=20, then re-rank and pass only the top 5 to the LLM. This maintains Pinecone’s speed advantage while improving the final answer quality significantly.