Reranking top pages from PDFs using LlamaParse and ZeroEntropy
In this guide, we’ll build a simple workflow to parse PDF documents into text using LlamaParse and then query and rerank the textual data.
Pre-requisites
- Python 3.8+
- The `zeroentropy` client
- The `llama_cloud_services` client
- A ZeroEntropy API key (Get yours here)
- A LlamaParse API key (Get yours here)
What You’ll Learn
- How to use LlamaParse to accurately convert PDF documents into text
- How to use ZeroEntropy to semantically index and query the parsed documents
- How to rerank your results using ZeroEntropy’s reranker zerank-1 to boost accuracy
Setting up your ZeroEntropy Client and LlamaParse Client
First, install dependencies:
```python
!pip install zeroentropy python-dotenv llama_cloud_services requests
```
Now load your API keys and initialize the clients:
```python
# Get your API keys from the ZeroEntropy and LlamaParse websites
# https://dashboard.zeroentropy.dev/
# https://docs.cloud.llamaindex.ai/api_key
ZEROENTROPY_API_KEY = "your_api_key_here"
LLAMAPARSE_API_KEY = "your_api_key_here"
```
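Alternatively, since `python-dotenv` was installed above, you can keep the keys out of the notebook and load them from a `.env` file. A minimal sketch, assuming a `.env` file that defines `ZEROENTROPY_API_KEY` and `LLAMAPARSE_API_KEY`:

```python
# Optional: load the API keys from a .env file instead of hardcoding them
# (assumes a .env file containing ZEROENTROPY_API_KEY and LLAMAPARSE_API_KEY)
import os
from dotenv import load_dotenv

load_dotenv()
ZEROENTROPY_API_KEY = os.getenv("ZEROENTROPY_API_KEY")
LLAMAPARSE_API_KEY = os.getenv("LLAMAPARSE_API_KEY")
```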
```python
from zeroentropy import AsyncZeroEntropy, ConflictError
from llama_cloud_services import LlamaParse
import os
```
```python
# We use the async ZeroEntropy client so we can upload and query multiple documents concurrently
# If you only need to work with a single document, you can use the synchronous client instead
zclient = AsyncZeroEntropy(api_key=ZEROENTROPY_API_KEY)
```
```python
# We initialize the LlamaParse client to parse the PDF documents into text
llamaParser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    num_workers=1,  # if multiple files are passed, split across `num_workers` API calls
    result_type="text",
    verbose=True,
    language="en",  # optionally define a language, default=en
)
```
Adding a collection to the ZeroEntropy client
```python
collection_name = "my_collection"
await zclient.collections.add(collection_name=collection_name)
```
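If you re-run this guide, the call above will raise a `ConflictError` because the collection already exists. One way to make it idempotent is to catch the `ConflictError` imported earlier, as in this small sketch:

```python
# Re-running the notebook? Catch the conflict if the collection already exists.
try:
    await zclient.collections.add(collection_name=collection_name)
except ConflictError:
    print(f"Collection '{collection_name}' already exists, reusing it")
```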
Now define a function to download and extract PDF files from Dropbox directly to memory:
```python
import requests
import zipfile
import asyncio
import io
from typing import List, Tuple


def download_and_extract_dropbox_zip_to_memory(
    url: str,
) -> List[Tuple[str, bytes]]:
    """Download and extract a zip file from a Dropbox URL directly to memory.

    Returns:
        List of tuples containing (filename, file_content_bytes)
    """
    try:
        # Download the zip file
        print(f"Downloading zip file from: {url}")
        response = requests.get(url, stream=True)
        response.raise_for_status()

        # Read zip content into memory
        zip_content = io.BytesIO()
        for chunk in response.iter_content(chunk_size=8192):
            zip_content.write(chunk)
        zip_content.seek(0)

        # Extract PDF files from the zip in memory
        files_in_memory = []
        with zipfile.ZipFile(zip_content, "r") as zip_ref:
            for file_info in zip_ref.infolist():
                if (
                    not file_info.is_dir()
                    and file_info.filename.lower().endswith(".pdf")
                ):
                    file_content = zip_ref.read(file_info.filename)
                    files_in_memory.append((file_info.filename, file_content))
                    print(
                        f"Loaded {file_info.filename} ({len(file_content)} bytes)"
                    )

        print(
            f"Successfully loaded {len(files_in_memory)} PDF files into memory"
        )
        return files_in_memory

    except Exception as e:
        print(f"Error downloading/extracting zip file: {e}")
        raise
```
```python
# Download and extract files from Dropbox directly to memory
dropbox_url = "https://www.dropbox.com/scl/fi/oi6kf91gz8h76d2wt57mb/example_docs.zip?rlkey=mf21tvyb65tyrjkr1t2szt226&dl=1"
files_in_memory = download_and_extract_dropbox_zip_to_memory(dropbox_url)
```
```
Downloading zip file from: https://www.dropbox.com/scl/fi/oi6kf91gz8h76d2wt57mb/example_docs.zip?rlkey=mf21tvyb65tyrjkr1t2szt226&dl=1
Loaded example_docs/S-P-Global-2024-Annual-Report.pdf (2434264 bytes)
Loaded example_docs/annual-report-sg-en-spy.pdf (603698 bytes)
Loaded example_docs/dashboard-sp-500-factor.pdf (1717787 bytes)
Successfully loaded 3 PDF files into memory
```
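If your PDFs are already on disk rather than in Dropbox, you can build the same `(filename, bytes)` list locally. A small sketch, assuming a local `example_docs/` folder (the folder name here is just an example):

```python
# Alternative: load local PDFs into the same (filename, bytes) format
from pathlib import Path

local_files_in_memory = [
    (str(p), p.read_bytes()) for p in Path("example_docs").glob("*.pdf")
]
```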
Parsing PDFs using LlamaParse
Now let's parse the downloaded PDF files directly in memory using LlamaParse:
```python
# Create file-like objects for LlamaParse
file_objects = []
file_names = []

for filename, file_content in files_in_memory:
    # Create a file-like object from bytes
    file_obj = io.BytesIO(file_content)
    file_obj.name = filename  # Set the name attribute for LlamaParse
    file_objects.append(file_obj)
    file_names.append(filename)
```
```python
# Parse all PDF files at once using LlamaParse
# Pass extra_info with the file name so LlamaParse can identify each in-memory file
print(f"Parsing {len(file_objects)} PDF files...")

# Use async parsing to avoid nested event loop issues
text_data = await asyncio.gather(
    *[
        llamaParser.aparse(file_obj, extra_info={"file_name": name})
        for file_obj, name in zip(file_objects, file_names)
    ]
)
print(f"Successfully parsed {len(text_data)} documents")
```
```
Parsing 3 PDF files...
Started parsing the file under job_id a1324745-c58b-4a24-b757-c6a6a58e57cd
Started parsing the file under job_id 326b947e-9d95-4dc3-aeaf-440b9cc03016
Started parsing the file under job_id b8534aa0-ed69-4079-a720-1b2471066c6f
............
Successfully parsed 3 documents
```
Organizing your documents
Once parsed, we build a list of documents, each represented as a list of its pages.
```python
docs = []

for doc in text_data:
    pages = [page.text for page in doc.pages]
    docs.append(pages)
```
print(f"Organized {len(docs)} documents with pages")if docs: print(f"First document has {len(docs[0])} pages")
```
Organized 3 documents with pages
First document has 104 pages
```
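Before indexing, it's worth spot-checking the parsed text. For example, print the first few hundred characters of the first page of the first document:

```python
# Spot-check the parsed output: first 300 characters of the first page
print(docs[0][0][:300])
```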
Uploading documents to ZeroEntropy
We'll now define functions to upload the documents as text pages asynchronously.
```python
import asyncio
from tqdm.asyncio import tqdm

sem = asyncio.Semaphore(16)
```
```python
async def add_document_with_pages(
    collection_name: str, filename: str, pages: list, doc_index: int
):
    """Add a single document with multiple pages to the collection."""
    async with sem:  # Limit concurrent operations
        for retry in range(3):  # Retry logic
            try:
                response = await zclient.documents.add(
                    collection_name=collection_name,
                    path=filename,  # Use the actual filename as the path
                    content={
                        "type": "text-pages",
                        "pages": pages,  # Send the list of page strings directly
                    },
                )
                return response
            except ConflictError:
                print(
                    f"Document '{filename}' already exists in collection '{collection_name}'"
                )
                break
            except Exception as e:
                if retry == 2:  # Last retry
                    print(f"Failed to add document '{filename}': {e}")
                    return None
                await asyncio.sleep(0.1 * (retry + 1))  # Back off briefly before retrying
```
```python
async def upload_documents_async(
    docs: list, file_names: list, collection_name: str
):
    """
    Upload documents asynchronously to a ZeroEntropy collection.

    Args:
        docs: List where docs[i] contains the list of pages (strings) for document i
        file_names: List where file_names[i] contains the path for document i
        collection_name: Name of the collection to add documents to
    """
    # Validate that the input lists have the same length
    if len(docs) != len(file_names):
        raise ValueError("docs and file_names must have the same length")

    print(f"Starting upload of {len(docs)} documents...")

    # Create tasks for all documents
    tasks = [
        add_document_with_pages(collection_name, file_names[i], docs[i], i)
        for i in range(len(docs))
    ]

    # Execute all tasks concurrently with a progress bar
    results = await tqdm.gather(*tasks, desc="Uploading Documents")

    # Count successful uploads
    successful = sum(1 for result in results if result is not None)
    print(f"Successfully uploaded {successful}/{len(docs)} documents")

    return results
```
Querying documents with ZeroEntropy
First, upload the documents to the collection:
```python
await upload_documents_async(docs, file_names, collection_name)
```
```
Starting upload of 3 documents...
Uploading Documents: 100%|██████████| 3/3 [00:00<00:00, 3.42it/s]
Successfully uploaded 3/3 documents
[DocumentAddResponse(message='Success!'), DocumentAddResponse(message='Success!'), DocumentAddResponse(message='Success!')]
```
Now query for the top 5 pages:
```python
response = await zclient.queries.top_pages(
    collection_name=collection_name,
    query="What are the top 100 stocks in the S&P 500?",
    k=5,
)
```
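Before reranking, you can peek at the initial retrieval order. A quick sketch, using the same `path`, `page_index`, and `score` fields that the rerank helper below relies on:

```python
# Inspect the initial (pre-rerank) retrieval order
print("Initial Results:")
for i, result in enumerate(response.results, 1):
    print(
        f"Rank {i}: {result.path} (Page {result.page_index}) - Score: {result.score:.4f}"
    )
```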
Now let’s define a function to rerank the pages in the response:
```python
async def rerank_top_pages_with_metadata(
    query: str, top_pages_response, collection_name: str
):
    """
    Rerank the results from a top_pages query and return a re-ordered list with metadata.

    Args:
        query: The query string to use for reranking
        top_pages_response: The response object from zclient.queries.top_pages()
        collection_name: Name of the collection to fetch page content from

    Returns:
        List of dicts with 'path', 'page_index', and 'rerank_score' in reranked order
    """
    # Fetch page content and store metadata for each result
    documents = []
    metadata = []

    for result in top_pages_response.results:
        # Fetch the actual page content
        page_info = await zclient.documents.get_page_info(
            collection_name=collection_name,
            path=result.path,
            page_index=result.page_index,
            include_content=True,
        )

        # Use the page content if it's non-empty, otherwise fall back to a placeholder
        page_content = page_info.page.content
        if page_content and page_content.strip():
            documents.append(page_content.strip())
        else:
            documents.append("No content available")
        metadata.append(
            {
                "path": result.path,
                "page_index": result.page_index,
                "original_score": result.score,
            }
        )

    if not documents:
        raise ValueError("No documents found to rerank")

    # Perform reranking with zerank-1
    rerank_response = await zclient.models.rerank(
        model="zerank-1", query=query, documents=documents
    )

    # Build the re-ordered list with metadata
    reranked_results = []
    for rerank_result in rerank_response.results:
        original_metadata = metadata[rerank_result.index]
        reranked_results.append(
            {
                "path": original_metadata["path"],
                "page_index": original_metadata["page_index"],
                "rerank_score": rerank_result.relevance_score,
            }
        )

    return reranked_results
```
Run the function and see the results!
```python
reranked_results = await rerank_top_pages_with_metadata(
    query="What are the top 100 stocks in the S&P 500?",
    top_pages_response=response,
    collection_name=collection_name,
)

# Display results
print("Reranked Results with Metadata:")
for i, result in enumerate(reranked_results, 1):
    print(
        f"Rank {i}: {result['path']} (Page {result['page_index']}) - Score: {result['rerank_score']:.4f}"
    )
```
```
Reranked Results with Metadata:
Rank 1: example_docs/dashboard-sp-500-factor.pdf (Page 9) - Score: 0.8472
Rank 2: example_docs/dashboard-sp-500-factor.pdf (Page 12) - Score: 0.8311
Rank 3: example_docs/dashboard-sp-500-factor.pdf (Page 8) - Score: 0.7941
Rank 4: example_docs/dashboard-sp-500-factor.pdf (Page 2) - Score: 0.4571
Rank 5: example_docs/dashboard-sp-500-factor.pdf (Page 4) - Score: 0.4511
```
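To sanity-check the winner, you can pull the content of the top reranked page with the same `get_page_info` call used inside the rerank helper. A short sketch:

```python
# Preview the content of the top-ranked page after reranking
top = reranked_results[0]
page_info = await zclient.documents.get_page_info(
    collection_name=collection_name,
    path=top["path"],
    page_index=top["page_index"],
    include_content=True,
)
print((page_info.page.content or "")[:500])  # first 500 characters
```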
✅ That’s It!
You've now built a working semantic search engine that processes PDF files entirely in memory using ZeroEntropy and LlamaParse, with no local file storage required!