Skip to content

llama-parse is async-first; running its async code in a notebook requires the use of nest_asyncio.

import nest_asyncio

nest_asyncio.apply()

import os

# API credentials for the services used below. Neither LlamaParse(...) nor
# OpenAI(...) is passed a key explicitly in this notebook, so they are
# expected to pick these up from the environment.
# Replace the placeholders with your own keys; do not commit real keys.
os.environ["LLAMA_CLOUD_API_KEY"] = "llx-..."

os.environ["OPENAI_API_KEY"] = "sk-..."

#### Setting LLM and Embedding Model
```python
from llama_index.core import Settings, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Models used throughout the notebook: one chat LLM and one embedding model.
llm = OpenAI(model="gpt-3.5-turbo-0125")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Register both on the shared Settings object.
# NOTE(review): per the LlamaIndex docs these act as defaults for components
# that are not handed a model explicitly -- confirm for your installed version.
Settings.llm = llm
Settings.embed_model = embed_model

```

We compare two different retrieval/query-engine strategies:

  1. Using raw Markdown text as nodes for building index and applying a simple query engine for generating results.
  2. Using MarkdownElementNodeParser for parsing the LlamaParse output Markdown results and building a recursive retriever query engine for generation.
# LlamaParse PDF reader for PDF Parsing
from llama_parse import LlamaParse

# Parse the Uber 10-Q PDF and ask for the result as Markdown; `documents`
# is indexed below (documents[0].text holds the parsed Markdown).
documents = LlamaParse(
    result_type="markdown",
).load_data("./uber_10q_march_2022.pdf")
# Started parsing the file under job_id b76a572b-d2bb-42ae-bad9-b9810049f1af
Started parsing the file under job_id 0ef2f65b-9cab-4ca8-b221-d20f1f6d1336
print(documents[0].text[:1000] + "...")
# UNITED STATES SECURITIES AND EXCHANGE COMMISSION
# Washington, D.C. 20549
# FORM 10-Q
(Mark One)
☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the quarterly period ended March 31, 2022
OR
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from_____ to _____
Commission File Number: 001-38902
# UBER TECHNOLOGIES, INC.
(Exact name of registrant as specified in its charter)
Not Applicable
(Former name, former address and former fiscal year, if changed since last report)
|Delaware|45-2647441|
|---|---|
|(State or other jurisdiction of incorporation or organization)|(I.R.S. Employer Identification No.)|
|1515 3rd Street|San Francisco, California 94158|
|(Address of principal executive offices, including zip code)|(415) 612-8582|
|(Registrant’s telephone number, including area code)| |
# Securities registered pursuant to Section 12(b) of the Act:
|Title of each c...
from llama_index.core.node_parser import MarkdownElementNodeParser

# Element-aware parser for the LlamaParse Markdown output. It is given its own
# gpt-3.5-turbo-0125 client; the resulting nodes carry table summaries and
# column schemas (see the node reprs printed further down).
# num_workers=8: presumably the degree of parallelism -- confirm in the API docs.
node_parser = MarkdownElementNodeParser(
    llm=OpenAI(model="gpt-3.5-turbo-0125"),
    num_workers=8,
)

# Turn the parsed documents into nodes.
nodes = node_parser.get_nodes_from_documents(documents)
3it [00:00, 41803.69it/s]
1it [00:00, 22310.13it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 20867.18it/s]
1it [00:00, 22429.43it/s]
1it [00:00, 21399.51it/s]
1it [00:00, 20460.02it/s]
1it [00:00, 19508.39it/s]
1it [00:00, 19508.39it/s]
5it [00:00, 85598.04it/s]
0it [00:00, ?it/s]
2it [00:00, 41527.76it/s]
2it [00:00, 46091.25it/s]
2it [00:00, 40524.68it/s]
2it [00:00, 38836.15it/s]
2it [00:00, 42366.71it/s]
2it [00:00, 41943.04it/s]
1it [00:00, 23967.45it/s]
1it [00:00, 24818.37it/s]
1it [00:00, 25890.77it/s]
4it [00:00, 72628.64it/s]
2it [00:00, 38836.15it/s]
3it [00:00, 41943.04it/s]
0it [00:00, ?it/s]
3it [00:00, 58254.22it/s]
3it [00:00, 53773.13it/s]
1it [00:00, 25575.02it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 26051.58it/s]
1it [00:00, 21509.25it/s]
0it [00:00, ?it/s]
1it [00:00, 16008.79it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
2it [00:00, 42153.81it/s]
4it [00:00, 76260.07it/s]
5it [00:00, 75166.74it/s]
2it [00:00, 39383.14it/s]
2it [00:00, 39756.44it/s]
1it [00:00, 24244.53it/s]
2it [00:00, 42153.81it/s]
0it [00:00, ?it/s]
1it [00:00, 23045.63it/s]
1it [00:00, 23431.87it/s]
1it [00:00, 24528.09it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 10810.06it/s]
2it [00:00, 8473.34it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 12633.45it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
# Separate the parsed nodes into two collections: `text_nodes` and
# `index_nodes` (the reprs shown below are a TextNode and an IndexNode whose
# `obj` wraps a table's TextNode).
text_nodes, index_nodes = node_parser.get_nodes_and_objects(nodes)
# Notebook cell expression: display the first text node.
text_nodes[0]
TextNode(id_='c6ffea61-1221-40e3-b0e0-5b24cfbd02d5', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='33b7b29c-8eba-458b-a25f-bb8f88951e92', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='3d4ec5b02a042598b0ea47cdac56453869c17b531a10f60343e9598e05a9390e'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='de618b65-c78a-4390-8536-4e9e295c0e49', node_type=<ObjectType.INDEX: '3'>, metadata={'col_schema': 'Column: Delaware\nType: string\nSummary: State or other jurisdiction of incorporation or organization\n\nColumn: 45-2647441\nType: string\nSummary: I.R.S. Employer Identification No.'}, hash='c008153189b8dd031a3e5e694239a50ebd21f42602676f072d9746241fcef858')}, text='UNITED STATES SECURITIES AND EXCHANGE COMMISSION\n\n Washington, D.C. 20549\n\n FORM 10-Q\n\n(Mark One)\n\n☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\nFor the quarterly period ended March 31, 2022\n\nOR\n\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\nFor the transition period from_____ to _____\n\nCommission File Number: 001-38902\n\n UBER TECHNOLOGIES, INC.\n\n(Exact name of registrant as specified in its charter)\n\nNot Applicable\n\n(Former name, former address and former fiscal year, if changed since last report)', mimetype='text/plain', start_char_idx=1, end_char_idx=595, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')
index_nodes[0]
IndexNode(id_='de618b65-c78a-4390-8536-4e9e295c0e49', embedding=None, metadata={'col_schema': 'Column: Delaware\nType: string\nSummary: State or other jurisdiction of incorporation or organization\n\nColumn: 45-2647441\nType: string\nSummary: I.R.S. Employer Identification No.'}, excluded_embed_metadata_keys=['col_schema'], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='33b7b29c-8eba-458b-a25f-bb8f88951e92', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='3d4ec5b02a042598b0ea47cdac56453869c17b531a10f60343e9598e05a9390e'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='c6ffea61-1221-40e3-b0e0-5b24cfbd02d5', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='0cafbb2bbffe3085738e748c9ed19c5b88f6b300d876820fc3caa7afa8f0627f'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='c57f8dab-7b69-4850-8885-6a9cf0f531f9', node_type=<ObjectType.TEXT: '1'>, metadata={'table_df': "{'Delaware': {0: '(State or other jurisdiction of incorporation or organization)', 1: '1515 3rd Street', 2: '(Address of principal executive offices, including zip code)', 3: '(Registrant’s telephone number, including area code)'}, '45-2647441': {0: '(I.R.S. Employer Identification No.)', 1: 'San Francisco, California 94158', 2: '(415) 612-8582', 3: ' '}}", 'table_summary': "Table providing information about a company's incorporation details, address of principal executive offices, and contact information.,\nwith the following columns:\n- Delaware: State or other jurisdiction of incorporation or organization\n- 45-2647441: I.R.S. Employer Identification No.\n"}, hash='fadc844962620525c1d3c8d7ff1693a090642818928f9ce7600117258a39aa04')}, text="Table providing information about a company's incorporation details, address of principal executive offices, and contact information.,\nwith the following columns:\n- Delaware: State or other jurisdiction of incorporation or organization\n- 45-2647441: I.R.S. 
Employer Identification No.\n", mimetype='text/plain', start_char_idx=601, end_char_idx=919, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n', index_id='c57f8dab-7b69-4850-8885-6a9cf0f531f9', obj=TextNode(id_='c57f8dab-7b69-4850-8885-6a9cf0f531f9', embedding=None, metadata={'table_df': "{'Delaware': {0: '(State or other jurisdiction of incorporation or organization)', 1: '1515 3rd Street', 2: '(Address of principal executive offices, including zip code)', 3: '(Registrant’s telephone number, including area code)'}, '45-2647441': {0: '(I.R.S. Employer Identification No.)', 1: 'San Francisco, California 94158', 2: '(415) 612-8582', 3: ' '}}", 'table_summary': "Table providing information about a company's incorporation details, address of principal executive offices, and contact information.,\nwith the following columns:\n- Delaware: State or other jurisdiction of incorporation or organization\n- 45-2647441: I.R.S. Employer Identification No.\n"}, excluded_embed_metadata_keys=['table_df', 'table_summary'], excluded_llm_metadata_keys=['table_df', 'table_summary'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='33b7b29c-8eba-458b-a25f-bb8f88951e92', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='3d4ec5b02a042598b0ea47cdac56453869c17b531a10f60343e9598e05a9390e'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='de618b65-c78a-4390-8536-4e9e295c0e49', node_type=<ObjectType.INDEX: '3'>, metadata={'col_schema': 'Column: Delaware\nType: string\nSummary: State or other jurisdiction of incorporation or organization\n\nColumn: 45-2647441\nType: string\nSummary: I.R.S. 
Employer Identification No.'}, hash='c008153189b8dd031a3e5e694239a50ebd21f42602676f072d9746241fcef858'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='c0fa90a9-fe14-46fa-8434-cd70e134d40e', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='cc6b1c09572e3a0bd06a93ccea32a5562b36a393a36ede3852f4a5fc946c51fd')}, text="Table providing information about a company's incorporation details, address of principal executive offices, and contact information.,\nwith the following columns:\n- Delaware: State or other jurisdiction of incorporation or organization\n- 45-2647441: I.R.S. Employer Identification No.\n\n|Delaware|45-2647441|\n|---|---|\n|(State or other jurisdiction of incorporation or organization)|(I.R.S. Employer Identification No.)|\n|1515 3rd Street|San Francisco, California 94158|\n|(Address of principal executive offices, including zip code)|(415) 612-8582|\n|(Registrant’s telephone number, including area code)| |\n", mimetype='text/plain', start_char_idx=601, end_char_idx=919, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'))
# Strategy 2: index built from the element-parsed text nodes plus index nodes.
recursive_index = VectorStoreIndex(nodes=text_nodes + index_nodes)

# Strategy 1: index built directly from the raw Markdown documents.
raw_index = VectorStoreIndex.from_documents(documents)

from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
)

# Reranker shared by both query engines; top_n=5 is the number of nodes it
# keeps. The model files are downloaded on first use (see the progress
# output below).
reranker = FlagEmbeddingReranker(
    top_n=5,
    model="BAAI/bge-reranker-large",
)
tokenizer_config.json: 0%| | 0.00/443 [00:00<?, ?B/s]
sentencepiece.bpe.model: 0%| | 0.00/5.07M [00:00<?, ?B/s]
tokenizer.json: 0%| | 0.00/17.1M [00:00<?, ?B/s]
special_tokens_map.json: 0%| | 0.00/279 [00:00<?, ?B/s]
config.json: 0%| | 0.00/801 [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/2.24G [00:00<?, ?B/s]
# Both engines retrieve 15 candidates and pass them through the same reranker,
# so the comparison differs only in how the index was built.

# verbose=True makes this engine print a "Retrieval entering ..." trace.
recursive_query_engine = recursive_index.as_query_engine(
    similarity_top_k=15,
    node_postprocessors=[reranker],
    verbose=True,
)

raw_query_engine = raw_index.as_query_engine(
    similarity_top_k=15,
    node_postprocessors=[reranker],
)

We compare the base query engine with the recursive query engine on queries about tables.

### Table Query Task: Queries for Table Question Answering
# Same question is sent to both engines so the answers can be compared.
query = "What is the change of free cash flow and what is the rate from the financial and operational highlights?"

# Baseline: index over the raw Markdown documents.
response_1 = raw_query_engine.query(query)
print("\n************New LlamaParse+ Basic Query Engine************")
print(response_1)

# Recursive retriever: index over element-parsed nodes (prints its retrieval
# trace before the banner because the engine was created with verbose=True).
response_2 = recursive_query_engine.query(query)
print("\n************New LlamaParse+ Recursive Retriever Query Engine************")
print(response_2)
************New LlamaParse+ Basic Query Engine************
The change in free cash flow from the financial and operational highlights is an increase of $826 million, from a net cash used in operating activities of $611 million in 2021 to net cash provided by operating activities of $215 million in 2022. The rate of this change is a positive improvement.
Retrieval entering 015f9778-1f7c-44cd-9e26-90f2c9e21550: TextNode
Retrieving from object TextNode with query What is the change of free cash flow and what is the rate from the financial and operational highlights?
Retrieval entering 5e8febd0-0c43-4552-9499-9465674b8877: TextNode
Retrieving from object TextNode with query What is the change of free cash flow and what is the rate from the financial and operational highlights?
Retrieval entering d3c8d59b-9d7e-4088-94e9-3e58aba09f10: TextNode
Retrieving from object TextNode with query What is the change of free cash flow and what is the rate from the financial and operational highlights?
Retrieval entering 25385e8f-24df-4660-959b-c499cc220246: TextNode
Retrieving from object TextNode with query What is the change of free cash flow and what is the rate from the financial and operational highlights?
Retrieval entering 6b48fe6f-a60e-425a-aba5-22b62d1b4512: TextNode
Retrieving from object TextNode with query What is the change of free cash flow and what is the rate from the financial and operational highlights?
Retrieval entering aa60761d-1d8a-4896-aef9-e41553f17558: TextNode
Retrieving from object TextNode with query What is the change of free cash flow and what is the rate from the financial and operational highlights?
Retrieval entering d0634d58-0589-47cb-9921-d2d57d88240f: TextNode
Retrieving from object TextNode with query What is the change of free cash flow and what is the rate from the financial and operational highlights?
Retrieval entering 492b36c1-8b39-4db9-8dca-dd9f6c488a9d: TextNode
Retrieving from object TextNode with query What is the change of free cash flow and what is the rate from the financial and operational highlights?

************New LlamaParse+ Recursive Retriever Query Engine************
The change in free cash flow from 2021 to 2022 is an increase of $635 million. This change represents a significant improvement in free cash flow performance over the period.