DynamoDB Docstore Demo
This guide shows you how to use our DocumentStore abstraction backed by DynamoDB. Putting nodes in the docstore lets you define multiple indices over the same underlying data, instead of duplicating that data across indices.
If you're opening this notebook on Colab, you will probably need to install LlamaIndex 🦙.
%pip install llama-index-storage-docstore-dynamodb
%pip install llama-index-storage-index-store-dynamodb
%pip install llama-index-vector-stores-dynamodb
%pip install llama-index-llms-openai
!pip install llama-index
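This demo uses OpenAI for the LLM and embeddings, so make sure your OpenAI API key is available in the environment. The key below is a placeholder; substitute your own.

import os

# Placeholder: set your own OpenAI API key before running the rest of the notebook
os.environ["OPENAI_API_KEY"] = "sk-..."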
import nest_asyncio
nest_asyncio.apply()
import logging
import sys
import os
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex, SimpleKeywordTableIndex
from llama_index.core import SummaryIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.response.notebook_utils import display_response
from llama_index.core import Settings
Download Data
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
Load Documents
reader = SimpleDirectoryReader("./data/paul_graham/")
documents = reader.load_data()
Parse into Nodes
from llama_index.core.node_parser import SentenceSplitter
nodes = SentenceSplitter().get_nodes_from_documents(documents)
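As an optional, illustrative sanity check, you can see how many nodes the splitter produced and peek at the first one:

# Illustrative sanity check on the parsed nodes
print(f"Parsed {len(nodes)} nodes")
print(nodes[0].get_content()[:200])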
Add to Docstore
TABLE_NAME = os.environ["DYNAMODB_TABLE_NAME"]
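The table name is read from the DYNAMODB_TABLE_NAME environment variable, and the table itself is assumed to already exist, with your AWS credentials and region configured. If you still need to create one, the sketch below is one way to do it with boto3; the table name is hypothetical, and the key schema (a "collection" partition key plus a "key" sort key) is an assumption about what the DynamoDB-backed stores write.

import boto3

# Illustrative one-time setup; assumes AWS credentials/region are configured and
# that the stores expect a "collection" partition key and a "key" sort key.
dynamodb = boto3.client("dynamodb")
dynamodb.create_table(
    TableName="llama-index-demo",  # hypothetical table name
    KeySchema=[
        {"AttributeName": "collection", "KeyType": "HASH"},
        {"AttributeName": "key", "KeyType": "RANGE"},
    ],
    AttributeDefinitions=[
        {"AttributeName": "collection", "AttributeType": "S"},
        {"AttributeName": "key", "AttributeType": "S"},
    ],
    BillingMode="PAY_PER_REQUEST",
)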
from llama_index.storage.docstore.dynamodb import DynamoDBDocumentStore
from llama_index.storage.index_store.dynamodb import DynamoDBIndexStore
from llama_index.vector_stores.dynamodb import DynamoDBVectorStore
storage_context = StorageContext.from_defaults(
    docstore=DynamoDBDocumentStore.from_table_name(table_name=TABLE_NAME),
    index_store=DynamoDBIndexStore.from_table_name(table_name=TABLE_NAME),
    vector_store=DynamoDBVectorStore.from_table_name(table_name=TABLE_NAME),
)
storage_context.docstore.add_documents(nodes)
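To confirm the nodes landed in the DynamoDB-backed docstore, you can fetch one back by its ID (illustrative):

# Illustrative: retrieve a node from the docstore by its ID
node_id = nodes[0].node_id
retrieved_node = storage_context.docstore.get_node(node_id)
print(retrieved_node.get_content()[:100])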
Define & Add Multiple Indexes
Each index is built over the same underlying nodes.
# https://gpt-index.readthedocs.io/en/latest/api_reference/indices/list.html
summary_index = SummaryIndex(nodes, storage_context=storage_context)
# https://gpt-index.readthedocs.io/en/latest/api_reference/indices/vector_store.html
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
# https://gpt-index.readthedocs.io/en/latest/api_reference/indices/table.html
keyword_table_index = SimpleKeywordTableIndex(
    nodes, storage_context=storage_context
)
# NOTE: the docstore still has the same nodes
len(storage_context.docstore.docs)
Test out saving and loading
# NOTE: the docstore, index store, and vector store are persisted in DynamoDB by default when they are created
# NOTE: you can also persist the simple vector store to disk by using the command below
storage_context.persist()
# note down index IDs
list_id = summary_index.index_id
vector_id = vector_index.index_id
keyword_id = keyword_table_index.index_id
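In a real application you would persist these IDs somewhere so another process can reload the indices later; for example, a hypothetical JSON file:

import json

# Hypothetical: save the index IDs so they can be reused in a later session
with open("index_ids.json", "w") as f:
    json.dump(
        {"summary": list_id, "vector": vector_id, "keyword": keyword_id}, f
    )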
from llama_index.core import load_index_from_storage
# re-create storage context
storage_context = StorageContext.from_defaults(
    docstore=DynamoDBDocumentStore.from_table_name(table_name=TABLE_NAME),
    index_store=DynamoDBIndexStore.from_table_name(table_name=TABLE_NAME),
    vector_store=DynamoDBVectorStore.from_table_name(table_name=TABLE_NAME),
)
summary_index = load_index_from_storage(
    storage_context=storage_context, index_id=list_id
)
keyword_table_index = load_index_from_storage(
    storage_context=storage_context, index_id=keyword_id
)
# NOTE: you need to add "vector_store=DynamoDBVectorStore.from_table_name(table_name=TABLE_NAME)" to the StorageContext to load the vector index from DynamoDB
vector_index = load_index_from_storage(
    storage_context=storage_context, index_id=vector_id
)
Test out some Queries
chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo")
Settings.llm = chatgpt
Settings.chunk_size = 1024
query_engine = summary_index.as_query_engine()
list_response = query_engine.query("What is a summary of this document?")
display_response(list_response)
query_engine = vector_index.as_query_engine()
vector_response = query_engine.query("What did the author do growing up?")
display_response(vector_response)
query_engine = keyword_table_index.as_query_engine()
keyword_response = query_engine.query(
    "What did the author do after his time at YC?"
)
display_response(keyword_response)
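You can also inspect which nodes from the shared docstore backed a given answer; for example (illustrative), for the vector query:

# Illustrative: show the nodes retrieved for the vector query and their scores
for node_with_score in vector_response.source_nodes:
    print(node_with_score.node.node_id, node_with_score.score)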