**WordLift** Vector Store
Introduction
This script demonstrates how to crawl a product website, extract relevant information, build an SEO-friendly Knowledge Graph (a structured representation of PDPs and PLPs), and leverage it for improved search and user experience.
Key Features & Libraries:
- Web scraping (Advertools)
- Knowledge Graph creation for Product Detail Pages (PDPs) and Product Listing Pages (PLPs) - WordLift
- Product recommendations (WordLift Neural Search)
- Shopping assistant creation (WordLift + LlamaIndex 🦙)
This approach enhances SEO performance and user engagement for e-commerce sites.
Learn more about how it works here:
- https://www.youtube.com/watch?v=CH-ir1MTAwQ
- https://wordlift.io/academy-entries/mastering-serp-analysis-knowledge-graphs
by Andrea Volpini and David Riccitelli (MIT License). Last updated: Jul 31st, 2024.
```python
!pip install advertools -q
!pip install -U wordlift-client  # 🎉 first time on stage 🎉
!pip install rdflib -q
```
```python
# Standard library imports
import json
import logging
import os
import re
import urllib.parse
from typing import List, Optional

# Third-party imports
import advertools as adv
import pandas as pd
import requests
import nest_asyncio

# RDFLib imports
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import SDO, Namespace, DefinedNamespace

# WordLift client imports
import wordlift_client
from wordlift_client import Configuration, ApiClient
from wordlift_client.rest import ApiException
from wordlift_client.api.dataset_api import DatasetApi
from wordlift_client.api.entities_api import EntitiesApi
from wordlift_client.api.graph_ql_api import GraphQLApi
from wordlift_client.models.graphql_request import GraphqlRequest
from wordlift_client.models.page_vector_search_query_response_item import (
    PageVectorSearchQueryResponseItem,
)
from wordlift_client.models.vector_search_query_request import (
    VectorSearchQueryRequest,
)
from wordlift_client.api.vector_search_queries_api import (
    VectorSearchQueriesApi,
)

# Asynchronous programming
import asyncio

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Apply nest_asyncio so async code can run inside the notebook's event loop
nest_asyncio.apply()

WORDLIFT_KEY = os.getenv("WORDLIFT_KEY")
OPENAI_KEY = os.getenv("OPENAI_KEY")
```
Crawl the Website w/ Advertools
```python
# Step 1: Define the website structure
# -----------------------------------

# We're working with two types of pages:
# 1. Product Listing Pages (PLP): https://product-finder.wordlift.io/product-category/bags/
# 2. Product Detail Pages (PDP): https://product-finder.wordlift.io/product/1980s-marco-polo-crossbody-bag-in-black/

# The product description can be found at this XPath:
# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()
# The price is here:
# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()
# The category is here:
# //span[contains(@class, 'breadcrumb')]/a/text()

# Step 2: Set up the crawl
# ------------------------


def crawl_website(url, output_file, num_pages=10):
    logger.info(f"Starting crawl of {url}")
    adv.crawl(
        url,
        output_file,
        follow_links=True,
        custom_settings={
            "CLOSESPIDER_PAGECOUNT": num_pages,
            "USER_AGENT": "WordLiftBot/1.0 (Maven Project)",
            "CONCURRENT_REQUESTS_PER_DOMAIN": 2,
            "DOWNLOAD_DELAY": 1,
            "ROBOTSTXT_OBEY": False,
        },
        xpath_selectors={
            "product_description": "/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()",
            "product_price": "/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()",
            "product_category": "//span[@class='posted_in']/a/text()",
        },
    )
    logger.info(f"Crawl completed. Results saved to {output_file}")


# Step 3: Analyze URL patterns
# ----------------------------


def analyze_url_patterns(df):
    df["page_type"] = df["url"].apply(
        lambda x: "PLP"
        if "/product-category/" in x
        else ("PDP" if "/product/" in x else "Other")
    )
    logger.info(
        f"Found {(df['page_type'] == 'PLP').sum()} PLPs and {(df['page_type'] == 'PDP').sum()} PDPs"
    )
    return df


# Step 4: Extract page data
# ----------------------------


def extract_page_data(df):
    extracted_data = []
    for _, row in df.iterrows():
        page = {
            "url": row["url"],
            "title": row["title"],
            "page_type": row["page_type"],
            "meta_description": row.get("meta_description", ""),
            "og_title": row.get("og_title", ""),
            "og_description": row.get("og_description", ""),
            "h1": ", ".join(row.get("h1", []))
            if isinstance(row.get("h1"), list)
            else row.get("h1", ""),
            "h2": ", ".join(row.get("h2", []))
            if isinstance(row.get("h2"), list)
            else row.get("h2", ""),
        }

        if row["page_type"] == "PDP":
            page.update(
                {
                    "product_description": ", ".join(
                        row.get("product_description", [])
                    )
                    if isinstance(row.get("product_description"), list)
                    else row.get("product_description", ""),
                    "product_price": ", ".join(row.get("product_price", []))
                    if isinstance(row.get("product_price"), list)
                    else row.get("product_price", ""),
                    "product_category": ", ".join(
                        row.get("product_category", [])
                    )
                    if isinstance(row.get("product_category"), list)
                    else row.get("product_category", ""),
                }
            )
        elif row["page_type"] == "PLP":
            # Parse the category from the H1 content
            h1_content = (
                row.get("h1", [""])[0]
                if isinstance(row.get("h1"), list)
                else row.get("h1", "")
            )
            category = (
                h1_content.split("@@")[-1]
                if "@@" in h1_content
                else h1_content.replace("Category: ", "").strip()
            )
            page["category_name"] = category

        extracted_data.append(page)

    return pd.DataFrame(extracted_data)
```
Build the KG w/ WordLift 🕸
```python
# Step 5: Configure the WordLift client
# ----------------------------

# Create a configuration object for the WordLift API client using your WordLift key.
configuration = Configuration(host="https://api.wordlift.io")
configuration.api_key["ApiKey"] = WORDLIFT_KEY
configuration.api_key_prefix["ApiKey"] = "Key"

EXAMPLE_PRIVATE_NS = Namespace("https://ns.example.org/private/")

BASE_URI = "http://data.wordlift.io/[dataset_id]/"

# Step 6: Build the KG and the embeddings
# ----------------------------


async def cleanup_knowledge_graph(api_client):
    dataset_api = wordlift_client.DatasetApi(api_client)
    try:
        # Delete all entities in the dataset
        await dataset_api.delete_all_entities()
    except Exception as e:
        print(
            "Exception when calling DatasetApi->delete_all_entities: %s\n" % e
        )


async def create_entity(entities_api, entity_data):
    g = Graph().parse(data=json.dumps(entity_data), format="json-ld")
    body = g.serialize(format="application/rdf+xml")
    await entities_api.create_or_update_entities(
        body=body, _content_type="application/rdf+xml"
    )


def replace_url(original_url: str) -> str:
    old_domain = "https://product-finder.wordlift.io/"
    new_domain = "https://data-science-with-python-for-seo.wordlift.dev/"

    if original_url.startswith(old_domain):
        return original_url.replace(old_domain, new_domain, 1)
    else:
        return original_url


def create_entity_uri(url):
    parsed_url = urllib.parse.urlparse(url)
    path = parsed_url.path.strip("/")
    path_parts = path.split("/")
    fragment = parsed_url.fragment

    if "product" in path_parts:
        # It's a product page or product offer
        product_id = path_parts[-1]  # Get the last part of the path
        if fragment == "offer":
            return f"{BASE_URI}offer_{product_id}"
        else:
            return f"{BASE_URI}product_{product_id}"
    elif "product-category" in path_parts:
        # It's a product listing page (PLP)
        category = path_parts[-1]  # Get the last part of the path
        return f"{BASE_URI}plp_{category}"
    else:
        # For any other type of page
        safe_path = "".join(c if c.isalnum() else "_" for c in path)
        if fragment == "offer":
            return f"{BASE_URI}offer_{safe_path}"
        else:
            return f"{BASE_URI}page_{safe_path}"


def clean_price(price_str):
    if not price_str or price_str == "N/A":
        return None
    if isinstance(price_str, (int, float)):
        return float(price_str)
    try:
        # Remove any non-numeric characters except for the decimal point
        cleaned_price = "".join(
            char for char in str(price_str) if char.isdigit() or char == "."
        )
        return float(cleaned_price)
    except ValueError:
        logger.warning(f"Could not convert price: {price_str}")
        return None


def create_product_entity(row, dataset_uri):
    url = replace_url(row["url"])
    product_entity_uri = create_entity_uri(url)

    entity_data = {
        "@context": "http://schema.org",
        "@type": "Product",
        "@id": product_entity_uri,
        "url": url,
        "name": row["title"]
        if not pd.isna(row["title"])
        else "Untitled Product",
        "urn:meta:requestEmbeddings": [
            "http://schema.org/name",
            "http://schema.org/description",
        ],
    }

    if not pd.isna(row.get("product_description")):
        entity_data["description"] = row["product_description"]

    if not pd.isna(row.get("product_price")):
        price = clean_price(row["product_price"])
        if price is not None:
            # Create offer ID as a sub-resource of the product ID
            offer_entity_uri = f"{product_entity_uri}/offer_1"
            entity_data["offers"] = {
                "@type": "Offer",
                "@id": offer_entity_uri,
                "price": str(price),
                "priceCurrency": "GBP",
                "availability": "http://schema.org/InStock",
                "url": url,
            }

    if not pd.isna(row.get("product_category")):
        entity_data["category"] = row["product_category"]

    custom_attributes = {
        key: row[key]
        for key in [
            "meta_description",
            "og_title",
            "og_description",
            "h1",
            "h2",
        ]
        if not pd.isna(row.get(key))
    }
    if custom_attributes:
        entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(
            custom_attributes
        )

    return entity_data


def create_collection_entity(row, dataset_uri):
    url = replace_url(row["url"])
    entity_uri = create_entity_uri(url)

    entity_data = {
        "@context": "http://schema.org",
        "@type": "CollectionPage",
        "@id": entity_uri,
        "url": url,
        "name": row["category_name"] or row["title"],
    }

    custom_attributes = {
        key: row[key]
        for key in [
            "meta_description",
            "og_title",
            "og_description",
            "h1",
            "h2",
        ]
        if row.get(key)
    }
    if custom_attributes:
        entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(
            custom_attributes
        )

    return entity_data


async def build_knowledge_graph(df, dataset_uri, api_client):
    entities_api = EntitiesApi(api_client)

    for _, row in df.iterrows():
        try:
            if row["page_type"] == "PDP":
                entity_data = create_product_entity(row, dataset_uri)
            elif row["page_type"] == "PLP":
                entity_data = create_collection_entity(row, dataset_uri)
            else:
                logger.warning(
                    f"Skipping unknown page type for URL: {row['url']}"
                )
                continue

            if entity_data is None:
                logger.warning(
                    f"Skipping page due to missing critical data: {row['url']}"
                )
                continue

            await create_entity(entities_api, entity_data)
            logger.info(
                f"Created entity for {row['page_type']}: {row['title']}"
            )
        except Exception as e:
            logger.error(
                f"Error creating entity for {row['page_type']}: {row['title']}"
            )
            logger.error(f"Error: {str(e)}")
```
Run the show
```python
# ----------------------------
# Main Execution
# ----------------------------

# Global configuration variables
CRAWL_URL = "https://product-finder.wordlift.io/"
OUTPUT_FILE = "crawl_results.jl"


async def main():
    # Step 1: Crawl the website
    crawl_website(CRAWL_URL, OUTPUT_FILE)

    # Step 2: Load the crawled data
    df = pd.read_json(OUTPUT_FILE, lines=True)

    # Step 3: Analyze URL patterns
    df = analyze_url_patterns(df)

    # Step 4: Extract page data
    pages_df = extract_page_data(df)

    async with ApiClient(configuration) as api_client:
        # Clean up the existing knowledge graph
        try:
            await cleanup_knowledge_graph(api_client)
            logger.info("Knowledge Graph Cleaned Up")
        except Exception as e:
            logger.error(
                f"Failed to clean up the existing Knowledge Graph: {str(e)}"
            )
            return  # Exit if cleanup fails

        # Build the new knowledge graph
        await build_knowledge_graph(pages_df, CRAWL_URL, api_client)

    logger.info("Knowledge graph building completed.")


if __name__ == "__main__":
    asyncio.run(main())
```
Let’s query products in the KG now using GraphQL
```python
async def perform_graphql_query(api_client):
    graphql_api = GraphQLApi(api_client)
    query = """
    {
      products(rows: 20) {
        id: iri
        category: string(name:"schema:category")
        name: string(name:"schema:name")
        description: string(name:"schema:description")
        url: string(name:"schema:url")
      }
    }
    """
    request = GraphqlRequest(query=query)

    try:
        response = await graphql_api.graphql_using_post(body=request)
        print("GraphQL Query Results:")
        print(json.dumps(response, indent=2))
    except Exception as e:
        logger.error(f"An error occurred during GraphQL query: {e}")


async with ApiClient(configuration) as api_client:
    # Step 6: Perform GraphQL query
    await perform_graphql_query(api_client)
    logger.info("Knowledge graph building and GraphQL query completed.")
```
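If you want to work with the results programmatically rather than just print them, a minimal sketch is shown below. It assumes `response` is the plain dict printed above and follows the standard GraphQL envelope (`{"data": {"products": [...]}}`); adjust the key path if the WordLift endpoint returns the product list directly.

```python
import pandas as pd


def products_to_dataframe(response: dict) -> pd.DataFrame:
    # Fall back to a top-level "products" key if there is no "data" envelope.
    products = response.get("data", {}).get("products") or response.get("products", [])
    return pd.json_normalize(products)
```

Calling `products_to_dataframe(response).head()` inside `perform_graphql_query` would show the first few products in tabular form.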
Leveraging the Knowledge Graph
Now that we have successfully created a Knowledge Graph for our e-commerce website, complete with product embeddings, we can take advantage of it to enhance user experience and functionality. The embeddings we’ve generated for each product allow us to perform semantic similarity searches and build more intelligent systems.
Adding Structured Data to your Web Pages
In this section, we will perform a simple test of WordLift’s data API. This API is used to inject structured data markup from the Knowledge Graph (KG) into your webpages. Structured data helps search engines better understand your content, potentially leading to rich snippets in search results and improved SEO.
For this notebook, we’re using a pre-configured KG on a demo e-commerce website. We’ll be referencing a fictitious URL: https://data-science-with-python-for-seo.wordlift.dev.
When calling WordLift’s data API, we simply pass a URL and receive the corresponding JSON-LD (JavaScript Object Notation for Linked Data). This structured data typically includes information such as product details, pricing, and availability for e-commerce sites.
The `get_json_ld_from_url()` function below demonstrates this process. It takes a URL as input and returns the structured data in JSON-LD format, ready to be injected into your webpage.
```python
def get_json_ld_from_url(url):
    # Construct the API URL by prefixing with 'https://api.wordlift.io/data/https/'
    api_url = "https://api.wordlift.io/data/https/" + url.replace(
        "https://", ""
    )

    # Make the GET request to the API
    response = requests.get(api_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON-LD from the response
        json_ld = response.json()
        return json_ld
    else:
        print(f"Failed to retrieve data: {response.status_code}")
        return None


def pretty_print_json(json_obj):
    # Pretty print the JSON object
    print(json.dumps(json_obj, indent=4))


# Let's run a test
url = "https://data-science-with-python-for-seo.wordlift.dev/product/100-pure-deluxe-travel-pack-duo-2/"
json_ld = get_json_ld_from_url(url)
json_ld
```
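The returned JSON-LD is meant to be embedded in the page’s HTML inside a `<script type="application/ld+json">` tag. Below is a minimal, hypothetical sketch of that step: the `render_json_ld_script` helper is not part of WordLift’s API, and in practice your templating layer would emit this tag when rendering the page.

```python
def render_json_ld_script(json_ld) -> str:
    # Wrap the structured data in the script tag that search engines parse.
    return (
        '<script type="application/ld+json">'
        + json.dumps(json_ld)
        + "</script>"
    )


# Preview the markup for the product fetched above.
if json_ld is not None:
    print(render_json_ld_script(json_ld)[:300])
```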
Generating Links of Similar Products using WordLift Neural Search
With our product embeddings in place, we can now leverage WordLift’s Neural Search capabilities to recommend similar products to users. This feature significantly enhances user engagement and can potentially boost sales by showcasing relevant products based on semantic similarity.
Unlike traditional keyword matching, semantic similarity considers the context and meaning of product descriptions. This approach allows for more nuanced and accurate recommendations, even when products don’t share exact keywords.
The `get_top_k_similar_urls` function defined below implements this functionality. It takes a product URL and returns a list of semantically similar products, ranked by their similarity scores.
For example, if a user is viewing a red cotton t-shirt, this feature might recommend other cotton t-shirts in different colors, or similar style tops made from different materials. This creates a more intuitive and engaging shopping experience for the user.
By implementing this Neural Search feature, we’re able to create a more personalized and efficient shopping experience, potentially leading to increased user satisfaction and higher conversion rates.
```python
async def get_top_k_similar_urls(configuration, query_url: str, top_k: int):
    request = VectorSearchQueryRequest(
        query_url=query_url,
        similarity_top_k=top_k,
    )

    async with wordlift_client.ApiClient(configuration) as api_client:
        api_instance = VectorSearchQueriesApi(api_client)
        try:
            page = await api_instance.create_query(
                vector_search_query_request=request
            )
            return [
                {
                    "url": item.id,
                    "name": item.text.split("\n")[0],
                    "score": item.score,
                }
                for item in page.items
                if item.id and item.text
            ]
        except Exception as e:
            logger.error(f"Error querying for entities: {e}", exc_info=True)
            return None


top_k = 10
url = "https://data-science-with-python-for-seo.wordlift.dev/product/100-mineral-sunscreen-spf-30/"
similar_urls = await get_top_k_similar_urls(
    configuration, query_url=url, top_k=top_k
)
print(json.dumps(similar_urls, indent=2))
```
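To surface these recommendations on a product page, the results can be rendered as a simple list of links. The sketch below assumes `similar_urls` has the shape returned above (a list of dicts with `url`, `name`, and `score`); the HTML structure is illustrative only.

```python
def render_similar_products(similar_urls, max_items: int = 5) -> str:
    # Build an unordered list of links for a "Similar products" block.
    items = [
        f'<li><a href="{item["url"]}">{item["name"]}</a></li>'
        for item in (similar_urls or [])[:max_items]
    ]
    return '<ul class="similar-products">\n' + "\n".join(items) + "\n</ul>"


print(render_similar_products(similar_urls))
```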
Building a Chatbot for the E-commerce Website using LlamaIndex 🦙
The Knowledge Graph we’ve created serves as a perfect foundation for building an intelligent chatbot. LlamaIndex (formerly GPT Index) is a powerful data framework that allows us to ingest, structure, and access private or domain-specific data in Large Language Models (LLMs). With LlamaIndex, we can create a context-aware chatbot that understands our product catalog and can assist customers effectively.
By leveraging LlamaIndex in conjunction with our Knowledge Graph, we can develop a chatbot that responds to direct queries. This chatbot will have an understanding of the product catalog, enabling it to:
- Answer questions about product specifications, availability, and pricing
- Make personalized product recommendations based on customer preferences
- Provide comparisons between similar products
This approach leads to more natural and helpful interactions with customers, enhancing their shopping experience. The chatbot can draw upon the structured data in our Knowledge Graph, using LlamaIndex to efficiently retrieve and present relevant information through the LLM.
In the following sections, we’ll walk through the process of setting up LlamaIndex with our Knowledge Graph data and creating a chatbot that can intelligently assist our e-commerce customers.
Installing LlamaIndex and WordLiftVectorStore 💪
```python
%%capture
!pip install llama-index
!pip install -U 'git+https://github.com/wordlift/llama_index.git#egg=llama-index-vector-stores-wordlift&subdirectory=llama-index-integrations/vector_stores/llama-index-vector-stores-wordlift'
!pip install llama-index-embeddings-nomic
```
```python
# Import the necessary modules
from llama_index.vector_stores.wordlift import WordliftVectorStore
from llama_index.core import VectorStoreIndex
```
Setting NomicEmbeddings for our Query Engine
Nomic has released v1.5 🪆🪆🪆 of their embedding model, which brings significant improvements to text embedding capabilities. Embeddings are numerical representations of text that capture semantic meaning, allowing our system to understand and compare the content of queries and documents.
Key features of Nomic v1.5 include:
- Variable-sized embeddings with dimensions between 64 and 768
- Matryoshka learning, which allows for nested representations
- An expanded context size of 8192 tokens
We use NomicEmbeddings in WordLift due to these advanced features, and now we’re configuring LlamaIndex to use it as well when encoding user queries. This consistency in embedding models across our stack ensures better alignment between our Knowledge Graph and the query understanding process.
More information on NomicEmbeddings can be found here.
Go here to get your free key.
```python
from llama_index.embeddings.nomic import NomicEmbedding

nomic_api_key = os.getenv("NOMIC_KEY")

embed_model = NomicEmbedding(
    api_key=nomic_api_key,
    dimensionality=128,
    model_name="nomic-embed-text-v1.5",
)

embedding = embed_model.get_text_embedding("Hey Ho SEO!")
len(embedding)
```
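As a quick sanity check on what semantic similarity means in practice, the sketch below embeds a few short, made-up product descriptions with the same `embed_model` and compares them with cosine similarity; related descriptions should score noticeably higher than unrelated ones.

```python
import numpy as np


def cosine_similarity(a, b) -> float:
    # Cosine similarity between two embedding vectors.
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


# Hypothetical descriptions, purely for illustration.
bag_a = embed_model.get_text_embedding("Black leather crossbody bag")
bag_b = embed_model.get_text_embedding("Small dark shoulder purse in leather")
sunscreen = embed_model.get_text_embedding("SPF 30 mineral sunscreen")

print(cosine_similarity(bag_a, bag_b))      # expected: relatively high
print(cosine_similarity(bag_a, sunscreen))  # expected: lower
```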
We will use OpenAI as the default LLM for generating responses. We could, of course, use any other available LLM.
```python
# Set the environment variable
os.environ["OPENAI_API_KEY"] = OPENAI_KEY
```
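If you prefer a different model, LlamaIndex lets you set the LLM globally via `Settings` (or pass `llm=` to `as_query_engine()`). A minimal sketch, with the model name chosen only as an example:

```python
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

# Example model name; swap in whichever LLM you prefer.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0)
```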
Let’s now set up `WordliftVectorStore` using data from our Knowledge Graph.
```python
# Let's configure WordliftVectorStore using our WordLift key
vector_store = WordliftVectorStore(key=WORDLIFT_KEY)

# Create an index from the vector store
index = VectorStoreIndex.from_vector_store(
    vector_store, embed_model=embed_model
)

# Create a query engine
query_engine = index.as_query_engine()

query1 = "Can you give me a product similar to the facial puff? Please add the URL also"
result1 = query_engine.query(query1)

print(result1)
```
```python
# Function to handle queries (named run_query to avoid shadowing the
# query_engine object created above)
def run_query(query):
    # Create an index from the vector store
    index = VectorStoreIndex.from_vector_store(
        vector_store, embed_model=embed_model
    )

    # Create a query engine
    query_engine = index.as_query_engine()
    response = query_engine.query(query)
    return response


# Interactive query loop
while True:
    user_query = input("Enter your query (or 'quit' to exit): ")
    if user_query.lower() == "quit":
        break
    result = run_query(user_query)
    print(result)
    print("\n---\n")
```