Skip to content

Ray

RayIngestionPipeline #

Bases: IngestionPipeline

An ingestion pipeline that can be applied to data using a Ray cluster.

Parameters:

Name Type Description Default
name str

Unique name of the ingestion pipeline. Defaults to DEFAULT_PIPELINE_NAME.

DEFAULT_PIPELINE_NAME
project_name str

Unique name of the project. Defaults to DEFAULT_PROJECT_NAME.

DEFAULT_PROJECT_NAME
transformations List[RayTransformComponent]

Ray transformations to apply to the data. Defaults to None.

None
documents Optional[Sequence[Document]]

Documents to ingest. Defaults to None.

None
readers Optional[List[ReaderConfig]]

Reader to use to read the data. Defaults to None.

None
vector_store Optional[BasePydanticVectorStore]

Vector store to use to store the data. Defaults to None.

None
docstore Optional[BaseDocumentStore]

Document store to use for de-duping with a vector store. Defaults to None.

None
docstore_strategy DocstoreStrategy

Document de-dup strategy. Defaults to DocstoreStrategy.UPSERTS.

UPSERTS

Examples:

import ray
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.extractors import TitleExtractor
from llama_index.ingestion.ray import RayIngestionPipeline, RayTransformComponent

# Start a new cluster (or connect to an existing one)
ray.init()

# Create transformations
transformations=[
    RayTransformComponent(
        transform_class=TitleExtractor,
        map_batches_kwargs={
            "batch_size": 10,  # Define the batch size
        },
    ),
    RayTransformComponent(
        transform_class=OpenAIEmbedding,
        map_batches_kwargs={
            "batch_size": 10,
        },
    ),
]

# Create the Ray ingestion pipeline
pipeline = RayIngestionPipeline(
    transformations=transformations
)

# Run the pipeline with many documents
nodes = pipeline.run(documents=[Document.example()] * 100)
Source code in .build/python/llama-index-integrations/ingestion/llama-index-ingestion-ray/llama_index/ingestion/ray/base.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
class RayIngestionPipeline(IngestionPipeline):
    """
    An ingestion pipeline that can be applied to data using a Ray cluster.

    Args:
        name (str, optional):
            Unique name of the ingestion pipeline. Defaults to DEFAULT_PIPELINE_NAME.
        project_name (str, optional):
            Unique name of the project. Defaults to DEFAULT_PROJECT_NAME.
        transformations (List[RayTransformComponent], optional):
            Ray transformations to apply to the data. Defaults to None.
        documents (Optional[Sequence[Document]], optional):
            Documents to ingest. Defaults to None.
        readers (Optional[List[ReaderConfig]], optional):
            Reader to use to read the data. Defaults to None.
        vector_store (Optional[BasePydanticVectorStore], optional):
            Vector store to use to store the data. Defaults to None.
        docstore (Optional[BaseDocumentStore], optional):
            Document store to use for de-duping with a vector store. Defaults to None.
        docstore_strategy (DocstoreStrategy, optional):
            Document de-dup strategy. Defaults to DocstoreStrategy.UPSERTS.

    Examples:
        ```python
        import ray
        from llama_index.core import Document
        from llama_index.embeddings.openai import OpenAIEmbedding
        from llama_index.core.extractors import TitleExtractor
        from llama_index.ingestion.ray import RayIngestionPipeline, RayTransformComponent

        # Start a new cluster (or connect to an existing one)
        ray.init()

        # Create transformations
        transformations=[
            RayTransformComponent(
                transform_class=TitleExtractor,
                map_batches_kwargs={
                    "batch_size": 10,  # Define the batch size
                },
            ),
            RayTransformComponent(
                transform_class=OpenAIEmbedding,
                map_batches_kwargs={
                    "batch_size": 10,
                },
            ),
        ]

        # Create the Ray ingestion pipeline
        pipeline = RayIngestionPipeline(
            transformations=transformations
        )

        # Run the pipeline with many documents
        nodes = pipeline.run(documents=[Document.example()] * 100)
        ```

    """

    # Overrides the parent field so transformations are Ray-aware wrappers.
    transformations: List[RayTransformComponent] = Field(
        description="Transformations to apply to the data with Ray"
    )

    def __init__(
        self,
        name: str = DEFAULT_PIPELINE_NAME,
        project_name: str = DEFAULT_PROJECT_NAME,
        transformations: Optional[List[RayTransformComponent]] = None,
        readers: Optional[List[ReaderConfig]] = None,
        documents: Optional[Sequence[Document]] = None,
        vector_store: Optional[BasePydanticVectorStore] = None,
        docstore: Optional[BaseDocumentStore] = None,
        docstore_strategy: DocstoreStrategy = DocstoreStrategy.UPSERTS,
    ) -> None:
        # Call BaseModel.__init__ directly (bypassing IngestionPipeline.__init__)
        # so the parent's transformation defaults are not applied.
        BaseModel.__init__(
            self,
            name=name,
            project_name=project_name,
            transformations=transformations,
            readers=readers,
            documents=documents,
            vector_store=vector_store,
            cache=IngestionCache(),
            docstore=docstore,
            docstore_strategy=docstore_strategy,
            disable_cache=True,  # Caching is disabled as Ray processes transformations lazily
        )

    @dispatcher.span
    def run(
        self,
        show_progress: bool = False,
        documents: Optional[List[Document]] = None,
        nodes: Optional[Sequence[BaseNode]] = None,
        store_doc_text: bool = True,
        **kwargs: Any,
    ) -> Sequence[BaseNode]:
        """
        Run a series of transformations on a set of nodes.

        If a vector store is provided, nodes with embeddings will be added to the vector store.

        If a vector store + docstore are provided, the docstore will be used to de-duplicate documents.

        Args:
            show_progress (bool, optional): Shows execution progress bar(s). Defaults to False.
            documents (Optional[List[Document]], optional): Set of documents to be transformed. Defaults to None.
            nodes (Optional[Sequence[BaseNode]], optional): Set of nodes to be transformed. Defaults to None.
            store_doc_text (bool, optional): Whether to store the document texts. Defaults to True.

        Returns:
            Sequence[BaseNode]: The set of transformed Nodes/Documents

        """
        input_nodes = self._prepare_inputs(documents, nodes)

        # check if we need to dedup
        if self.docstore is not None and self.vector_store is not None:
            if self.docstore_strategy in (
                DocstoreStrategy.UPSERTS,
                DocstoreStrategy.UPSERTS_AND_DELETE,
            ):
                # Propagate store_doc_text so the dedup handlers honor it,
                # mirroring the async path in `arun`.
                nodes_to_run = self._handle_upserts(
                    input_nodes, store_doc_text=store_doc_text
                )
            elif self.docstore_strategy == DocstoreStrategy.DUPLICATES_ONLY:
                nodes_to_run = self._handle_duplicates(
                    input_nodes, store_doc_text=store_doc_text
                )
            else:
                raise ValueError(f"Invalid docstore strategy: {self.docstore_strategy}")
        elif self.docstore is not None and self.vector_store is None:
            # Upsert strategies require a vector store; fall back to
            # duplicates-only de-duplication when none is configured.
            if self.docstore_strategy == DocstoreStrategy.UPSERTS:
                logger.info(
                    "Docstore strategy set to upserts, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            elif self.docstore_strategy == DocstoreStrategy.UPSERTS_AND_DELETE:
                logger.info(
                    "Docstore strategy set to upserts and delete, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            nodes_to_run = self._handle_duplicates(
                input_nodes, store_doc_text=store_doc_text
            )
        else:
            nodes_to_run = input_nodes

        nodes = run_transformations(
            nodes_to_run,
            self.transformations,
            show_progress=show_progress,
            **kwargs,
        )

        if self.vector_store is not None:
            # Only nodes that actually carry embeddings can be stored.
            nodes_with_embeddings = [n for n in nodes if n.embedding is not None]
            if nodes_with_embeddings:
                self.vector_store.add(nodes_with_embeddings)

        if self.docstore is not None:
            self._update_docstore(nodes_to_run, store_doc_text=store_doc_text)

        return nodes

    @dispatcher.span
    async def arun(
        self,
        show_progress: bool = False,
        documents: Optional[List[Document]] = None,
        nodes: Optional[Sequence[BaseNode]] = None,
        store_doc_text: bool = True,
        **kwargs: Any,
    ) -> Sequence[BaseNode]:
        """
        Run a series of transformations on a set of nodes.

        If a vector store is provided, nodes with embeddings will be added to the vector store.

        If a vector store + docstore are provided, the docstore will be used to de-duplicate documents.

        Args:
            show_progress (bool, optional): Shows execution progress bar(s). Defaults to False.
            documents (Optional[List[Document]], optional): Set of documents to be transformed. Defaults to None.
            nodes (Optional[Sequence[BaseNode]], optional): Set of nodes to be transformed. Defaults to None.
            store_doc_text (bool, optional): Whether to store the document texts. Defaults to True.

        Returns:
            Sequence[BaseNode]: The set of transformed Nodes/Documents

        """
        input_nodes = self._prepare_inputs(documents, nodes)

        # check if we need to dedup
        if self.docstore is not None and self.vector_store is not None:
            if self.docstore_strategy in (
                DocstoreStrategy.UPSERTS,
                DocstoreStrategy.UPSERTS_AND_DELETE,
            ):
                nodes_to_run = await self._ahandle_upserts(
                    input_nodes, store_doc_text=store_doc_text
                )
            elif self.docstore_strategy == DocstoreStrategy.DUPLICATES_ONLY:
                nodes_to_run = await self._ahandle_duplicates(
                    input_nodes, store_doc_text=store_doc_text
                )
            else:
                raise ValueError(f"Invalid docstore strategy: {self.docstore_strategy}")
        elif self.docstore is not None and self.vector_store is None:
            # Upsert strategies require a vector store; fall back to
            # duplicates-only de-duplication when none is configured.
            if self.docstore_strategy == DocstoreStrategy.UPSERTS:
                logger.info(
                    "Docstore strategy set to upserts, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            elif self.docstore_strategy == DocstoreStrategy.UPSERTS_AND_DELETE:
                logger.info(
                    "Docstore strategy set to upserts and delete, but no vector store. "
                    "Switching to duplicates_only strategy."
                )
                self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
            nodes_to_run = await self._ahandle_duplicates(
                input_nodes, store_doc_text=store_doc_text
            )

        else:
            nodes_to_run = input_nodes

        nodes = await arun_transformations(  # type: ignore
            nodes_to_run,
            self.transformations,
            show_progress=show_progress,
            **kwargs,
        )

        if self.vector_store is not None:
            # Only nodes that actually carry embeddings can be stored.
            nodes_with_embeddings = [n for n in nodes if n.embedding is not None]
            if nodes_with_embeddings:
                await self.vector_store.async_add(nodes_with_embeddings)

        if self.docstore is not None:
            await self._aupdate_docstore(nodes_to_run, store_doc_text=store_doc_text)

        return nodes

run #

run(
    show_progress: bool = False,
    documents: Optional[List[Document]] = None,
    nodes: Optional[Sequence[BaseNode]] = None,
    store_doc_text: bool = True,
    **kwargs: Any
) -> Sequence[BaseNode]

Run a series of transformations on a set of nodes.

If a vector store is provided, nodes with embeddings will be added to the vector store.

If a vector store + docstore are provided, the docstore will be used to de-duplicate documents.

Parameters:

Name Type Description Default
show_progress bool

Shows execution progress bar(s). Defaults to False.

False
documents Optional[List[Document]]

Set of documents to be transformed. Defaults to None.

None
nodes Optional[Sequence[BaseNode]]

Set of nodes to be transformed. Defaults to None.

None
store_doc_text bool

Whether to store the document texts. Defaults to True.

True

Returns:

Type Description
Sequence[BaseNode]

Sequence[BaseNode]: The set of transformed Nodes/Documents

Source code in .build/python/llama-index-integrations/ingestion/llama-index-ingestion-ray/llama_index/ingestion/ray/base.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
@dispatcher.span
def run(
    self,
    show_progress: bool = False,
    documents: Optional[List[Document]] = None,
    nodes: Optional[Sequence[BaseNode]] = None,
    store_doc_text: bool = True,
    **kwargs: Any,
) -> Sequence[BaseNode]:
    """
    Run a series of transformations on a set of nodes.

    If a vector store is provided, nodes with embeddings will be added to the vector store.

    If a vector store + docstore are provided, the docstore will be used to de-duplicate documents.

    Args:
        show_progress (bool, optional): Shows execution progress bar(s). Defaults to False.
        documents (Optional[List[Document]], optional): Set of documents to be transformed. Defaults to None.
        nodes (Optional[Sequence[BaseNode]], optional): Set of nodes to be transformed. Defaults to None.
        store_doc_text (bool, optional): Whether to store the document texts. Defaults to True.

    Returns:
        Sequence[BaseNode]: The set of transformed Nodes/Documents

    """
    input_nodes = self._prepare_inputs(documents, nodes)

    # check if we need to dedup
    if self.docstore is not None and self.vector_store is not None:
        if self.docstore_strategy in (
            DocstoreStrategy.UPSERTS,
            DocstoreStrategy.UPSERTS_AND_DELETE,
        ):
            # Propagate store_doc_text so the dedup handlers honor it,
            # mirroring the async path in `arun`.
            nodes_to_run = self._handle_upserts(
                input_nodes, store_doc_text=store_doc_text
            )
        elif self.docstore_strategy == DocstoreStrategy.DUPLICATES_ONLY:
            nodes_to_run = self._handle_duplicates(
                input_nodes, store_doc_text=store_doc_text
            )
        else:
            raise ValueError(f"Invalid docstore strategy: {self.docstore_strategy}")
    elif self.docstore is not None and self.vector_store is None:
        # Upsert strategies require a vector store; fall back to
        # duplicates-only de-duplication when none is configured.
        if self.docstore_strategy == DocstoreStrategy.UPSERTS:
            logger.info(
                "Docstore strategy set to upserts, but no vector store. "
                "Switching to duplicates_only strategy."
            )
            self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
        elif self.docstore_strategy == DocstoreStrategy.UPSERTS_AND_DELETE:
            logger.info(
                "Docstore strategy set to upserts and delete, but no vector store. "
                "Switching to duplicates_only strategy."
            )
            self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
        nodes_to_run = self._handle_duplicates(
            input_nodes, store_doc_text=store_doc_text
        )
    else:
        nodes_to_run = input_nodes

    nodes = run_transformations(
        nodes_to_run,
        self.transformations,
        show_progress=show_progress,
        **kwargs,
    )

    if self.vector_store is not None:
        # Only nodes that actually carry embeddings can be stored.
        nodes_with_embeddings = [n for n in nodes if n.embedding is not None]
        if nodes_with_embeddings:
            self.vector_store.add(nodes_with_embeddings)

    if self.docstore is not None:
        self._update_docstore(nodes_to_run, store_doc_text=store_doc_text)

    return nodes

arun async #

arun(
    show_progress: bool = False,
    documents: Optional[List[Document]] = None,
    nodes: Optional[Sequence[BaseNode]] = None,
    store_doc_text: bool = True,
    **kwargs: Any
) -> Sequence[BaseNode]

Run a series of transformations on a set of nodes.

If a vector store is provided, nodes with embeddings will be added to the vector store.

If a vector store + docstore are provided, the docstore will be used to de-duplicate documents.

Parameters:

Name Type Description Default
show_progress bool

Shows execution progress bar(s). Defaults to False.

False
documents Optional[List[Document]]

Set of documents to be transformed. Defaults to None.

None
nodes Optional[Sequence[BaseNode]]

Set of nodes to be transformed. Defaults to None.

None
store_doc_text bool

Whether to store the document texts. Defaults to True.

True

Returns:

Type Description
Sequence[BaseNode]

Sequence[BaseNode]: The set of transformed Nodes/Documents

Source code in .build/python/llama-index-integrations/ingestion/llama-index-ingestion-ray/llama_index/ingestion/ray/base.py
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
@dispatcher.span
async def arun(
    self,
    show_progress: bool = False,
    documents: Optional[List[Document]] = None,
    nodes: Optional[Sequence[BaseNode]] = None,
    store_doc_text: bool = True,
    **kwargs: Any,
) -> Sequence[BaseNode]:
    """
    Run a series of transformations on a set of nodes.

    If a vector store is provided, nodes with embeddings will be added to the vector store.

    If a vector store + docstore are provided, the docstore will be used to de-duplicate documents.

    Args:
        show_progress (bool, optional): Shows execution progress bar(s). Defaults to False.
        documents (Optional[List[Document]], optional): Set of documents to be transformed. Defaults to None.
        nodes (Optional[Sequence[BaseNode]], optional): Set of nodes to be transformed. Defaults to None.
        store_doc_text (bool, optional): Whether to store the document texts. Defaults to True.

    Returns:
        Sequence[BaseNode]: The set of transformed Nodes/Documents

    """
    input_nodes = self._prepare_inputs(documents, nodes)

    has_docstore = self.docstore is not None
    has_vector_store = self.vector_store is not None

    # Decide which nodes to transform, de-duplicating via the docstore
    # when one is configured.
    if has_docstore and has_vector_store:
        strategy = self.docstore_strategy
        upsert_strategies = (
            DocstoreStrategy.UPSERTS,
            DocstoreStrategy.UPSERTS_AND_DELETE,
        )
        if strategy in upsert_strategies:
            nodes_to_run = await self._ahandle_upserts(
                input_nodes, store_doc_text=store_doc_text
            )
        elif strategy == DocstoreStrategy.DUPLICATES_ONLY:
            nodes_to_run = await self._ahandle_duplicates(
                input_nodes, store_doc_text=store_doc_text
            )
        else:
            raise ValueError(f"Invalid docstore strategy: {self.docstore_strategy}")
    elif has_docstore:
        # No vector store: upsert strategies cannot apply, so downgrade
        # to duplicates-only before de-duplicating.
        if self.docstore_strategy == DocstoreStrategy.UPSERTS:
            logger.info(
                "Docstore strategy set to upserts, but no vector store. "
                "Switching to duplicates_only strategy."
            )
            self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
        elif self.docstore_strategy == DocstoreStrategy.UPSERTS_AND_DELETE:
            logger.info(
                "Docstore strategy set to upserts and delete, but no vector store. "
                "Switching to duplicates_only strategy."
            )
            self.docstore_strategy = DocstoreStrategy.DUPLICATES_ONLY
        nodes_to_run = await self._ahandle_duplicates(
            input_nodes, store_doc_text=store_doc_text
        )
    else:
        nodes_to_run = input_nodes

    transformed_nodes = await arun_transformations(  # type: ignore
        nodes_to_run,
        self.transformations,
        show_progress=show_progress,
        **kwargs,
    )

    if has_vector_store:
        # Only persist nodes that actually received embeddings.
        embedded = [n for n in transformed_nodes if n.embedding is not None]
        if embedded:
            await self.vector_store.async_add(embedded)

    if has_docstore:
        await self._aupdate_docstore(nodes_to_run, store_doc_text=store_doc_text)

    return transformed_nodes

RayTransformComponent #

Bases: BaseModel

A wrapper around transformations that enables execution in Ray.

Parameters:

Name Type Description Default
transform_class Type[TransformComponent]

The transformation class to wrap.

required
transform_kwargs Optional[Dict[str, Any]]

The keyword arguments to pass to the transformation `__init__` function.

None
map_batches_kwargs Optional[Dict[str, Any]]

The keyword arguments to pass to ray.data.Dataset.map_batches (see https://docs.ray.io/en/latest/data/api/doc/ray.data.Dataset.map_batches.html for details)

None
Source code in .build/python/llama-index-integrations/ingestion/llama-index-ingestion-ray/llama_index/ingestion/ray/transform.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
class RayTransformComponent(BaseModel):
    """
    A wrapper around transformations that enables execution in Ray.

    Args:
        transform_class (Type[TransformComponent]): The transformation class to wrap.
        transform_kwargs (Optional[Dict[str, Any]], optional): The keyword arguments to pass to the transformation __init__ function.
        map_batches_kwargs (Optional[Dict[str, Any]], optional): The keyword arguments to pass to ray.data.Dataset.map_batches (see https://docs.ray.io/en/latest/data/api/doc/ray.data.Dataset.map_batches.html for details)

    """

    # The TransformComponent subclass instantiated inside each Ray actor.
    transform_class: Type[TransformComponent]
    # Keyword arguments forwarded to transform_class(...) on construction.
    transform_kwargs: Dict[str, Any]
    # Extra options forwarded verbatim to ray.data.Dataset.map_batches.
    map_batches_kwargs: Dict[str, Any]

    def __init__(
        self,
        transform_class: Type[TransformComponent],
        map_batches_kwargs: Optional[Dict[str, Any]] = None,
        transform_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        # When transform_kwargs is not supplied (or empty), any extra keyword
        # arguments are treated as constructor kwargs for the wrapped class.
        init_kwargs = transform_kwargs if transform_kwargs else kwargs
        batch_kwargs = map_batches_kwargs if map_batches_kwargs else {}
        super().__init__(
            transform_class=transform_class,
            transform_kwargs=init_kwargs,
            map_batches_kwargs=batch_kwargs,
        )

    def __call__(self, dataset: ray.data.Dataset, **kwargs) -> ray.data.Dataset:
        """Run the transformation on the given ray dataset."""
        # Each actor constructs its own transform instance from these kwargs.
        constructor_kwargs = {
            "transform_class": self.transform_class,
            "transform_kwargs": self.transform_kwargs,
        }
        return dataset.map_batches(
            TransformActor,
            fn_constructor_kwargs=constructor_kwargs,
            fn_kwargs=kwargs,
            batch_format="pyarrow",
            **self.map_batches_kwargs,
        )

options: members: - RayIngestionPipeline