Pipelines
Search Pipelines
Create Pipeline
Get Pipeline
Update Existing Pipeline
Delete Pipeline
Get Pipeline Status
Upsert Pipeline
Run Search
Models
AdvancedModeTransformConfig { chunking_config, mode, segmentation_config }
chunking_config?: NoneChunkingConfig { mode } | CharacterChunkingConfig { chunk_overlap, chunk_size, mode } | TokenChunkingConfig { chunk_overlap, chunk_size, mode, separator } | 2 more
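As a sketch, an advanced-mode transform config with token chunking might be shaped like the following. The literal `"advanced"` and `"token"` mode strings are assumptions inferred from the type names, not values confirmed by this reference.

```typescript
// Hypothetical shape of an AdvancedModeTransformConfig using token chunking.
// The mode discriminator strings are assumed, not documented here.
const transformConfig = {
  mode: "advanced",
  chunking_config: {
    mode: "token",
    chunk_size: 1024,   // tokens per chunk
    chunk_overlap: 128, // tokens shared between adjacent chunks
    separator: " ",
  },
};
```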
AzureOpenAIEmbedding { additional_kwargs, api_base, api_key, 12 more }
AzureOpenAIEmbeddingConfig { component, type }
Configuration for the Azure OpenAI embedding model.
BedrockEmbeddingConfig { component, type }
component?: BedrockEmbedding { additional_kwargs, aws_access_key_id, aws_secret_access_key, 9 more }
DataSinkCreate { component, name, sink_type }
Schema for creating a data sink.
component: Record<string, unknown> | CloudPineconeVectorStore { api_key, index_name, class_name, 3 more } | CloudPostgresVectorStore { database, embed_dim, host, 10 more } | 5 more
Component that implements the data sink
CloudPineconeVectorStore { api_key, index_name, class_name, 3 more }
Cloud Pinecone Vector Store.
This class is used to store the configuration for a Pinecone vector store, so that it can be created and used in LlamaCloud.
Args:
- api_key (str): API key for authenticating with Pinecone
- index_name (str): name of the Pinecone index
- namespace (Optional[str]): namespace to use in the Pinecone index
- insert_kwargs (Optional[dict]): additional kwargs to pass during insertion
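A DataSinkCreate payload wrapping this component might look like the sketch below. The `sink_type` string and the placeholder key are assumptions for illustration, not values confirmed by this reference.

```typescript
// Hypothetical DataSinkCreate payload for a Pinecone sink.
// "PINECONE" as sink_type is an assumed discriminator value.
const pineconeSink = {
  name: "my-pinecone-sink",
  sink_type: "PINECONE",
  component: {
    api_key: "<PINECONE_API_KEY>",      // placeholder, supply your own
    index_name: "docs-index",
    namespace: "production",            // optional
    insert_kwargs: { batch_size: 100 }, // optional, passed through on insertion
  },
};
```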
CloudPostgresVectorStore { database, embed_dim, host, 10 more }
CloudQdrantVectorStore { api_key, collection_name, url, 4 more }
Cloud Qdrant Vector Store.
This class is used to store the configuration for a Qdrant vector store, so that it can be created and used in LlamaCloud.
Args:
- collection_name (str): name of the Qdrant collection
- url (str): URL of the Qdrant instance
- api_key (str): API key for authenticating with Qdrant
- max_retries (int): maximum number of retries in case of a failure. Defaults to 3
- client_kwargs (dict): additional kwargs to pass to the Qdrant client
CloudAzureAISearchVectorStore { search_service_api_key, search_service_endpoint, class_name, 8 more }
Cloud Azure AI Search Vector Store.
CloudMongoDBAtlasVectorSearch { collection_name, db_name, mongodb_uri, 5 more }
Cloud MongoDB Atlas Vector Store.
This class is used to store the configuration for a MongoDB Atlas vector store, so that it can be created and used in LlamaCloud.
Args:
- mongodb_uri (str): URI for connecting to MongoDB Atlas
- db_name (str): name of the MongoDB database
- collection_name (str): name of the MongoDB collection
- vector_index_name (str): name of the MongoDB Atlas vector index
- fulltext_index_name (str): name of the MongoDB Atlas full-text index
CloudAstraDBVectorStore { token, api_endpoint, collection_name, 4 more }
Cloud AstraDB Vector Store.
This class is used to store the configuration for an AstraDB vector store, so that it can be created and used in LlamaCloud.
Args:
- token (str): The Astra DB Application Token to use.
- api_endpoint (str): The Astra DB JSON API endpoint for your database.
- collection_name (str): Collection name to use. If it does not exist, it will be created.
- embedding_dimension (int): Length of the embedding vectors in use.
- keyspace (Optional[str]): The keyspace to use. If not provided, 'default_keyspace' is used.
GeminiEmbedding { api_base, api_key, class_name, 7 more }
output_dimensionality?: number | null
Optional reduced dimension for output embeddings. Supported by models/text-embedding-004 and newer (e.g. gemini-embedding-001). Not supported by models/embedding-001.
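A Gemini embedding config exercising this field might be shaped as below. The `type` discriminator string and the `model_name` field are assumptions based on the documented type names; only `output_dimensionality` and its supported models come from this reference.

```typescript
// Hypothetical GeminiEmbeddingConfig with a reduced output dimension.
// The type string and model_name field are assumed, not documented here.
const geminiEmbedding = {
  type: "GEMINI_EMBEDDING",
  component: {
    model_name: "models/text-embedding-004", // supports output_dimensionality
    api_key: "<GEMINI_API_KEY>",             // placeholder
    output_dimensionality: 256,              // reduced from the native dimension
  },
};
```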
GeminiEmbeddingConfig { component, type }
Configuration for the Gemini embedding model.
output_dimensionality?: number | null
Optional reduced dimension for output embeddings. Supported by models/text-embedding-004 and newer (e.g. gemini-embedding-001). Not supported by models/embedding-001.
HuggingFaceInferenceAPIEmbedding { token, class_name, cookies, 9 more }
headers?: Record<string, string> | null
Additional headers to send to the server. By default only the authorization and user-agent headers are sent. Values in this dictionary will override the default values.
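The documented override behavior can be sketched as a plain merge: defaults first, user values second, so user-supplied keys win on conflict. The default header values below are placeholders, not the client's actual defaults.

```typescript
// Sketch of the documented header-merge behavior: only authorization and
// user-agent are sent by default, and user-supplied values override them.
function mergeHeaders(
  userHeaders: Record<string, string> | null,
): Record<string, string> {
  const defaults: Record<string, string> = {
    authorization: "Bearer <HF_TOKEN>", // placeholder token
    "user-agent": "llama-cloud-client", // placeholder default
  };
  return { ...defaults, ...(userHeaders ?? {}) };
}
```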
HuggingFaceInferenceAPIEmbeddingConfig { component, type }
Configuration for the HuggingFace Inference API embedding model.
headers?: Record<string, string> | null
Additional headers to send to the server. By default only the authorization and user-agent headers are sent. Values in this dictionary will override the default values.
LlamaParseParameters { adaptive_long_table, aggressive_table_extraction, annotate_links, 116 more }
Settings that configure how LlamaParse parses files within a LlamaCloud pipeline.
webhook_configurations?: Array<WebhookConfiguration> | null
Outbound webhook endpoints to notify on job status changes
webhook_events?: Array<"extract.pending" | "extract.success" | "extract.error" | 14 more> | null
Events to subscribe to (e.g. 'parse.success', 'extract.error'). If null, all events are delivered.
webhook_headers?: Record<string, string> | null
Custom HTTP headers sent with each webhook request (e.g. auth tokens)
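Taken together, the three webhook fields might be configured as in the sketch below. The `url` field inside WebhookConfiguration is an assumption (that model is not expanded in this reference), and the header value is a placeholder.

```typescript
// Hypothetical webhook settings within LlamaParseParameters.
// The WebhookConfiguration field name (url) is assumed.
const webhookSettings = {
  webhook_configurations: [{ url: "https://example.com/hooks/llama" }],
  // null would mean "deliver all events"; here we subscribe to two.
  webhook_events: ["parse.success", "extract.error"],
  webhook_headers: { authorization: "Bearer <WEBHOOK_SECRET>" },
};
```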
MetadataFilters { filters, condition }
Metadata filters for vector stores.
MetadataFilter { key, value, operator }
Comprehensive metadata filter for vector stores to support more operators.
Value uses Strict types, as int, float and str are compatible types and were all converted to string before.
See: https://docs.pydantic.dev/latest/usage/types/#strict-types
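A filter set combining these two models might look like the following. Per the strict-types note, numeric values are given as numbers rather than pre-stringified; the `condition` and `operator` strings are assumptions based on common vector-store filter vocabularies, not values confirmed by this reference.

```typescript
// Hypothetical MetadataFilters payload; "and", "==", and ">=" are
// assumed condition/operator values.
const searchFilters = {
  condition: "and",
  filters: [
    { key: "author", value: "jane", operator: "==" },
    { key: "year", value: 2023, operator: ">=" }, // strict number, not "2023"
  ],
};
```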
OpenAIEmbedding { additional_kwargs, api_base, api_key, 10 more }
OpenAIEmbeddingConfig { component, type }
Configuration for the OpenAI embedding model.
Pipeline { id, embedding_config, name, 15 more }
Schema for a pipeline.
embedding_config: ManagedOpenAIEmbeddingConfig { component, type } | AzureOpenAIEmbeddingConfig { component, type } | CohereEmbeddingConfig { component, type } | 5 more
ManagedOpenAIEmbeddingConfig { component, type }
AzureOpenAIEmbeddingConfig { component, type }
Configuration for the Azure OpenAI embedding model.
CohereEmbeddingConfig { component, type }
GeminiEmbeddingConfig { component, type }
Configuration for the Gemini embedding model.
output_dimensionality?: number | null
Optional reduced dimension for output embeddings. Supported by models/text-embedding-004 and newer (e.g. gemini-embedding-001). Not supported by models/embedding-001.
HuggingFaceInferenceAPIEmbeddingConfig { component, type }
Configuration for the HuggingFace Inference API embedding model.
headers?: Record<string, string> | null
Additional headers to send to the server. By default only the authorization and user-agent headers are sent. Values in this dictionary will override the default values.
OpenAIEmbeddingConfig { component, type }
Configuration for the OpenAI embedding model.
VertexAIEmbeddingConfig { component, type }
BedrockEmbeddingConfig { component, type }
component?: BedrockEmbedding { additional_kwargs, aws_access_key_id, aws_secret_access_key, 9 more }
Schema for a data sink.
component: Record<string, unknown> | CloudPineconeVectorStore { api_key, index_name, class_name, 3 more } | CloudPostgresVectorStore { database, embed_dim, host, 10 more } | 5 more
Component that implements the data sink
CloudPineconeVectorStore { api_key, index_name, class_name, 3 more }
Cloud Pinecone Vector Store.
This class is used to store the configuration for a Pinecone vector store, so that it can be created and used in LlamaCloud.
Args:
- api_key (str): API key for authenticating with Pinecone
- index_name (str): name of the Pinecone index
- namespace (Optional[str]): namespace to use in the Pinecone index
- insert_kwargs (Optional[dict]): additional kwargs to pass during insertion
CloudPostgresVectorStore { database, embed_dim, host, 10 more }
CloudQdrantVectorStore { api_key, collection_name, url, 4 more }
Cloud Qdrant Vector Store.
This class is used to store the configuration for a Qdrant vector store, so that it can be created and used in LlamaCloud.
Args:
- collection_name (str): name of the Qdrant collection
- url (str): URL of the Qdrant instance
- api_key (str): API key for authenticating with Qdrant
- max_retries (int): maximum number of retries in case of a failure. Defaults to 3
- client_kwargs (dict): additional kwargs to pass to the Qdrant client
CloudAzureAISearchVectorStore { search_service_api_key, search_service_endpoint, class_name, 8 more }
Cloud Azure AI Search Vector Store.
CloudMongoDBAtlasVectorSearch { collection_name, db_name, mongodb_uri, 5 more }
Cloud MongoDB Atlas Vector Store.
This class is used to store the configuration for a MongoDB Atlas vector store, so that it can be created and used in LlamaCloud.
Args:
- mongodb_uri (str): URI for connecting to MongoDB Atlas
- db_name (str): name of the MongoDB database
- collection_name (str): name of the MongoDB collection
- vector_index_name (str): name of the MongoDB Atlas vector index
- fulltext_index_name (str): name of the MongoDB Atlas full-text index
CloudAstraDBVectorStore { token, api_endpoint, collection_name, 4 more }
Cloud AstraDB Vector Store.
This class is used to store the configuration for an AstraDB vector store, so that it can be created and used in LlamaCloud.
Args:
- token (str): The Astra DB Application Token to use.
- api_endpoint (str): The Astra DB JSON API endpoint for your database.
- collection_name (str): Collection name to use. If it does not exist, it will be created.
- embedding_dimension (int): Length of the embedding vectors in use.
- keyspace (Optional[str]): The keyspace to use. If not provided, 'default_keyspace' is used.
embedding_model_config?: EmbeddingModelConfig | null
Schema for an embedding model config.
embedding_config: AzureOpenAIEmbeddingConfig { component, type } | CohereEmbeddingConfig { component, type } | GeminiEmbeddingConfig { component, type } | 4 more
The embedding configuration for the embedding model config.
AzureOpenAIEmbeddingConfig { component, type }
Configuration for the Azure OpenAI embedding model.
CohereEmbeddingConfig { component, type }
GeminiEmbeddingConfig { component, type }
Configuration for the Gemini embedding model.
output_dimensionality?: number | null
Optional reduced dimension for output embeddings. Supported by models/text-embedding-004 and newer (e.g. gemini-embedding-001). Not supported by models/embedding-001.
HuggingFaceInferenceAPIEmbeddingConfig { component, type }
Configuration for the HuggingFace Inference API embedding model.
headers?: Record<string, string> | null
Additional headers to send to the server. By default only the authorization and user-agent headers are sent. Values in this dictionary will override the default values.
OpenAIEmbeddingConfig { component, type }
Configuration for the OpenAI embedding model.
VertexAIEmbeddingConfig { component, type }
BedrockEmbeddingConfig { component, type }
component?: BedrockEmbedding { additional_kwargs, aws_access_key_id, aws_secret_access_key, 9 more }
embedding_model_config_id?: string | null
The ID of the EmbeddingModelConfig this pipeline is using.
llama_parse_parameters?: LlamaParseParameters { adaptive_long_table, aggressive_table_extraction, annotate_links, 116 more } | null
Settings that configure how LlamaParse parses files within a LlamaCloud pipeline.
webhook_configurations?: Array<WebhookConfiguration> | null
Outbound webhook endpoints to notify on job status changes
webhook_events?: Array<"extract.pending" | "extract.success" | "extract.error" | 14 more> | null
Events to subscribe to (e.g. 'parse.success', 'extract.error'). If null, all events are delivered.
webhook_headers?: Record<string, string> | null
Custom HTTP headers sent with each webhook request (e.g. auth tokens)
managed_pipeline_id?: string | null
The ID of the ManagedPipeline this playground pipeline is linked to.
metadata_config?: PipelineMetadataConfig { excluded_embed_metadata_keys, excluded_llm_metadata_keys } | null
preset_retrieval_parameters?: PresetRetrievalParams { alpha, class_name, dense_similarity_cutoff, 11 more }
Preset retrieval parameters for the pipeline.
alpha?: number | null
Alpha value for hybrid retrieval, weighting dense against sparse results: 0 is purely sparse retrieval and 1 is purely dense retrieval.
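The weighting can be illustrated with a linear interpolation between the two score sources. The service's exact fusion method is not specified in this reference, so treat this as an illustration of the parameter's direction, not the implementation.

```typescript
// Illustrative alpha weighting: alpha = 1 is purely dense, alpha = 0 is
// purely sparse. Linear interpolation is an assumption for illustration.
function hybridScore(alpha: number, dense: number, sparse: number): number {
  if (alpha < 0 || alpha > 1) throw new RangeError("alpha must be in [0, 1]");
  return alpha * dense + (1 - alpha) * sparse;
}
```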
dense_similarity_cutoff?: number | null
Minimum similarity score with respect to the query for retrieval
files_top_k?: number | null
Number of files to retrieve (only for the files_via_metadata and files_via_content retrieval modes).
Metadata filters for vector stores.
MetadataFilter { key, value, operator }
Comprehensive metadata filter for vector stores to support more operators.
Value uses Strict types, as int, float and str are compatible types and were all converted to string before.
See: https://docs.pydantic.dev/latest/usage/types/#strict-types
transform_config?: AutoTransformConfig { chunk_overlap, chunk_size, mode } | AdvancedModeTransformConfig { chunking_config, mode, segmentation_config }
Configuration for the transformation.
AdvancedModeTransformConfig { chunking_config, mode, segmentation_config }
chunking_config?: NoneChunkingConfig { mode } | CharacterChunkingConfig { chunk_overlap, chunk_size, mode } | TokenChunkingConfig { chunk_overlap, chunk_size, mode, separator } | 2 more
PipelineCreate { name, data_sink, data_sink_id, 10 more }
Schema for creating a pipeline.
Schema for creating a data sink.
component: Record<string, unknown> | CloudPineconeVectorStore { api_key, index_name, class_name, 3 more } | CloudPostgresVectorStore { database, embed_dim, host, 10 more } | 5 more
Component that implements the data sink
CloudPineconeVectorStore { api_key, index_name, class_name, 3 more }
Cloud Pinecone Vector Store.
This class is used to store the configuration for a Pinecone vector store, so that it can be created and used in LlamaCloud.
Args:
- api_key (str): API key for authenticating with Pinecone
- index_name (str): name of the Pinecone index
- namespace (Optional[str]): namespace to use in the Pinecone index
- insert_kwargs (Optional[dict]): additional kwargs to pass during insertion
CloudPostgresVectorStore { database, embed_dim, host, 10 more }
CloudQdrantVectorStore { api_key, collection_name, url, 4 more }
Cloud Qdrant Vector Store.
This class is used to store the configuration for a Qdrant vector store, so that it can be created and used in LlamaCloud.
Args:
- collection_name (str): name of the Qdrant collection
- url (str): URL of the Qdrant instance
- api_key (str): API key for authenticating with Qdrant
- max_retries (int): maximum number of retries in case of a failure. Defaults to 3
- client_kwargs (dict): additional kwargs to pass to the Qdrant client
CloudAzureAISearchVectorStore { search_service_api_key, search_service_endpoint, class_name, 8 more }
Cloud Azure AI Search Vector Store.
CloudMongoDBAtlasVectorSearch { collection_name, db_name, mongodb_uri, 5 more }
Cloud MongoDB Atlas Vector Store.
This class is used to store the configuration for a MongoDB Atlas vector store, so that it can be created and used in LlamaCloud.
Args:
- mongodb_uri (str): URI for connecting to MongoDB Atlas
- db_name (str): name of the MongoDB database
- collection_name (str): name of the MongoDB collection
- vector_index_name (str): name of the MongoDB Atlas vector index
- fulltext_index_name (str): name of the MongoDB Atlas full-text index
CloudAstraDBVectorStore { token, api_endpoint, collection_name, 4 more }
Cloud AstraDB Vector Store.
This class is used to store the configuration for an AstraDB vector store, so that it can be created and used in LlamaCloud.
Args:
- token (str): The Astra DB Application Token to use.
- api_endpoint (str): The Astra DB JSON API endpoint for your database.
- collection_name (str): Collection name to use. If it does not exist, it will be created.
- embedding_dimension (int): Length of the embedding vectors in use.
- keyspace (Optional[str]): The keyspace to use. If not provided, 'default_keyspace' is used.
data_sink_id?: string | null
Data sink ID. When provided instead of data_sink, the data sink will be looked up by ID.
embedding_config?: AzureOpenAIEmbeddingConfig { component, type } | CohereEmbeddingConfig { component, type } | GeminiEmbeddingConfig { component, type } | 4 more | null
AzureOpenAIEmbeddingConfig { component, type }
Configuration for the Azure OpenAI embedding model.
CohereEmbeddingConfig { component, type }
GeminiEmbeddingConfig { component, type }
Configuration for the Gemini embedding model.
output_dimensionality?: number | null
Optional reduced dimension for output embeddings. Supported by models/text-embedding-004 and newer (e.g. gemini-embedding-001). Not supported by models/embedding-001.
HuggingFaceInferenceAPIEmbeddingConfig { component, type }
Configuration for the HuggingFace Inference API embedding model.
headers?: Record<string, string> | null
Additional headers to send to the server. By default only the authorization and user-agent headers are sent. Values in this dictionary will override the default values.
OpenAIEmbeddingConfig { component, type }
Configuration for the OpenAI embedding model.
VertexAIEmbeddingConfig { component, type }
BedrockEmbeddingConfig { component, type }
component?: BedrockEmbedding { additional_kwargs, aws_access_key_id, aws_secret_access_key, 9 more }
embedding_model_config_id?: string | null
Embedding model config ID. When provided instead of embedding_config, the embedding model config will be looked up by ID.
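A minimal PipelineCreate payload using the two lookup-by-ID fields might look like the following. The ID values are placeholders for illustration.

```typescript
// Hypothetical minimal PipelineCreate payload that references an existing
// data sink and embedding model config by ID instead of inlining them.
const pipelineCreate = {
  name: "docs-pipeline",
  data_sink_id: "sink_123",             // looked up instead of data_sink
  embedding_model_config_id: "emb_456", // looked up instead of embedding_config
};
```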
llama_parse_parameters?: LlamaParseParameters { adaptive_long_table, aggressive_table_extraction, annotate_links, 116 more }
Settings that configure how LlamaParse parses files within a LlamaCloud pipeline.
webhook_configurations?: Array<WebhookConfiguration> | null
Outbound webhook endpoints to notify on job status changes
webhook_events?: Array<"extract.pending" | "extract.success" | "extract.error" | 14 more> | null
Events to subscribe to (e.g. 'parse.success', 'extract.error'). If null, all events are delivered.
webhook_headers?: Record<string, string> | null
Custom HTTP headers sent with each webhook request (e.g. auth tokens)
managed_pipeline_id?: string | null
The ID of the ManagedPipeline this playground pipeline is linked to.
metadata_config?: PipelineMetadataConfig { excluded_embed_metadata_keys, excluded_llm_metadata_keys } | null
preset_retrieval_parameters?: PresetRetrievalParams { alpha, class_name, dense_similarity_cutoff, 11 more }
Preset retrieval parameters for the pipeline.
alpha?: number | null
Alpha value for hybrid retrieval, weighting dense against sparse results: 0 is purely sparse retrieval and 1 is purely dense retrieval.
dense_similarity_cutoff?: number | null
Minimum similarity score with respect to the query for retrieval
files_top_k?: number | null
Number of files to retrieve (only for the files_via_metadata and files_via_content retrieval modes).
Metadata filters for vector stores.
MetadataFilter { key, value, operator }
Comprehensive metadata filter for vector stores to support more operators.
Value uses Strict types, as int, float and str are compatible types and were all converted to string before.
See: https://docs.pydantic.dev/latest/usage/types/#strict-types
transform_config?: AutoTransformConfig { chunk_overlap, chunk_size, mode } | AdvancedModeTransformConfig { chunking_config, mode, segmentation_config } | null
Configuration for the transformation.
AdvancedModeTransformConfig { chunking_config, mode, segmentation_config }
chunking_config?: NoneChunkingConfig { mode } | CharacterChunkingConfig { chunk_overlap, chunk_size, mode } | TokenChunkingConfig { chunk_overlap, chunk_size, mode, separator } | 2 more
PresetRetrievalParams { alpha, class_name, dense_similarity_cutoff, 11 more }
Schema for the search parameters of a retrieval execution that can be preset for a pipeline.
alpha?: number | null
Alpha value for hybrid retrieval, weighting dense against sparse results: 0 is purely sparse retrieval and 1 is purely dense retrieval.
dense_similarity_cutoff?: number | null
Minimum similarity score with respect to the query for retrieval
files_top_k?: number | null
Number of files to retrieve (only for the files_via_metadata and files_via_content retrieval modes).
Metadata filters for vector stores.
MetadataFilter { key, value, operator }
Comprehensive metadata filter for vector stores to support more operators.
Value uses Strict types, as int, float and str are compatible types and were all converted to string before.
See: https://docs.pydantic.dev/latest/usage/types/#strict-types
embedding_config: ManagedOpenAIEmbeddingConfig { component, type } | AzureOpenAIEmbeddingConfig { component, type } | CohereEmbeddingConfig { component, type } | 5 more
ManagedOpenAIEmbeddingConfig { component, type }
AzureOpenAIEmbeddingConfig { component, type }
Configuration for the Azure OpenAI embedding model.
CohereEmbeddingConfig { component, type }
GeminiEmbeddingConfig { component, type }
Configuration for the Gemini embedding model.
output_dimensionality?: number | null
Optional reduced dimension for output embeddings. Supported by models/text-embedding-004 and newer (e.g. gemini-embedding-001). Not supported by models/embedding-001.
HuggingFaceInferenceAPIEmbeddingConfig { component, type }
Configuration for the HuggingFace Inference API embedding model.
headers?: Record<string, string> | null
Additional headers to send to the server. By default only the authorization and user-agent headers are sent. Values in this dictionary will override the default values.
OpenAIEmbeddingConfig { component, type }
Configuration for the OpenAI embedding model.
VertexAIEmbeddingConfig { component, type }
BedrockEmbeddingConfig { component, type }
component?: BedrockEmbedding { additional_kwargs, aws_access_key_id, aws_secret_access_key, 9 more }
Schema for a data sink.
component: Record<string, unknown> | CloudPineconeVectorStore { api_key, index_name, class_name, 3 more } | CloudPostgresVectorStore { database, embed_dim, host, 10 more } | 5 more
Component that implements the data sink
CloudPineconeVectorStore { api_key, index_name, class_name, 3 more }
Cloud Pinecone Vector Store.
This class is used to store the configuration for a Pinecone vector store, so that it can be created and used in LlamaCloud.
Args:
- api_key (str): API key for authenticating with Pinecone
- index_name (str): name of the Pinecone index
- namespace (Optional[str]): namespace to use in the Pinecone index
- insert_kwargs (Optional[dict]): additional kwargs to pass during insertion
CloudPostgresVectorStore { database, embed_dim, host, 10 more }
CloudQdrantVectorStore { api_key, collection_name, url, 4 more }
Cloud Qdrant Vector Store.
This class is used to store the configuration for a Qdrant vector store, so that it can be created and used in LlamaCloud.
Args:
- collection_name (str): name of the Qdrant collection
- url (str): URL of the Qdrant instance
- api_key (str): API key for authenticating with Qdrant
- max_retries (int): maximum number of retries in case of a failure. Defaults to 3
- client_kwargs (dict): additional kwargs to pass to the Qdrant client
CloudAzureAISearchVectorStore { search_service_api_key, search_service_endpoint, class_name, 8 more }
Cloud Azure AI Search Vector Store.
CloudMongoDBAtlasVectorSearch { collection_name, db_name, mongodb_uri, 5 more }
Cloud MongoDB Atlas Vector Store.
This class is used to store the configuration for a MongoDB Atlas vector store, so that it can be created and used in LlamaCloud.
Args:
- mongodb_uri (str): URI for connecting to MongoDB Atlas
- db_name (str): name of the MongoDB database
- collection_name (str): name of the MongoDB collection
- vector_index_name (str): name of the MongoDB Atlas vector index
- fulltext_index_name (str): name of the MongoDB Atlas full-text index
CloudAstraDBVectorStore { token, api_endpoint, collection_name, 4 more }
Cloud AstraDB Vector Store.
This class is used to store the configuration for an AstraDB vector store, so that it can be created and used in LlamaCloud.
Args:
- token (str): The Astra DB Application Token to use.
- api_endpoint (str): The Astra DB JSON API endpoint for your database.
- collection_name (str): Collection name to use. If it does not exist, it will be created.
- embedding_dimension (int): Length of the embedding vectors in use.
- keyspace (Optional[str]): The keyspace to use. If not provided, 'default_keyspace' is used.
embedding_model_config?: EmbeddingModelConfig | null
Schema for an embedding model config.
embedding_config: AzureOpenAIEmbeddingConfig { component, type } | CohereEmbeddingConfig { component, type } | GeminiEmbeddingConfig { component, type } | 4 more
The embedding configuration for the embedding model config.
AzureOpenAIEmbeddingConfig { component, type }
Configuration for the Azure OpenAI embedding model.
CohereEmbeddingConfig { component, type }
GeminiEmbeddingConfig { component, type }
Configuration for the Gemini embedding model.
output_dimensionality?: number | null
Optional reduced dimension for output embeddings. Supported by models/text-embedding-004 and newer (e.g. gemini-embedding-001). Not supported by models/embedding-001.
HuggingFaceInferenceAPIEmbeddingConfig { component, type }
Configuration for the HuggingFace Inference API embedding model.
headers?: Record<string, string> | null
Additional headers to send to the server. By default only the authorization and user-agent headers are sent. Values in this dictionary will override the default values.
OpenAIEmbeddingConfig { component, type }
Configuration for the OpenAI embedding model.
VertexAIEmbeddingConfig { component, type }
BedrockEmbeddingConfig { component, type }
component?: BedrockEmbedding { additional_kwargs, aws_access_key_id, aws_secret_access_key, 9 more }
embedding_model_config_id?: string | null
The ID of the EmbeddingModelConfig this pipeline is using.