CodeSplitter Chunking

Example demonstrating the new token-based CodeSplitter functionality.

This example shows how to use both character-based and token-based code splitting modes to achieve more precise control over chunk sizes when working with language models.

Let’s install the required dependencies and import them:

! pip install -q llama-index-core tree-sitter tree-sitter-language-pack
from typing import List
from llama_index.core.node_parser.text.code import CodeSplitter
from llama_index.core.schema import Document

Here is some code we can use to test the splitter:

SAMPLE_PYTHON_CODE = '''
def fibonacci(n):
    """Calculate the nth Fibonacci number using dynamic programming."""
    if n <= 1:
        return n
    # Initialize the first two Fibonacci numbers
    fib_prev = 0
    fib_curr = 1
    # Calculate subsequent Fibonacci numbers
    for i in range(2, n + 1):
        fib_next = fib_prev + fib_curr
        fib_prev = fib_curr
        fib_curr = fib_next
    return fib_curr

def factorial(n):
    """Calculate the factorial of n using recursion."""
    if n <= 1:
        return 1
    return n * factorial(n - 1)

class Calculator:
    """A simple calculator class with basic operations."""

    def __init__(self):
        self.history = []

    def add(self, a, b):
        """Add two numbers."""
        result = a + b
        self.history.append(f"{a} + {b} = {result}")
        return result

    def multiply(self, a, b):
        """Multiply two numbers."""
        result = a * b
        self.history.append(f"{a} * {b} = {result}")
        return result

    def get_history(self):
        """Get calculation history."""
        return self.history

def main():
    """Main function to demonstrate calculator usage."""
    calc = Calculator()
    # Perform some calculations
    sum_result = calc.add(10, 5)
    product_result = calc.multiply(3, 4)
    # Calculate Fibonacci and factorial
    fib_10 = fibonacci(10)
    fact_5 = factorial(5)
    print(f"Sum: {sum_result}")
    print(f"Product: {product_result}")
    print(f"10th Fibonacci number: {fib_10}")
    print(f"5! = {fact_5}")
    print("History:", calc.get_history())

if __name__ == "__main__":
    main()
'''

You can now use the splitter with a character-based or token-based approach to split the code:

def split_by_character():
    # Create a character-based splitter
    char_splitter = CodeSplitter(
        language="python",
        count_mode="char",
        max_chars=200,  # Small character limit for demonstration
        chunk_lines=10,
        chunk_lines_overlap=2,
    )
    chunks = char_splitter.split_text(SAMPLE_PYTHON_CODE)
    print(f"Number of chunks: {len(chunks)}")
    print("Sample chunks:")
    for i, chunk in enumerate(chunks[:2]):
        char_count = len(chunk)
        print(f"\nChunk {i+1} ({char_count} characters):")
        print("-" * 40)
        print(chunk[:100] + "..." if len(chunk) > 100 else chunk)
split_by_character()
Number of chunks: 14
Sample chunks:

Chunk 1 (17 characters):
----------------------------------------
def fibonacci(n):

Chunk 2 (183 characters):
----------------------------------------
    """Calculate the nth Fibonacci number using dynamic programming."""
    if n <= 1:
        return n
...
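
Since CodeSplitter is a regular node parser, you can also wrap the source in the Document class imported earlier and get nodes back instead of raw strings. Here is a minimal sketch using the standard get_nodes_from_documents API:

def split_into_nodes():
    # Character-based splitter, same settings as above
    char_splitter = CodeSplitter(
        language="python",
        count_mode="char",
        max_chars=200,
    )
    # Wrap the raw source in a Document and parse it into nodes
    nodes = char_splitter.get_nodes_from_documents(
        [Document(text=SAMPLE_PYTHON_CODE)]
    )
    print(f"Number of nodes: {len(nodes)}")
    print(nodes[0].get_content())
split_into_nodes()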
def split_by_token():
    # Create a token-based splitter
    token_splitter = CodeSplitter(
        language="python",
        count_mode="token",
        max_tokens=50,  # Small token limit for demonstration
        chunk_lines=10,
        chunk_lines_overlap=2,
    )
    chunks = token_splitter.split_text(SAMPLE_PYTHON_CODE)
    print(f"Number of chunks: {len(chunks)}")
    print("Sample chunks:")
    for i, chunk in enumerate(chunks[:2]):
        # Get the token count using the splitter's own tokenizer
        token_count = len(token_splitter._tokenizer(chunk))
        char_count = len(chunk)
        print(
            f"\nChunk {i+1} ({token_count} tokens, {char_count} characters):"
        )
        print("-" * 50)
        print(chunk[:150] + "..." if len(chunk) > 150 else chunk)
split_by_token()
Number of chunks: 14
Sample chunks:

Chunk 1 (4 tokens, 17 characters):
--------------------------------------------------
def fibonacci(n):

Chunk 2 (43 tokens, 183 characters):
--------------------------------------------------
    """Calculate the nth Fibonacci number using dynamic programming."""
    if n <= 1:
        return n
    # Initialize the first two Fibonacci numbers
...
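
As a quick sanity check, you can confirm that the chunks stay within the token budget by reusing the splitter's tokenizer. This is a minimal sketch; like the example above, it relies on the private _tokenizer attribute:

def check_token_budget():
    token_splitter = CodeSplitter(
        language="python",
        count_mode="token",
        max_tokens=50,
    )
    chunks = token_splitter.split_text(SAMPLE_PYTHON_CODE)
    # Collect any chunks that exceed the 50-token budget
    over_budget = [
        chunk
        for chunk in chunks
        if len(token_splitter._tokenizer(chunk)) > 50
    ]
    print(f"Chunks over budget: {len(over_budget)}")
check_token_budget()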

You can also use a custom tokenizer:

def split_with_custom_tokenizer():
    def simple_word_tokenizer(text: str) -> List[str]:
        """Simple tokenizer that splits on whitespace and punctuation."""
        import re

        return re.findall(r"\b\w+\b", text)

    # Create a splitter with the custom tokenizer
    custom_splitter = CodeSplitter(
        language="python",
        count_mode="token",
        max_tokens=30,  # Token limit using the custom tokenizer
        tokenizer=simple_word_tokenizer,
    )
    chunks = custom_splitter.split_text(SAMPLE_PYTHON_CODE)
    print(f"Number of chunks with custom tokenizer: {len(chunks)}")
    print("Sample chunks:")
    for i, chunk in enumerate(chunks[:2]):
        token_count = len(simple_word_tokenizer(chunk))
        print(f"\nChunk {i+1} ({token_count} word tokens):")
        print("-" * 40)
        print(chunk[:100] + "..." if len(chunk) > 100 else chunk)
split_with_custom_tokenizer()
Number of chunks with custom tokenizer: 12
Sample chunks:

Chunk 1 (3 word tokens):
----------------------------------------
def fibonacci(n):

Chunk 2 (27 word tokens):
----------------------------------------
    """Calculate the nth Fibonacci number using dynamic programming."""
    if n <= 1:
        return n
...
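
In practice you may want token counts that match the model you are targeting. The sketch below swaps in tiktoken as the custom tokenizer. Two assumptions: tiktoken must be installed separately (pip install tiktoken), and since its encode method returns token ids rather than strings, this relies on the splitter only using the tokenizer output for counting, as the examples above suggest:

import tiktoken

def split_with_tiktoken():
    enc = tiktoken.get_encoding("cl100k_base")

    def tiktoken_tokenizer(text: str) -> List[int]:
        # encode() returns token ids; only their count matters here
        return enc.encode(text)

    # Token-based splitter that counts tokens the way an OpenAI model would
    splitter = CodeSplitter(
        language="python",
        count_mode="token",
        max_tokens=50,
        tokenizer=tiktoken_tokenizer,
    )
    chunks = splitter.split_text(SAMPLE_PYTHON_CODE)
    print(f"Number of chunks with tiktoken: {len(chunks)}")
split_with_tiktoken()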