CodeSplitter Chunking
This example demonstrates the new token-based CodeSplitter functionality. It shows how to use both character-based and token-based code splitting modes to gain more precise control over chunk sizes when working with language models.
Let’s install the needed dependencies and import them:
! pip install -q llama-index-core tree-sitter tree-sitter-language-pack

from typing import List

from llama_index.core.node_parser.text.code import CodeSplitter
from llama_index.core.schema import Document

Here is some code we can use to test the splitter:
SAMPLE_PYTHON_CODE = '''
def fibonacci(n):
    """Calculate the nth Fibonacci number using dynamic programming."""
    if n <= 1:
        return n

    # Initialize the first two Fibonacci numbers
    fib_prev = 0
    fib_curr = 1

    # Calculate subsequent Fibonacci numbers
    for i in range(2, n + 1):
        fib_next = fib_prev + fib_curr
        fib_prev = fib_curr
        fib_curr = fib_next

    return fib_curr


def factorial(n):
    """Calculate the factorial of n using recursion."""
    if n <= 1:
        return 1
    return n * factorial(n - 1)


class Calculator:
    """A simple calculator class with basic operations."""

    def __init__(self):
        self.history = []

    def add(self, a, b):
        """Add two numbers."""
        result = a + b
        self.history.append(f"{a} + {b} = {result}")
        return result

    def multiply(self, a, b):
        """Multiply two numbers."""
        result = a * b
        self.history.append(f"{a} * {b} = {result}")
        return result

    def get_history(self):
        """Get calculation history."""
        return self.history


def main():
    """Main function to demonstrate calculator usage."""
    calc = Calculator()

    # Perform some calculations
    sum_result = calc.add(10, 5)
    product_result = calc.multiply(3, 4)

    # Calculate Fibonacci and factorial
    fib_10 = fibonacci(10)
    fact_5 = factorial(5)

    print(f"Sum: {sum_result}")
    print(f"Product: {product_result}")
    print(f"10th Fibonacci number: {fib_10}")
    print(f"5! = {fact_5}")
    print("History:", calc.get_history())


if __name__ == "__main__":
    main()
'''

You can now use the splitter with a character- or token-based approach for splitting the code:
def split_by_character():
    # Create a character-based splitter
    char_splitter = CodeSplitter(
        language="python",
        count_mode="char",
        max_chars=200,  # Small character limit for demonstration
        chunk_lines=10,
        chunk_lines_overlap=2,
    )

    chunks = char_splitter.split_text(SAMPLE_PYTHON_CODE)

    print(f"Number of chunks: {len(chunks)}")
    print("Sample chunks:")
    for i, chunk in enumerate(chunks[:2]):
        char_count = len(chunk)
        print(f"\nChunk {i+1} ({char_count} characters):")
        print("-" * 40)
        print(chunk[:100] + "..." if len(chunk) > 100 else chunk)
split_by_character()

Number of chunks: 14
Sample chunks:

Chunk 1 (17 characters):
----------------------------------------
def fibonacci(n):
Chunk 2 (183 characters):
----------------------------------------
    """Calculate the nth Fibonacci number using dynamic programming."""
    if n <= 1:
        return n...

def split_by_token():
    # Create a token-based splitter
    token_splitter = CodeSplitter(
        language="python",
        count_mode="token",
        max_tokens=50,  # Small token limit for demonstration
        chunk_lines=10,
        chunk_lines_overlap=2,
    )

    chunks = token_splitter.split_text(SAMPLE_PYTHON_CODE)

    print(f"Number of chunks: {len(chunks)}")
    print("Sample chunks:")
    for i, chunk in enumerate(chunks[:2]):
        # Get token count using the same tokenizer
        token_count = len(token_splitter._tokenizer(chunk))
        char_count = len(chunk)
        print(
            f"\nChunk {i+1} ({token_count} tokens, {char_count} characters):"
        )
        print("-" * 50)
        print(chunk[:150] + "..." if len(chunk) > 150 else chunk)
split_by_token()

Number of chunks: 14
Sample chunks:

Chunk 1 (4 tokens, 17 characters):
--------------------------------------------------
def fibonacci(n):

Chunk 2 (43 tokens, 183 characters):
--------------------------------------------------
    """Calculate the nth Fibonacci number using dynamic programming."""
    if n <= 1:
        return n

    # Initialize the first two Fibonacci numbers...

You can also use a custom tokenizer.
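Any callable that maps a string to a sequence of tokens will work, since the splitter only needs to count them (as the len(token_splitter._tokenizer(chunk)) call above suggests). As a minimal sketch, assuming the tiktoken package is installed (it is not among this example's dependencies), you could count tokens with the same encoding many OpenAI models use:

import tiktoken

# Sketch: count tokens with tiktoken's cl100k_base encoding.
# Assumes CodeSplitter only calls the tokenizer and takes len()
# of the result, as the examples above indicate.
encoding = tiktoken.get_encoding("cl100k_base")

tiktoken_splitter = CodeSplitter(
    language="python",
    count_mode="token",
    max_tokens=50,
    tokenizer=encoding.encode,  # str -> list[int]
)

print(f"Chunks with tiktoken: {len(tiktoken_splitter.split_text(SAMPLE_PYTHON_CODE))}")

Alternatively, you can write a tokenizer from scratch, as in the example below: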
def split_with_custom_tokenizer():
    def simple_word_tokenizer(text: str) -> List[str]:
        """Simple tokenizer that splits on whitespace and punctuation."""
        import re

        return re.findall(r"\b\w+\b", text)

    # Create a splitter with custom tokenizer
    custom_splitter = CodeSplitter(
        language="python",
        count_mode="token",
        max_tokens=30,  # Token limit using custom tokenizer
        tokenizer=simple_word_tokenizer,
    )

    chunks = custom_splitter.split_text(SAMPLE_PYTHON_CODE)

    print(f"Number of chunks with custom tokenizer: {len(chunks)}")
    print("Sample chunks:")
    for i, chunk in enumerate(chunks[:2]):
        token_count = len(simple_word_tokenizer(chunk))
        print(f"\nChunk {i+1} ({token_count} word tokens):")
        print("-" * 40)
        print(chunk[:100] + "..." if len(chunk) > 100 else chunk)
split_with_custom_tokenizer()

Number of chunks with custom tokenizer: 12
Sample chunks:

Chunk 1 (3 word tokens):
----------------------------------------
def fibonacci(n):

Chunk 2 (27 word tokens):
----------------------------------------
    """Calculate the nth Fibonacci number using dynamic programming."""
    if n <= 1:
        return n...
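Finally, note that we imported Document at the start but have not used it yet. In an ingestion pipeline you typically won't call split_text directly: CodeSplitter is a node parser, so it can turn documents into nodes for indexing. Here is a minimal sketch, assuming the standard get_nodes_from_documents interface that LlamaIndex node parsers expose:

# Sketch: produce nodes from a Document for downstream indexing.
splitter = CodeSplitter(
    language="python",
    count_mode="token",
    max_tokens=50,
)

documents = [Document(text=SAMPLE_PYTHON_CODE)]
nodes = splitter.get_nodes_from_documents(documents)

print(f"Number of nodes: {len(nodes)}")
print(nodes[0].get_content())  # first chunk, with node metadata attached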