feast/sdk/python/feast/chunker.py at update-lineage · feast-dev/feast

History

150 lines (123 loc) · 4.27 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

from abc import ABC, abstractmethod

from dataclasses import dataclass

from typing import Any, Optional

import pandas as pd

@dataclass

class ChunkingConfig:

chunk_size: int = 100

chunk_overlap: int = 20

min_chunk_size: int = 20

max_chunk_chars: Optional[int] = 500

class BaseChunker(ABC):

"""

Abstract base class for document chunking.

Subclasses implement load_parse_and_chunk() with their own:

- Loading logic

- Parsing logic

- Chunking strategy

"""

def __init__(self, config: Optional[ChunkingConfig] = None):

self.config = config or ChunkingConfig()

@abstractmethod

def load_parse_and_chunk(

self,

source: Any,

source_id: str,

source_column: str,

source_type: Optional[str] = None,

) -> list[dict]:

"""

Load, parse, and chunk a document.

Args:

source: File path, raw text, bytes, etc.

source_id: Document identifier.

source_type: Optional type hint.

source_column: The column containing the document sources.

Returns:

List of chunk dicts with keys:

- chunk_id: str

- original_id: str

- text: str

- chunk_index: int

- (any additional metadata)

"""

pass

def chunk_dataframe(

self,

df: pd.DataFrame,

id_column: str,

source_column: str,

type_column: Optional[str] = None,

) -> pd.DataFrame:

"""

Chunk all documents in a DataFrame.

Args:

df: The DataFrame containing the documents to chunk.

id_column: The column containing the document IDs.

source_column: The column containing the document sources.

type_column: The column containing the document types.

"""

all_chunks = []

for row in df.itertuples(index=False):

chunks = self.load_parse_and_chunk(

getattr(row, source_column),

str(getattr(row, id_column)),

source_column,

getattr(row, type_column) if type_column else None,

)

all_chunks.extend(chunks)

if not all_chunks:

return pd.DataFrame(

columns=["chunk_id", "original_id", source_column, "chunk_index"]

)

return pd.DataFrame(all_chunks)

class TextChunker(BaseChunker):

"""Default chunker for plain text. Chunks by word count."""

def load_parse_and_chunk(

self,

source: Any,

source_id: str,

source_column: str,

source_type: Optional[str] = None,

) -> list[dict]:

# Load

text = self._load(source)

# Chunk by words

return self._chunk_by_words(text, source_id, source_column)

def _load(self, source: Any) -> str:

from pathlib import Path

if isinstance(source, Path) and source.exists():

return Path(source).read_text()

if isinstance(source, str):

if source.endswith(".txt") and Path(source).exists():

return Path(source).read_text()

return str(source)

def _chunk_by_words(

self, text: str, source_id: str, source_column: str

) -> list[dict]:

words = text.split()

chunks = []

step = self.config.chunk_size - self.config.chunk_overlap

if step <= 0:

raise ValueError(

f"chunk_overlap ({self.config.chunk_overlap}) must be less than "

f"chunk_size ({self.config.chunk_size})"

)

chunk_index = 0

for i in range(0, len(words), step):

chunk_words = words[i : i + self.config.chunk_size]

if len(chunk_words) < self.config.min_chunk_size:

continue

chunk_text = " ".join(chunk_words)

if self.config.max_chunk_chars:

chunk_text = chunk_text[: self.config.max_chunk_chars]

chunks.append(

{

"chunk_id": f"{source_id}_{chunk_index}",

"original_id": source_id,

source_column: chunk_text,

"chunk_index": chunk_index,

}

)

chunk_index += 1

return chunks

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

chunker.py

Latest commit

History

chunker.py

File metadata and controls