|
10 | 10 | (in-package #:autocontext) |
11 | 11 |
|
12 | 12 | (defclass auto-context () |
13 | | - ((chunks :reader chunks :documentation "A list of original text chunks.") |
14 | | - (bm25-index :reader bm25-index :documentation "The BM25 sparse index.") |
15 | | - (chunk-embeddings :reader chunk-embeddings :documentation "A magicl matrix of dense embeddings."))) |
| 13 | + ((chunks :reader chunks :initarg :chunks :documentation "A list of original text chunks.") |
| 14 | + (bm25-index :reader bm25-index :initarg :bm25-index :documentation "The BM25 sparse index.") |
| 15 | + (chunk-embeddings :reader chunk-embeddings :initarg :chunk-embeddings :documentation "A magicl matrix of dense embeddings."))) |
16 | 16 |
|
17 | 17 | ;;; Initialization |
18 | 18 |
|
|
35 | 35 | "Simple whitespace tokenizer." |
36 | 36 | (split-sequence:split-sequence #\Space text :remove-empty-subseqs t)) |
37 | 37 |
|
| 38 | +(defun list-txt-files-in-directory (dir-path) |
| 39 | + "Return a list of pathnames for *.txt files in the directory given by DIR-PATH (a string or pathname)." |
| 40 | + (let* ((base (uiop:parse-native-namestring dir-path)) |
| 41 | + ;; ensure base is a directory pathname |
| 42 | + (dir (uiop:ensure-directory-pathname base)) |
| 43 | + ;; create a wildcard pathname for *.txt under that directory |
| 44 | + (pattern (uiop:merge-pathnames* |
| 45 | + (make-pathname :name :wild :type "txt") |
| 46 | + dir))) |
| 47 | + (directory pattern))) |
| 48 | + |
| 49 | + |
| 50 | +(defun split-into-sentences (text) |
| 51 | + "Splits text into a list of sentences based on punctuation. This is a heuristic approach and may not be perfect. It tries to avoid splitting on abbreviations like 'e.g.'." |
| 52 | + (let ((sentences '()) |
| 53 | + (start 0)) |
| 54 | + (loop for i from 0 below (length text) |
| 55 | + do (when (and (member (char text i) '(#\. #\? #\!)) |
| 56 | + (or (= (1+ i) (length text)) |
| 57 | + (member (char text (1+ i)) '(#\Space #\Newline #\")))) |
| 58 | + (push (string-trim '(#\Space #\Newline) (subseq text start (1+ i))) sentences) |
| 59 | + (setf start (1+ i)))) |
| 60 | + (let ((last-part (string-trim '(#\Space #\Newline) (subseq text start)))) |
| 61 | + (when (plusp (length last-part)) |
| 62 | + (push last-part sentences))) |
| 63 | + (remove-if (lambda (s) (zerop (length s))) (nreverse sentences)))) |
| 64 | + |
| 65 | +(defun chunk-text (text &key (chunk-size 3)) |
| 66 | + "Splits text into sentences and then groups them into chunks of chunk-size sentences." |
| 67 | + (let ((sentences (split-into-sentences text))) |
| 68 | + (loop for i from 0 below (length sentences) by chunk-size |
| 69 | + collect (let* ((end (min (+ i chunk-size) (length sentences))) |
| 70 | + (sentence-group (subseq sentences i end))) |
| 71 | + (string-trim '(#\Space #\Newline) (format nil "~{~a~^ ~}" sentence-group)))))) |
| 72 | + |
38 | 73 | (defun load-and-chunk-documents (directory-path) |
39 | | - "Loads .txt files and splits them into paragraph chunks." |
40 | | - (let ((chunks ())) |
41 | | - (unless (uiop:directory-exists-p directory-path) |
42 | | - (error "Directory not found: ~a" directory-path)) |
43 | | - (dolist (file (uiop:directory-files directory-path #P"*.txt")) |
44 | | - (let* ((content (uiop:read-file-string file)) |
45 | | - (paragraphs (split-sequence:split-sequence #\Newline content |
46 | | - :count 2 |
47 | | - :remove-empty-subseqs t))) |
48 | | - (setf chunks (nconc chunks (mapcar (lambda (p) (string-trim '(#\Space #\Newline) p)) paragraphs))))) |
| 74 | + "Loads .txt files and splits them into chunks of a few sentences." |
| 75 | + (format t "&load-and-chunk-documents: directory-path=~A~%" directory-path) |
| 76 | + (let* ((chunks (remove-if (lambda (s) (string= s "")) |
| 77 | + (loop for file in (list-txt-files-in-directory directory-path) |
| 78 | + nconcing (chunk-text (uiop:read-file-string file)))))) |
49 | 79 | (format t "~&Loaded ~d text chunks." (length chunks)) |
50 | | - chunks)) |
51 | | - |
52 | | -;;; Embedding Generation (Interface to Python) |
| 80 | + ;;(format t "&load-and-chunk-documents: chunks=~A~%~%" chunks) |
| 81 | + chunks));;; Embedding Generation (Interface to Python) |
53 | 82 |
|
54 | 83 | (defun generate-embeddings (text-list) |
55 | 84 | "Calls the Python script to generate embeddings for a list of strings." |
|
58 | 87 | (json-output (uiop:run-program command |
59 | 88 | :input (make-string-input-stream input-string) |
60 | 89 | :output :string))) |
61 | | - (let ((parsed (yason:parse json-output))) |
62 | | - (pprint parsed) |
63 | | - (magicl:from-list parsed (list (length parsed) (length (first parsed))) :type 'double-float)))) |
| 90 | + (let* ((parsed (yason:parse json-output)) |
| 91 | + (num-embeddings (length parsed)) |
| 92 | + (embedding-dim (if (> num-embeddings 0) (length (first parsed)) 0)) |
| 93 | + (flat-data (apply #'append parsed))) |
| 94 | + (magicl:from-list flat-data (list num-embeddings embedding-dim) :type 'double-float)))) |
64 | 95 |
|
65 | 96 | ;;; Core Method |
66 | 97 |
|
|
69 | 100 | (/ (magicl:dot vec1 vec2) |
70 | 101 | (* (magicl:norm vec1) (magicl:norm vec2)))) |
71 | 102 |
|
| 103 | +(defun get-row-vector (matrix row-index) |
| 104 | + "Extracts a row from a matrix and returns it as a magicl vector." |
| 105 | + (let* ((num-cols (magicl:ncols matrix)) |
| 106 | + (row-elements (loop for col-index from 0 below num-cols |
| 107 | + collect (magicl:tref matrix row-index col-index)))) |
| 108 | + (magicl:from-list row-elements (list num-cols) :type (magicl:element-type matrix)))) |
| 109 | + |
72 | 110 | (defmethod get-prompt ((ac auto-context) query &key (num-results 5)) |
73 | 111 | "Retrieves context and formats it into a prompt for an LLM." |
74 | 112 | (format t "~&--- Retrieving context for query: '~a' ---" query) |
|
77 | 115 | (let* ((query-tokens (tokenize query)) |
78 | 116 | (bm25-docs (bm25:get-top-n (bm25-index ac) query-tokens num-results)) |
79 | 117 | (bm25-results (mapcar (lambda (tokens) (format nil "~{~a~^ ~}" tokens)) bm25-docs))) |
80 | | - (format t "~&BM25 found ~d keyword-based results." (length bm25-results)) |
| 118 | + (format t "~&BM25 found ~d keyword-based results." (length bm25-results)) |
| 119 | + (format t "~%~%bm25-results:~%~A~%~%" bm25-results) |
81 | 120 |
|
82 | 121 | ;; 2. Dense Search (Vector Similarity) |
83 | 122 | (let* ((query-embedding-matrix (generate-embeddings (list query))) |
84 | | - (query-vector (magicl:slice query-embedding-matrix 0 0 :n2 (magicl:ncols query-embedding-matrix))) |
| 123 | + (query-vector (get-row-vector query-embedding-matrix 0)) |
85 | 124 | (all-embeddings (chunk-embeddings ac)) |
86 | 125 | (similarities (loop for i from 0 below (magicl:nrows all-embeddings) |
87 | 126 | collect (cons (cosine-similarity query-vector |
88 | | - (magicl:slice all-embeddings i 0 :n2 (magicl:ncols all-embeddings))) |
| 127 | + (get-row-vector all-embeddings i)) |
89 | 128 | i))) |
90 | 129 | (sorted-sim (sort similarities #'> :key #'car)) |
91 | 130 | (top-indices (mapcar #'cdr (subseq sorted-sim 0 (min num-results (length sorted-sim))))) |
92 | 131 | (vector-results (mapcar (lambda (i) (nth i (chunks ac))) top-indices))) |
93 | | - (format t "~&Vector search found ~d semantic-based results." (length vector-results)) |
| 132 | + (format t "~&Vector search found ~d semantic-based results." (length vector-results)) |
| 133 | + (format t "~%~%vector-results:~%~A~%~%" vector-results) |
94 | 134 |
|
95 | 135 | ;; 3. Combine and deduplicate |
96 | 136 | (let* ((combined (append bm25-results vector-results)) |
|
107 | 147 | (defun test2 () |
108 | 148 | "A simple top-level function to demonstrate the system." |
109 | 149 | (let* ((ac (make-instance 'auto-context :directory-path "../data")) |
110 | | - (query "who was the first person to walk on the lunar surface?") |
| 150 | + (query "who says that economics is bullshit?") |
111 | 151 | (prompt (get-prompt ac query :num-results 2))) |
112 | 152 | (format t "~&~%--- Generated Prompt for LLM ---~%~a" prompt))) |
0 commit comments