Skip to content

Commit e430bc4

Browse files
committed
code seems to be working (at least for test cases)
1 parent c16dbff commit e430bc4

File tree

3 files changed

+67
-26
lines changed

3 files changed

+67
-26
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
*.fasl
33
*~
44
system-index.txt
5+
.aider*

src/autocontext/bm25.lisp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
"Calculates the IDF for a given term."
5757
(let* ((doc-freq (gethash term (doc-freqs index) 0))
5858
(corpus-size (corpus-size index)))
59-
(log (/ (- corpus-size doc-freq 0.5) (+ doc-freq 0.5)) 10)))
59+
(log (/ (+ (- corpus-size doc-freq) 0.5) (+ doc-freq 0.5)) 10)))
6060

6161
(defmethod score-doc ((index bm25-index) query-tokens doc-index)
6262
"Calculates the BM25 score for a single document."
@@ -70,5 +70,5 @@
7070
(let* ((term-freq (count term doc :test #'string=))
7171
(idf (inverse-document-frequency index term)))
7272
(incf score (* idf (/ (* term-freq (+ k1 1))
73-
(+ term-freq (* k1 (- 1 b (* b (/ doc-length avg-dl))))))))))
73+
(+ term-freq (* k1 (+ (- 1 b) (* b (/ doc-length avg-dl))))))))))
7474
score))

src/autocontext/main.lisp

Lines changed: 64 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
(in-package #:autocontext)
1111

1212
(defclass auto-context ()
13-
((chunks :reader chunks :documentation "A list of original text chunks.")
14-
(bm25-index :reader bm25-index :documentation "The BM25 sparse index.")
15-
(chunk-embeddings :reader chunk-embeddings :documentation "A magicl matrix of dense embeddings.")))
13+
((chunks :reader chunks :initarg :chunks :documentation "A list of original text chunks.")
14+
(bm25-index :reader bm25-index :initarg :bm25-index :documentation "The BM25 sparse index.")
15+
(chunk-embeddings :reader chunk-embeddings :initarg :chunk-embeddings :documentation "A magicl matrix of dense embeddings.")))
1616

1717
;;; Initialization
1818

@@ -35,21 +35,50 @@
3535
"Simple whitespace tokenizer."
3636
(split-sequence:split-sequence #\Space text :remove-empty-subseqs t))
3737

38+
(defun list-txt-files-in-directory (dir-path)
39+
"Return a list of pathnames for *.txt files in the directory given by DIR-PATH (a string or pathname)."
40+
(let* ((base (uiop:parse-native-namestring dir-path))
41+
;; ensure base is a directory pathname
42+
(dir (uiop:ensure-directory-pathname base))
43+
;; create a wildcard pathname for *.txt under that directory
44+
(pattern (uiop:merge-pathnames*
45+
(make-pathname :name :wild :type "txt")
46+
dir)))
47+
(directory pattern)))
48+
49+
50+
(defun split-into-sentences (text)
51+
"Splits text into a list of sentences based on punctuation. This is a heuristic approach and may not be perfect. It tries to avoid splitting on abbreviations like 'e.g.'."
52+
(let ((sentences '())
53+
(start 0))
54+
(loop for i from 0 below (length text)
55+
do (when (and (member (char text i) '(#\. #\? #\!))
56+
(or (= (1+ i) (length text))
57+
(member (char text (1+ i)) '(#\Space #\Newline #\"))))
58+
(push (string-trim '(#\Space #\Newline) (subseq text start (1+ i))) sentences)
59+
(setf start (1+ i))))
60+
(let ((last-part (string-trim '(#\Space #\Newline) (subseq text start))))
61+
(when (plusp (length last-part))
62+
(push last-part sentences)))
63+
(remove-if (lambda (s) (zerop (length s))) (nreverse sentences))))
64+
65+
(defun chunk-text (text &key (chunk-size 3))
66+
"Splits text into sentences and then groups them into chunks of chunk-size sentences."
67+
(let ((sentences (split-into-sentences text)))
68+
(loop for i from 0 below (length sentences) by chunk-size
69+
collect (let* ((end (min (+ i chunk-size) (length sentences)))
70+
(sentence-group (subseq sentences i end)))
71+
(string-trim '(#\Space #\Newline) (format nil "~{~a~^ ~}" sentence-group))))))
72+
3873
(defun load-and-chunk-documents (directory-path)
39-
"Loads .txt files and splits them into paragraph chunks."
40-
(let ((chunks ()))
41-
(unless (uiop:directory-exists-p directory-path)
42-
(error "Directory not found: ~a" directory-path))
43-
(dolist (file (uiop:directory-files directory-path #P"*.txt"))
44-
(let* ((content (uiop:read-file-string file))
45-
(paragraphs (split-sequence:split-sequence #\Newline content
46-
:count 2
47-
:remove-empty-subseqs t)))
48-
(setf chunks (nconc chunks (mapcar (lambda (p) (string-trim '(#\Space #\Newline) p)) paragraphs)))))
74+
"Loads .txt files and splits them into chunks of a few sentences."
75+
(format t "&load-and-chunk-documents: directory-path=~A~%" directory-path)
76+
(let* ((chunks (remove-if (lambda (s) (string= s ""))
77+
(loop for file in (list-txt-files-in-directory directory-path)
78+
nconcing (chunk-text (uiop:read-file-string file))))))
4979
(format t "~&Loaded ~d text chunks." (length chunks))
50-
chunks))
51-
52-
;;; Embedding Generation (Interface to Python)
80+
;;(format t "&load-and-chunk-documents: chunks=~A~%~%" chunks)
81+
chunks));;; Embedding Generation (Interface to Python)
5382

5483
(defun generate-embeddings (text-list)
5584
"Calls the Python script to generate embeddings for a list of strings."
@@ -58,9 +87,11 @@
5887
(json-output (uiop:run-program command
5988
:input (make-string-input-stream input-string)
6089
:output :string)))
61-
(let ((parsed (yason:parse json-output)))
62-
(pprint parsed)
63-
(magicl:from-list parsed (list (length parsed) (length (first parsed))) :type 'double-float))))
90+
(let* ((parsed (yason:parse json-output))
91+
(num-embeddings (length parsed))
92+
(embedding-dim (if (> num-embeddings 0) (length (first parsed)) 0))
93+
(flat-data (apply #'append parsed)))
94+
(magicl:from-list flat-data (list num-embeddings embedding-dim) :type 'double-float))))
6495

6596
;;; Core Method
6697

@@ -69,6 +100,13 @@
69100
(/ (magicl:dot vec1 vec2)
70101
(* (magicl:norm vec1) (magicl:norm vec2))))
71102

103+
(defun get-row-vector (matrix row-index)
104+
"Extracts a row from a matrix and returns it as a magicl vector."
105+
(let* ((num-cols (magicl:ncols matrix))
106+
(row-elements (loop for col-index from 0 below num-cols
107+
collect (magicl:tref matrix row-index col-index))))
108+
(magicl:from-list row-elements (list num-cols) :type (magicl:element-type matrix))))
109+
72110
(defmethod get-prompt ((ac auto-context) query &key (num-results 5))
73111
"Retrieves context and formats it into a prompt for an LLM."
74112
(format t "~&--- Retrieving context for query: '~a' ---" query)
@@ -77,20 +115,22 @@
77115
(let* ((query-tokens (tokenize query))
78116
(bm25-docs (bm25:get-top-n (bm25-index ac) query-tokens num-results))
79117
(bm25-results (mapcar (lambda (tokens) (format nil "~{~a~^ ~}" tokens)) bm25-docs)))
80-
(format t "~&BM25 found ~d keyword-based results." (length bm25-results))
118+
(format t "~&BM25 found ~d keyword-based results." (length bm25-results))
119+
(format t "~%~%bm25-results:~%~A~%~%" bm25-results)
81120

82121
;; 2. Dense Search (Vector Similarity)
83122
(let* ((query-embedding-matrix (generate-embeddings (list query)))
84-
(query-vector (magicl:slice query-embedding-matrix 0 0 :n2 (magicl:ncols query-embedding-matrix)))
123+
(query-vector (get-row-vector query-embedding-matrix 0))
85124
(all-embeddings (chunk-embeddings ac))
86125
(similarities (loop for i from 0 below (magicl:nrows all-embeddings)
87126
collect (cons (cosine-similarity query-vector
88-
(magicl:slice all-embeddings i 0 :n2 (magicl:ncols all-embeddings)))
127+
(get-row-vector all-embeddings i))
89128
i)))
90129
(sorted-sim (sort similarities #'> :key #'car))
91130
(top-indices (mapcar #'cdr (subseq sorted-sim 0 (min num-results (length sorted-sim)))))
92131
(vector-results (mapcar (lambda (i) (nth i (chunks ac))) top-indices)))
93-
(format t "~&Vector search found ~d semantic-based results." (length vector-results))
132+
(format t "~&Vector search found ~d semantic-based results." (length vector-results))
133+
(format t "~%~%vector-results:~%~A~%~%" vector-results)
94134

95135
;; 3. Combine and deduplicate
96136
(let* ((combined (append bm25-results vector-results))
@@ -107,6 +147,6 @@
107147
(defun test2 ()
108148
"A simple top-level function to demonstrate the system."
109149
(let* ((ac (make-instance 'auto-context :directory-path "../data"))
110-
(query "who was the first person to walk on the lunar surface?")
150+
(query "who says that economics is bullshit?")
111151
(prompt (get-prompt ac query :num-results 2)))
112152
(format t "~&~%--- Generated Prompt for LLM ---~%~a" prompt)))

0 commit comments

Comments
 (0)