Skip to content

Commit fbd0873

Browse files
authored
Merge pull request #2 from mark-watson/improve-webscraper
Refactor and enhance webscraping utility.
2 parents 8f5b22c + b157941 commit fbd0873

File tree

3 files changed

+54
-18
lines changed

3 files changed

+54
-18
lines changed

webscraping/project.clj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@
44
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
55
:url "https://www.eclipse.org/legal/epl-2.0/"}
66
:dependencies [[org.clojure/clojure "1.11.1"]
7-
[org.jsoup/jsoup "1.14.3"]]
7+
[org.jsoup/jsoup "1.17.2"]]
88
:repl-options {:init-ns webscraping.core})

webscraping/src/webscraping/core.clj

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,15 @@
55

66
(defn get-html-anchors [jsoup-web-page-contents]
77
(let [anchors (. jsoup-web-page-contents select "a[href]")]
8-
(for [anchor anchors]
9-
(try
10-
(let [anchor-text (. (first (. anchor childNodes)) text)
11-
anchor-uri-base (. (first (. anchor childNodes)) baseUri)
12-
href-attribute (. (. anchor attributes) get "href")
13-
anchor-uri
14-
(if (str/starts-with? href-attribute "http")
15-
href-attribute
16-
(str/join "" [anchor-uri-base (. (. anchor attributes) get "href")]))
17-
furi (first (. anchor childNodes))]
18-
{:text (str/trim anchor-text) :uri anchor-uri})
19-
(catch Exception e {:text (ex-message e) :uri ""})))))
8+
(->> anchors
9+
(map (fn [anchor]
10+
(try
11+
{:text (str/trim (. anchor text))
12+
:uri (. anchor absUrl "href")}
13+
(catch Exception e
14+
(binding [*out* *err*] (println (str "Error processing anchor: " (.getMessage e) " on page: " (. jsoup-web-page-contents title))))
15+
nil))))
16+
(filterv some?))))
2017

2118
(defn fetch-web-page-data
2219
"Get the <a> anchor data and full text from a web URI"
webscraping/test/webscraping/core_test.clj

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,49 @@
11
(ns webscraping.core-test
22
(:require [clojure.test :refer :all]
33
[clojure.pprint :as pp]
4-
[webscraping.core :refer :all]))
4+
[clojure.string :as str] ;; Added for string checks
5+
[webscraping.core :refer :all])
6+
(:import (org.jsoup Jsoup))) ;; Added Jsoup import
57

6-
(deftest a-test
7-
(testing
8-
"Fetch my personal website and check number key/values in results"
8+
(deftest mark-watson-website-test
9+
(testing "Fetch Mark Watson's website and verify basic anchor extraction"
910
(let [page-data (fetch-web-page-data "https://markwatson.com")]
10-
(pp/pprint page-data))))
11+
(is (string? (:page-text page-data)))
12+
(is (not (str/blank? (:page-text page-data))))
13+
(is (vector? (:anchors page-data)))
14+
(is (pos? (count (:anchors page-data))))
15+
(let [anchors (:anchors page-data)]
16+
(is (some #(str/includes? (:text %) "Read My Blog on Blogspot") anchors)
17+
"Expected to find an anchor with 'Read My Blog on Blogspot' in its text")
18+
(is (some #(= (:uri %) "https://mark-watson.blogspot.com/") anchors)
19+
"Expected to find an anchor linking to https://mark-watson.blogspot.com/")
20+
21+
(is (some #(str/includes? (str/lower-case (:text %)) "clojure") anchors)
22+
"Expected to find an anchor with 'Clojure' (case-insensitive) in its text")
23+
(is (some #(= (:uri %) "https://leanpub.com/clojureai") anchors)
24+
"Expected to find an anchor linking to Leanpub Clojure AI book")
25+
26+
;; Add a check for one more reasonably stable link, e.g., "My Books"
27+
(is (some #(and (str/includes? (:text %) "My Books")
28+
(= (:uri %) "https://markwatson.com#books"))
29+
anchors)
30+
"Expected to find an anchor 'My Books' linking to '#books'")))))
31+
32+
(deftest no-anchors-test
33+
(testing "Page with no anchor tags"
34+
(let [html-doc (Jsoup/parse "<html><body><p>No links here.</p></body></html>")
35+
anchors (get-html-anchors html-doc)]
36+
(is (empty? anchors) "Expected no anchors from HTML with no links"))))
37+
38+
(deftest relative-and-absolute-uris-test
39+
(testing "Anchor URI resolution for relative and absolute paths"
40+
(let [base-uri "http://example.com/docs/"
41+
html-content "<html><body><a href=\"/page1\">Page 1</a> <a href=\"http://domain.com/page2\">Page 2</a> <a href=\"../page3\">Page 3</a> <a href=\"sub/page4\">Page 4</a><a href=\"page5.html\">Page 5</a></body></html>"
42+
html-doc (. Jsoup parse html-content base-uri)
43+
anchors (get-html-anchors html-doc)
44+
uris (set (map :uri anchors))]
45+
(is (contains? uris "http://example.com/page1") "Relative /page1 should resolve to http://example.com/page1")
46+
(is (contains? uris "http://domain.com/page2") "Absolute http://domain.com/page2 should remain unchanged")
47+
(is (contains? uris "http://example.com/page3") "Relative ../page3 should resolve to http://example.com/page3")
48+
(is (contains? uris "http://example.com/docs/sub/page4") "Relative sub/page4 should resolve to http://example.com/docs/sub/page4")
49+
(is (contains? uris "http://example.com/docs/page5.html") "Relative page5.html should resolve to http://example.com/docs/page5.html"))))

0 commit comments

Comments
 (0)