Skip to content

Commit 0089903

Browse files
committed
Merge pull request #126 from ninniuz/homepage_extractor_fixes
Fix HomepageExtractor issues
2 parents b0817f7 + aaa04dd commit 0089903

File tree

5 files changed

+289
-32
lines changed

5 files changed

+289
-32
lines changed

core/pom.xml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,15 +105,15 @@
105105
<repositories>
106106

107107
<repository>
108-
<id>osr-public-releases</id>
109-
<name>OSR Public Releases</name>
110-
<url>http://mojo.informatik.uni-erlangen.de/nexus/content/repositories/public-releases</url>
108+
<id>osr-public-releases</id>
109+
<name>OSR Public Releases</name>
110+
<url>http://mojo.informatik.uni-erlangen.de/nexus/content/repositories/public-releases</url>
111111
</repository>
112112

113113
<repository>
114-
<id>osr-public-snapshots</id>
115-
<name>OSR Public snapshots</name>
116-
<url>http://mojo.informatik.uni-erlangen.de/nexus/content/repositories/public-snapshots</url>
114+
<id>osr-public-snapshots</id>
115+
<name>OSR Public snapshots</name>
116+
<url>http://mojo.informatik.uni-erlangen.de/nexus/content/repositories/public-snapshots</url>
117117
</repository>
118118

119119
</repositories>

core/src/main/scala/org/dbpedia/extraction/config/mappings/HomepageExtractorConfig.scala

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,4 +64,17 @@ object HomepageExtractorConfig
6464
"ru" -> "официальный"
6565
)
6666

67+
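// Map(language code -> Map(official-website template name -> key of the template property that holds the URL))
// NOTE(review): "Páxina_web" looks like a Galician ("gl") template name, but it is registered under "ga" - confirm the language code.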
68+
val templateOfficialWebsite = Map(
69+
"ca" -> Map("Oficial" -> "1"),
70+
"el" -> Map("Επίσημη ιστοσελίδα" -> "1"),
71+
"en" -> Map("Official website" -> "1"),
72+
"eo" -> Map("Oficiala_retejo" -> "1"),
73+
"es" -> Map("Página_web" -> "1"),
74+
"fr" -> Map("Site_officiel" -> "url"),
75+
"ga" -> Map("Páxina_web" -> "1"),
76+
"pt" -> Map("Oficial" -> "1"),
77+
"ru" -> Map("Официальный сайт" -> "1")
78+
)
79+
6780
}

core/src/main/scala/org/dbpedia/extraction/mappings/HomepageExtractor.scala

Lines changed: 81 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -28,44 +28,58 @@ extends PageNodeExtractor
2828

2929
private val externalLinkSections = HomepageExtractorConfig.externalLinkSectionsMap(language)
3030

31+
// Template name -> property key of the "official website" templates for this language.
// The config map only covers some languages, so fall back to an empty map instead of
// letting Map.apply throw NoSuchElementException while the extractor is being constructed.
private val templateOfficialWebsite = HomepageExtractorConfig.templateOfficialWebsite.getOrElse(language, Map.empty[String, String])
32+
3133
private val homepageProperty = context.ontology.properties("foaf:homepage")
3234

3335
private val listItemStartRegex = ("""(?msiu).*^\s*\*\s*[^^]*(\b""" + official + """\b)?[^^]*\z""").r
3436
private val officialRegex = ("(?iu)" + official).r
3537
private val officialAndLineEndRegex = ("""(?msiu)[^$]*\b""" + official + """\b.*$.*""").r
3638
private val officialAndNoLineEndRegex = ("""(?msiu)[^$]*\b""" + official + """\b[^$]*""").r
3739
private val lineEndRegex = "(?ms).*$.+".r
40+
// Similar to org.dbpedia.extraction.config.dataparser.DataParserConfig.splitPropertyNodeRegexLink - without '/' and ';'
41+
private val splitPropertyNodeLinkStrict = """<br\s*\/?>|\n| and | or |,| """
3842

3943
override val datasets = Set(DBpediaDatasets.Homepages)
4044

4145
override def extract(page: PageNode, subjectUri: String, pageContext: PageContext): Seq[Quad] =
4246
{
4347
if(page.title.namespace != Namespace.Main) return Seq.empty
44-
45-
val list = collectProperties(page).filter(p => propertyNames.contains(p.key.toLowerCase))
46-
list.foreach((property) => {
47-
property.children match
48-
{
49-
case (textNode @ TextNode(text, _)) :: _ =>
48+
49+
val list = collectProperties(page).filter(p => propertyNames.contains(p.key.toLowerCase)).flatMap {
50+
NodeUtil.splitPropertyNode(_, splitPropertyNodeLinkStrict, true)
51+
}
52+
53+
list.foreach((property) =>
54+
55+
// Find among children
56+
for (child <- property.children) {
57+
child match
5058
{
51-
val url = if (!text.startsWith("http")) "http://" + text else text
52-
val graph = generateStatement(subjectUri, pageContext, url, textNode)
53-
if (!graph.isEmpty)
59+
case (textNode @ TextNode(text, _)) =>
5460
{
55-
return graph
61+
val cleaned = cleanProperty(text)
62+
if (cleaned.nonEmpty) { // do not proceed if the property value is not a valid candidate
63+
val url = if (!cleaned.startsWith("http")) "http://" + cleaned else cleaned
64+
val graph = generateStatement(subjectUri, pageContext, url, textNode)
65+
if (!graph.isEmpty)
66+
{
67+
return graph
68+
}
69+
}
5670
}
57-
}
58-
case (linkNode @ ExternalLinkNode(destination, _, _, _)) :: _ =>
59-
{
60-
val graph = generateStatement(subjectUri, pageContext, destination.toString, linkNode)
61-
if (!graph.isEmpty)
71+
case (linkNode @ ExternalLinkNode(destination, _, _, _)) =>
6272
{
63-
return graph
73+
val graph = generateStatement(subjectUri, pageContext, destination.toString, linkNode)
74+
if (!graph.isEmpty)
75+
{
76+
return graph
77+
}
6478
}
79+
case _ =>
6580
}
66-
case _ =>
6781
}
68-
})
82+
)
6983

7084
for(externalLinkSectionChildren <- collectExternalLinkSection(page.children))
7185
{
@@ -84,6 +98,18 @@ extends PageNodeExtractor
8498
Seq.empty
8599
}
86100

101+
/**
 * Normalises a raw property value and decides whether it can be a homepage candidate.
 *
 * @param text raw text of the property value
 * @return the value without its trailing line end and surrounding whitespace, or the
 *         empty string when it does not contain a "word-char . word-char" sequence
 */
private def cleanProperty(text: String) : String = {
  // Hostnames without dots are legal in URLs but very unlikely in Wikipedia; such values
  // are usually editor notes for a missing homepage (None, N/A, missing, down, ...).
  text.stripLineEnd.trim match {
    case candidate if candidate.matches(""".*\w\.\w.*""") => candidate
    case _ => ""
  }
}
112+
87113
private def generateStatement(subjectUri: String, pageContext: PageContext, url: String, node: Node): Seq[Quad] =
88114
{
89115
try
@@ -101,20 +127,50 @@ extends PageNodeExtractor
101127
Seq.empty
102128
}
103129

130+
/**
 * Builds a URL string from the text and external-link children of a template property.
 *
 * The property value can show up in two shapes:
 *   1) {{template | key = example.com }}        => PropertyNode("key", List(TextNode("example.com", _)))
 *   2) {{template | key = http://example.com }} => PropertyNode("key", List(ExternalLinkNode(URI("http://example.com"), ...)))
 *
 * @param node the template property to inspect
 * @return the URL, prefixed with "http://" when it carries no scheme; None when the
 *         property yields no text or the text cannot be parsed as a URI
 */
private def extractUrlFromProperty(node: PropertyNode): Option[String] = {
  // Only text and external-link children contribute; every other child is ignored.
  val fragments = node.children.collect {
    case TextNode(t, _) => t
    case ExternalLinkNode(destination, _, _, _) => destination.toString
  }
  val candidate = fragments.mkString.trim

  if (candidate.nonEmpty) {
    try {
      val parsed = new URI(candidate)
      val rendered = parsed.toString
      Some(if (parsed.getScheme == null) "http://" + rendered else rendered)
    } catch {
      case _ : Exception => None
    }
  } else {
    None
  }
}
157+
104158
private def findLinkTemplateInSection(nodes: List[Node]): Option[(String, Node)] =
105159
{
106160
// TODO: use for-loop instead of recursion
107161
nodes match
108162
{
109-
// TODO: use language-specific name
110-
case (templateNode @ TemplateNode(title, _, _, _)) :: _
111-
if ((title.decoded == "Official") || ((context.redirects.map.contains(title.decoded)) && (context.redirects.map(title.decoded) == "Official"))) =>
163+
case (templateNode @ TemplateNode(title, _, _, _)) :: tail =>
112164
{
113-
templateNode.property("1") match
114-
{
115-
case Some(propertyNode) => propertyNode.retrieveText.map(url => (url, propertyNode))
116-
case _ => None
165+
val templateRedirect = context.redirects.resolve(title).decoded
166+
if (templateOfficialWebsite.contains(templateRedirect)) {
167+
templateNode.property(templateOfficialWebsite(templateRedirect)) match
168+
{
169+
case Some(propertyNode) => extractUrlFromProperty(propertyNode).map(url => (url, propertyNode))
170+
case None => findLinkTemplateInSection(tail) // do not stop the recursion - there might be other templates
171+
}
117172
}
173+
else findLinkTemplateInSection(tail)
118174
}
119175
case head :: tail => findLinkTemplateInSection(tail)
120176
case Nil => None
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
package org.dbpedia.extraction.mappings
2+
3+
import org.dbpedia.extraction.wikiparser._
4+
import org.dbpedia.extraction.sources.{WikiPage, XMLSource}
5+
import org.dbpedia.extraction.destinations.{QuadBuilder, Quad}
6+
import org.dbpedia.extraction.destinations.formatters.TerseFormatter
7+
import org.dbpedia.extraction.ontology.io.OntologyReader
8+
import io.Source
9+
import org.dbpedia.extraction.util.Language
10+
import java.io.{FilenameFilter, File}
11+
import scala.collection.mutable.ArrayBuffer
12+
import org.junit.Test
13+
import org.dbpedia.extraction.ontology.{OntologyProperty, Ontology}
14+
import org.scalatest.FlatSpec
15+
import org.scalatest.matchers.ShouldMatchers
16+
import org.scalatest.junit.JUnitRunner
17+
import org.junit.runner.RunWith
18+
import org.dbpedia.extraction.dataparser.BooleanParser
19+
20+
/**
 * Tests for [[HomepageExtractor]]: every case parses a small wiki snippet for the page
 * "TestPage" and compares the extracted quads with the expected foaf:homepage statements.
 */
@RunWith(classOf[JUnitRunner])
class HomepageExtractorTest extends FlatSpec with ShouldMatchers
{
  // Tests
  "HomepageExtractor" should "return http://example.com from ExternalLink in External links section" in {

    parse(
      """
        |==External links==
        |
        |* [http://example.com Official website]
        |
      """.stripMargin, "TestPage", Language.English) should equal (expectedQuads("http://example.com"))
  }

  "HomepageExtractor" should """return http://example.com from Template 'Official website' in External links section""" in {

    parse(
      """
        |==External links==
        |
        |* {{Official website|example.com}}
        |
      """.stripMargin, "TestPage", Language.English) should equal (expectedQuads("http://example.com"))
  }

  it should """return http://correct.example.com from Template 'Official website' in External links section""" in {

    // The first template has no property "1", so the extractor must move on to the second one.
    parse(
      """
        |==External links==
        |
        |* {{Official website|2=example.com}}
        |* {{Official website|correct.example.com}}
        |
      """.stripMargin, "TestPage", Language.English) should equal (expectedQuads("http://correct.example.com"))
  }

  "HomepageExtractor" should """return http://example.com from Template 'Official' in External links section""" in {

    // "Official" is registered as a redirect to "Official website" in the test context (see parse).
    parse(
      """
        |==External links==
        |
        |* {{Official|http://example.com}}
        |
      """.stripMargin, "TestPage", Language.English) should equal (expectedQuads("http://example.com"))
  }

  "HomepageExtractor" should """return http://example.com from Template property 'website = example.com'""" in {

    parse("""{{Infobox | website = example.com}}""", "TestPage", Language.English) should equal (expectedQuads("http://example.com"))
  }

  "HomepageExtractor" should """return http://example.com from Template property 'website = http://example.com'""" in {

    parse("""{{Infobox | website = http://example.com}}""", "TestPage", Language.English) should equal (expectedQuads("http://example.com"))
  }

  "HomepageExtractor" should """return http://example.com from Template property 'website = http://example.com or http://or.com'""" in {

    parse("""{{Infobox | website = http://example.com or http://or.com}}""", "TestPage", Language.English) should equal (expectedQuads("http://example.com"))
  }

  it should """return Seq.empty from Template property 'website = N/A'""" in {

    parse("""{{Infobox | website = N/A}}""", "TestPage", Language.English) should equal (Seq.empty)
  }

  // end of tests

  private val parser = WikiParser.getInstance()

  /**
   * Expected extraction result for a homepage: one quad per dataset declared by the extractor,
   * with subject "TestPage" and the stub foaf:homepage property as predicate.
   *
   * @param homepage the URL expected as the quad value
   * @param lang language of the expected quads
   */
  private def expectedQuads(homepage: String, lang: Language = Language.English) : Seq[Quad] =
    HomepageExtractorTest.datasets.toList.map(dataset =>
      new Quad(lang, dataset, "TestPage", HomepageExtractorTest.homepageProperty, homepage, null, null)
    )

  /**
   * Parses `input` as the source of the page `title` and runs a fresh HomepageExtractor on it.
   *
   * @param input wiki markup of the page
   * @param title page title
   * @param lang language of the page and of the extractor context
   * @return the quads produced by the extractor for subject "TestPage"
   */
  private def parse(input : String, title: String = "TestPage", lang: Language = Language.English) : Seq[Quad] =
  {
    val page = new WikiPage(WikiTitle.parse(title, lang), input)
    // Minimal extraction context: stub ontology plus a single redirect "Official" -> "Official website".
    val context = new {
      def ontology = HomepageExtractorTest.ontology;
      def language = lang;
      def redirects = new Redirects(Map("Official" -> "Official website"))
    }

    val extractor = new HomepageExtractor(context)

    extractor.extract(parser(page),"TestPage", new PageContext())
  }
}
150+
151+
object HomepageExtractorTest {

  // The extractor looks up "foaf:homepage" in the ontology, so a stub property is needed.
  private val homepageProperty = new OntologyProperty(
    "Foaf:homepage",                      // name
    Map(Language.English -> "homepage"),  // labels
    Map(),                                // comments
    null,                                 // domain
    null,                                 // range
    false,                                // isFunctional
    Set()                                 // equivalentProperties
  )

  // Ontology that contains nothing but the stub homepage property.
  private val ontology = new Ontology(
    Map(),                                      // classes
    Map("foaf:homepage" -> homepageProperty),   // properties
    Map(),                                      // datatypes
    Map(),                                      // specializations
    Map(),                                      // equivalentPropertiesMap
    Map()                                       // equivalentClassesMap
  )

  // Datasets declared by the extractor, read from an instance built on a stub context.
  private val datasets = {
    val stubContext = new {
      def ontology = HomepageExtractorTest.ontology;
      def language = Language.English;
      def redirects = null
    }
    new HomepageExtractor(stubContext).datasets
  }
}

pom.xml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,8 @@
162162
<scope>test</scope>
163163
</dependency>
164164

165-
</dependencies>
165+
</dependencies>
166+
166167
</dependencyManagement>
167168

168169
<distributionManagement>
@@ -179,3 +180,4 @@
179180
</distributionManagement>
180181

181182
</project>
183+

0 commit comments

Comments
 (0)