Skip to content

Commit 5b31dd5

Browse files
authored
Merge pull request #649 from jlareck/rebaseTemp
Implemented Wikimedia Commons extraction
2 parents c75710a + 31c9f1e commit 5b31dd5

File tree

2 files changed

+335
-0
lines changed

2 files changed

+335
-0
lines changed
Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
package org.dbpedia.extraction.mappings
2+
import org.dbpedia.extraction.config.provenance.DBpediaDatasets
3+
import org.dbpedia.extraction.transform.Quad
4+
5+
import collection.mutable.HashSet
6+
import org.dbpedia.extraction.ontology.datatypes.{Datatype, DimensionDatatype}
7+
import org.dbpedia.extraction.wikiparser._
8+
import org.dbpedia.extraction.dataparser._
9+
import org.dbpedia.extraction.util.RichString.wrapString
10+
import org.dbpedia.extraction.ontology.Ontology
11+
import org.dbpedia.extraction.util._
12+
import org.dbpedia.extraction.config.mappings.InfoboxExtractorConfig
13+
14+
import scala.collection.mutable.ArrayBuffer
15+
import org.dbpedia.extraction.config.dataparser.DataParserConfig
16+
import org.dbpedia.iri.UriUtils
17+
18+
import scala.language.reflectiveCalls
19+
/**
20+
Wikimedia Commons Extractor extracts data
21+
from Wikimedia Commons articles infoboxes.
22+
It is based on Infobox Extractor (this is the
23+
copy of it but with permissions extraction)
24+
**/
25+
class WikimediaCommonsInfoboxExtractor(context : {
26+
def ontology : Ontology
27+
def language : Language
28+
def redirects : Redirects
29+
}
30+
)
31+
extends PageNodeExtractor
32+
{
33+
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
34+
// Configuration
35+
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
36+
37+
// Handles taken from the injected extraction context.
private val ontology = context.ontology
private val language = context.language
private val wikiCode = language.wikiCode

// Thresholds and ignore lists are read from the shared infobox extractor configuration.
// minPropertyCount / minRatioOfExplicitPropertyKeys decide in extract() whether a template
// has enough explicitly named keys to be processed at all.
private val minPropertyCount = InfoboxExtractorConfig.minPropertyCount
private val minRatioOfExplicitPropertyKeys = InfoboxExtractorConfig.minRatioOfExplicitPropertyKeys

// Templates skipped by exact (lower-cased, redirect-resolved) title or by regex.
private val ignoreTemplates = InfoboxExtractorConfig.ignoreTemplates
private val ignoreTemplatesRegex = InfoboxExtractorConfig.ignoreTemplatesRegex

// Property keys to drop, looked up by wiki code in extract().
private val ignoreProperties = InfoboxExtractorConfig.ignoreProperties

// Ontology entries used when building quads.
private val xsdStringDt = ontology.datatypes("xsd:string")
private val labelProperty = ontology.properties("rdfs:label")
private val typeProperty = ontology.properties("rdf:type")
private val propertyClass = ontology.classes("rdf:Property")
private val rdfLangStrDt = ontology.datatypes("rdf:langString")
58+
59+
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
60+
// Regexes
61+
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
62+
63+
// TODO: i18n
64+
// Matched against a property's string value in extractRankNumber.
private val RankRegex = InfoboxExtractorConfig.RankRegex

// Word separator pattern used when camel-casing property keys in getPropertyUri.
private val SplitWordsRegex = InfoboxExtractorConfig.SplitWordsRegex

// Stripped from property names so that e.g. LeaderName1, LeaderName2 collapse to LeaderName.
private val TrailingNumberRegex = InfoboxExtractorConfig.TrailingNumberRegex

// Separator pattern used to split multi-valued property nodes. Wikis without their own
// entry fall back to the English pattern; the fallback lookup is only evaluated when needed
// and fails with NoSuchElementException if the "en" entry is missing, as before.
private val splitPropertyNodeRegexInfobox =
  DataParserConfig.splitPropertyNodeRegexInfobox.getOrElse(wikiCode, DataParserConfig.splitPropertyNodeRegexInfobox("en"))
73+
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
74+
// Parsers
75+
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
76+
77+
// One unit-value parser per dimension datatype of the ontology.
// NOTE(review): the trailing `true` passed to the parsers below is presumably their strict mode — confirm.
private val unitValueParsers = ontology.datatypes.values
  .filter(datatype => datatype.isInstanceOf[DimensionDatatype])
  .map(dimension => new UnitValueParser(context, dimension, true))

// The range predicate accepts every integer (i % 1 is always 0).
private val intParser = new IntegerParser(context, true, validRange = (i => i % 1 == 0))

private val doubleParser = new DoubleParser(context, true)

// Date parsers are tried in this order by extractDate; xsd:gYear and xsd:gDay are deliberately left out.
private val dateTimeParsers = List("xsd:date", "xsd:gMonthYear", "xsd:gMonthDay", "xsd:gMonth" /*, "xsd:gYear", "xsd:gDay"*/)
  .map(datatypeName => new DateTimeParser(context, new Datatype(datatypeName), true))

private val singleGeoCoordinateParser = new SingleGeoCoordinateParser(context)

private val objectParser = new ObjectParser(context, true)

private val linkParser = new LinkParser(true)
93+
94+
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
95+
// State
96+
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
97+
98+
// Property URIs for which extract() has already emitted a definition (type + label).
// Access is guarded by seenProperties.synchronized in extract().
private val seenProperties = HashSet[String]()

// Datasets this extractor writes quads to.
override val datasets = Set(
  DBpediaDatasets.InfoboxProperties,
  DBpediaDatasets.InfoboxTest,
  DBpediaDatasets.InfoboxPropertyDefinitions
)
101+
102+
/**
 * Extracts infobox property quads from the templates of a page.
 *
 * Pages outside the main namespace are skipped unless
 * ExtractorUtils.titleContainsCommonsMetadata accepts their title. For every template that is
 * not ignored, each explicitly named property is cleaned, split into single values and parsed
 * by extractValue. Every parsed value yields an InfoboxProperties quad and, if template
 * statistics are enabled, an InfoboxTest quad. The first time a property URI is seen by this
 * extractor instance, its definition (rdf:type and rdfs:label) is emitted as well.
 *
 * @param node       the page to extract from
 * @param subjectUri subject URI used for the generated property quads
 * @return the generated quads; empty if the page is skipped or nothing could be extracted
 */
override def extract(node : PageNode, subjectUri : String) : Seq[Quad] =
{
  if(node.title.namespace != Namespace.Main && !ExtractorUtils.titleContainsCommonsMetadata(node.title)) return Seq.empty

  val quads = new ArrayBuffer[Quad]()

  // The ignore list depends only on the wiki code, so resolve it at most once per page.
  // It stays lazy so that the lookup (and its English fallback) still only happens when
  // a template with properties is actually processed.
  lazy val ignoredPropertyKeys = ignoreProperties.get(wikiCode).getOrElse(ignoreProperties("en"))

  // Retrieve all templates on the page which are not ignored
  for { template <- InfoboxExtractor.collectTemplates(node)
        resolvedTitle = context.redirects.resolve(template.title).decoded.toLowerCase
        if !ignoreTemplates.contains(resolvedTitle)
        if !ignoreTemplatesRegex.exists(regex => regex.unapplySeq(resolvedTitle).isDefined)
  }
  {
    val propertyList = template.children.filterNot(property => ignoredPropertyKeys.contains(property.key.toLowerCase))

    // Properties whose key is not purely numeric, i.e. explicitly named parameters.
    val explicitProperties = propertyList.filterNot(property => property.key.forall(_.isDigit))

    // check how many property keys are explicitly defined
    val countExplicitPropertyKeys = explicitProperties.size
    if ((countExplicitPropertyKeys >= minPropertyCount) && (countExplicitPropertyKeys.toDouble / propertyList.size) > minRatioOfExplicitPropertyKeys)
    {
      for(property <- explicitProperties)
      {
        // TODO clean HTML

        val cleanedPropertyNode = NodeUtil.removeParentheses(property)

        val splitPropertyNodes = NodeUtil.splitPropertyNode(cleanedPropertyNode, splitPropertyNodeRegexInfobox)

        //sh: removed pr.unit.nonEmpty as it kicked out all objectproperty triples from wikilinks,
        // didn't test for further side-effects seems to work
        for(splitNode <- splitPropertyNodes; pr <- extractValue(splitNode))
        {
          val propertyUri = getPropertyUri(property.key)
          try
          {
            //sh: pr.unit should be empty (null) for objects
            quads += new Quad(language, DBpediaDatasets.InfoboxProperties, subjectUri, propertyUri, pr.value, splitNode.sourceIri, pr.unit.getOrElse(null))

            if (InfoboxExtractorConfig.extractTemplateStatistics)
            {
              val stat_template = language.resourceUri.append(template.title.decodedWithNamespace)
              val stat_property = property.key.replace("\n", " ").replace("\t", " ").trim
              quads += new Quad(language, DBpediaDatasets.InfoboxTest, subjectUri, stat_template,
                stat_property, node.sourceIri, xsdStringDt)
            }
          }
          catch
          {
            // Best effort: a value that cannot be turned into a quad is reported and skipped.
            case ex : IllegalArgumentException => println(ex)
          }

          // Emit the property definition only once per extractor instance.
          seenProperties.synchronized
          {
            if (!seenProperties.contains(propertyUri))
            {
              val propertyLabel = getPropertyLabel(property.key)
              seenProperties += propertyUri
              quads += new Quad(language, DBpediaDatasets.InfoboxPropertyDefinitions, propertyUri, typeProperty, propertyClass.uri, splitNode.sourceIri)
              quads += new Quad(language, DBpediaDatasets.InfoboxPropertyDefinitions, propertyUri, labelProperty, propertyLabel, splitNode.sourceIri, rdfLangStrDt)
            }
          }
        }
      }
    }
  }

  quads
}
172+
173+
/**
 * Parses a single property value by trying the specialised extractors in a fixed order:
 * unit value, permission, dates, single coordinate, number, rank number, links.
 * The first extractor that yields a result wins; if none does, the value is returned
 * as a plain string typed rdf:langString (or an empty list if even that fails).
 *
 * @param node the (already split) property node to parse
 * @return the parsed values, possibly empty
 */
private def extractValue(node : PropertyNode) : List[ParseResult[String]] =
{
  // Wraps a single optional result into the list shape this method returns.
  def single(result : Option[ParseResult[String]]) : Option[List[ParseResult[String]]] =
    result.map(List(_))

  // Keeps a multi-valued result only when it actually contains something.
  def nonEmpty(results : List[ParseResult[String]]) : Option[List[ParseResult[String]]] =
    if (results.nonEmpty) Some(results) else None

  // TODO don't convert to SI units (what happens to {{convert|25|kg}} ?)
  // orElse / getOrElse take their argument by name, so each extractor only runs
  // when all previous ones produced nothing.
  single(extractUnitValue(node))
    .orElse(single(extractPermission(node)))
    .orElse(nonEmpty(extractDates(node)))
    .orElse(single(extractSingleCoordinate(node)))
    .orElse(single(extractNumber(node)))
    .orElse(single(extractRankNumber(node)))
    .orElse(nonEmpty(extractLinks(node)))
    .getOrElse(StringParser.parse(node).map(value => ParseResult(value.value, None, Some(rdfLangStrDt))).toList)
}
194+
195+
/**
 * Extracts a license link from a permission property.
 *
 * Only applies to properties whose key contains "permission" and whose first child is a
 * template with a non-empty title; the template title is appended to the rdflicense base URI.
 * Any other shape (no children, first child is text or a link, empty title) yields None
 * instead of failing, so the caller can fall through to the remaining extractors.
 *
 * @param node the property node to inspect
 * @return the license link, or None if the node does not describe a permission template
 */
private def extractPermission(node: PropertyNode) : Option[ParseResult[String]] =
{
  if (!node.key.contains("permission")) None
  else node.children.headOption.collect {
    case item: TemplateNode if !item.title.decoded.isEmpty =>
      ParseResult("http://purl.oclc.org/NET/rdflicense/" + item.title.decoded)
  }
}
210+
211+
/**
 * Tries every unit-value parser on the node.
 *
 * Exactly one match yields that value (as a string) together with its unit. More than one
 * match falls back to the plain string value typed rdf:langString (presumably because the
 * unit is ambiguous). No match yields None.
 *
 * @param node the property node to parse
 * @return the parsed value, or None if no unit-value parser matched
 */
private def extractUnitValue(node : PropertyNode) : Option[ParseResult[String]] =
{
  val matches = unitValueParsers.flatMap(parser => parser.parse(node))

  matches.size match
  {
    case 0 => None
    case 1 =>
      val only = matches.head
      Some(ParseResult(only.value.toString, None, only.unit))
    case _ => StringParser.parse(node).map(value => ParseResult(value.value, None, Some(rdfLangStrDt)))
  }
}
232+
233+
/**
 * Parses the node as a number, preferring an integer over a double.
 * The double parser only runs when the integer parser found nothing.
 *
 * @param node the property node to parse
 * @return the number as a string typed xsd:integer or xsd:double, or None
 */
private def extractNumber(node : PropertyNode) : Option[ParseResult[String]] =
{
  val asInteger = intParser.parse(node)
    .map(parsed => ParseResult(parsed.value.toString, None, Some(new Datatype("xsd:integer"))))

  asInteger.orElse(
    doubleParser.parse(node)
      .map(parsed => ParseResult(parsed.value.toString, None, Some(new Datatype("xsd:double"))))
  )
}
239+
240+
/**
 * Extracts the number captured by RankRegex from the node's string value.
 *
 * StringParser.parse yields a result wrapper (its `.value` is read elsewhere in this class),
 * so the regex has to be applied to the wrapped string. Matching the wrapper itself against
 * the regex can never succeed.
 *
 * @param node the property node to parse
 * @return the captured number typed xsd:integer, or None if the value does not match RankRegex
 */
private def extractRankNumber(node : PropertyNode) : Option[ParseResult[String]] =
{
  StringParser.parse(node).map(_.value).collect
  {
    case RankRegex(number) => ParseResult(number, None, Some(new Datatype("xsd:integer")))
  }
}
248+
249+
/**
 * Parses the node as a single geo coordinate.
 *
 * @param node the property node to parse
 * @return the coordinate as a double rendered to a string and typed xsd:double, or None
 */
private def extractSingleCoordinate(node : PropertyNode) : Option[ParseResult[String]] =
{
  singleGeoCoordinateParser.parse(node).map
  {
    coordinate => ParseResult(coordinate.value.toDouble.toString, None, Some(new Datatype("xsd:double")))
  }
}
254+
255+
/**
 * Extracts one or several dates from the node.
 *
 * The whole node is tried as a single date first. Otherwise it is split on dashes, commas
 * and semicolons, and the parts are accepted only if every one of them parses as a date.
 *
 * @param node the property node to parse
 * @return the parsed dates, or an empty list if the node is not made up of dates only
 */
private def extractDates(node : PropertyNode) : List[ParseResult[String]] =
{
  extractDate(node) match
  {
    case Some(wholeDate) => List(wholeDate)
    case None =>
      //Split the node. Note that even if some of these hyphens are looking similar, they represent different Unicode numbers.
      val parts = NodeUtil.splitPropertyNode(node, "(—|–|-|&mdash;|&ndash;|,|;)")
      val partDates = parts.flatMap(extractDate(_))

      // All-or-nothing: a single unparsable part discards the whole result.
      if (partDates.size == parts.size) partDates else List.empty
  }
}
271+
272+
/**
 * Parses the node as a single date, trying the date-time parsers in their declared order
 * and stopping at the first one that succeeds.
 *
 * @param node the property node to parse
 * @return the date as a string typed with the datatype of the parsed value, or None
 */
private def extractDate(node : PropertyNode) : Option[ParseResult[String]] =
{
  // The iterator keeps the lookup lazy: later parsers are not invoked once one has matched.
  dateTimeParsers.iterator
    .map(parser => parser.parse(node))
    .collectFirst
    {
      case Some(date) => ParseResult(date.value.toString, None, Some(date.value.datatype))
    }
}
281+
282+
/**
 * Extracts links from the node after splitting it on runs of non-word characters.
 *
 * The parts are first parsed with the object parser. If not every part parses, they are
 * parsed with the link parser instead and the results are passed through
 * UriUtils.cleanLink, dropping links for which it returns nothing. In both cases the
 * result is accepted only if every part produced a link.
 *
 * @param node the property node to parse
 * @return the extracted links, or an empty list if the node is not made up of links only
 */
private def extractLinks(node : PropertyNode) : List[ParseResult[String]] =
{
  val parts = NodeUtil.splitPropertyNode(node, """\s*\W+\s*""")

  // TODO: explain why we check links.size == splitNodes.size
  val objectLinks = parts.flatMap(part => objectParser.parse(part))
  if (objectLinks.size == parts.size)
  {
    objectLinks
  }
  else
  {
    val plainLinks = parts.flatMap(part => linkParser.parse(part))
    if (plainLinks.size == parts.size)
      plainLinks.map(link => UriUtils.cleanLink(link.value)).collect{case Some(cleaned) => ParseResult(cleaned)}
    else
      List.empty
  }
}
300+
301+
/**
 * Builds the property URI for an infobox key.
 *
 * The key is lower-cased and trimmed, camel-cased using SplitWordsRegex, stripped of a
 * trailing number (LeaderName1, LeaderName2, ... become LeaderName), cleaned with
 * WikiUtil.cleanSpace and appended to the language's property namespace.
 *
 * @param key the raw infobox property key
 * @return the property URI
 */
private def getPropertyUri(key : String) : String =
{
  val normalizedKey = key.toLowerCase(language.locale).trim
  val camelCased = normalizedKey.toCamelCase(SplitWordsRegex, language.locale)
  val withoutTrailingNumber = TrailingNumberRegex.replaceFirstIn(camelCased, "")

  language.propertyUri.append(WikiUtil.cleanSpace(withoutTrailingNumber))
}
314+
315+
/**
 * Builds a human-readable label for an infobox key: underscores become spaces and a
 * trailing number is removed (LeaderName1, LeaderName2, ... become LeaderName).
 * Case and other characters are left as they are.
 *
 * @param key the raw infobox property key
 * @return the label for the property
 */
private def getPropertyLabel(key : String) : String =
{
  val spaced = key.replace("_", " ")
  TrailingNumberRegex.replaceFirstIn(spaced, "")
}
327+
328+
329+
}

dump/extraction.commons.properties

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
2+
require-download-complete=false
3+
4+
languages=commons
5+
6+
extractors=.WikimediaCommonsInfoboxExtractor, .GeoExtractor, .ArticleCategoriesExtractor

0 commit comments

Comments
 (0)