
Commit 7d7889e

updated DateTimeParser to parse the ISO 8601 datetime format and generate mapped Wikidata Time triples
1 parent d4c5bd2 commit 7d7889e

4 files changed: 172 additions, 48 deletions


core/src/main/scala/org/dbpedia/extraction/dataparser/DateTimeParser.scala

Lines changed: 9 additions & 0 deletions
@@ -41,6 +41,9 @@ class DateTimeParser ( context : {
     private val prefix = if(strict) """\s*""" else """.*?"""
     private val postfix = if(strict) """\s*""" else ".*"
 
+    //wikidata: Catch Dates of format ISO8601 with 11 digits for year : "+00000001931-03-03T00:00:00Z"
+    private val DateRegexISO = """([+-])(\d{11})-(\d{2})-(\d{2})T(\d{2})\:(\d{2})\:(\d{2})(Z|[+-]\d{2}\:\d{2})""".r
+
     // catch dates like: "8 June 07" or "07 June 45"
     private val DateRegex1 = ("""(?iu)""" + prefix + """([0-9]{1,2})\s*("""+monthRegex+""")\s*([0-9]{2})(?!\d)\s*(?!\s)(?!"""+ eraRegex +""").*""" + postfix).r

@@ -225,6 +228,12 @@ class DateTimeParser ( context : {
      */
     private def catchDate(input: String) : Option[Date] =
     {
+        for(DateRegexISO(sign, year, month, day, hour, minute, second, timezone) <- List(input))
+        {
+            // DBpedia dates carry no hour, minute, second or timezone
+            return new Some(new Date(Some(year.toInt), Some(month.toInt), Some(day.toInt), datatype))
+        }
+
         for(DateRegex1(day, month, year) <- List(input))
         {
             // the century (1900 or 2000) depends on the last 2-digit number in the inputstring: >20 -> 1900
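A standalone sketch (not part of the commit) of how the new DateRegexISO is expected to behave on a Wikidata timestamp; the pattern is copied from the hunk above, everything else is illustrative:

object DateRegexISODemo {
  // same pattern as the DateRegexISO added above
  private val DateRegexISO =
    """([+-])(\d{11})-(\d{2})-(\d{2})T(\d{2})\:(\d{2})\:(\d{2})(Z|[+-]\d{2}\:\d{2})""".r

  def main(args: Array[String]): Unit = {
    "+00000001931-03-03T00:00:00Z" match {
      case DateRegexISO(sign, year, month, day, hour, minute, second, timezone) =>
        // hour, minute, second and timezone are captured but dropped, as in catchDate
        println((year.toInt, month.toInt, day.toInt)) // prints (1931,3,3)
      case _ =>
        println("no match")
    }
  }
}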

core/src/main/scala/org/dbpedia/extraction/mappings/WikidataMappedFactsExtractor.scala

Lines changed: 80 additions & 17 deletions
@@ -1,11 +1,13 @@
 package org.dbpedia.extraction.mappings
 
-import org.dbpedia.extraction.ontology.Ontology
+import org.dbpedia.extraction.ontology.{OntologyProperty, Ontology}
 import org.dbpedia.extraction.util.Language
 import org.dbpedia.extraction.destinations.{Quad, DBpediaDatasets}
-import org.dbpedia.extraction.wikiparser.{SimpleNode, PageNode}
+import org.dbpedia.extraction.wikiparser.{SimpleNode, PageNode, TextNode}
 import collection.mutable.ArrayBuffer
 import org.dbpedia.extraction.ontology.io.OntologyReader
+import org.dbpedia.extraction.dataparser.DateTimeParser
+import org.dbpedia.extraction.ontology.datatypes.Datatype
 
 /**
  * Extracts Wikidata claims
@@ -19,6 +21,7 @@ import org.dbpedia.extraction.ontology.io.OntologyReader
 class WikidataMappedFactsExtractor(
   context : {
     def ontology : Ontology
+    def redirects : Redirects // redirects required by DateTimeParser
     def language : Language
   }
 )
@@ -50,37 +53,80 @@ class WikidataMappedFactsExtractor(
       //Generating Quads for ValueTriples
       for (property <- node.getValueTriples.keys)
       {
+        val valueFacts = node.getValueTriples(property)
+        for( fact <- valueFacts.keys)
+        {
         //check for triples that doesn't contain Label or sameas properties only
-        // if(property != "http://www.w3.org/2000/01/rdf-schema#label" && property != "http://www.w3.org/2002/07/owl#sameAs" ){
+          node.NodeType match {
+            case SimpleNode.CoordinatesFacts => {
+              quads += new Quad(context.language, DBpediaDatasets.WikidataFacts, subjectUri, context.ontology.properties(property), fact, page.sourceUri)
+            }
+            case SimpleNode.CommonMediaFacts => {
+              //map the property to the equivalent one
+              //todo: make a helper function for getting the equivalent list of properties
+              //todo: also make a helper function to get properties with their dataTypes
+              //take into consideration that dataTypes of properties should be URIs, not strings
+            }
+            case SimpleNode.StringFacts => {
+              //a lot of parsing has to be done depending on the data-type categories
+            }
+            case SimpleNode.TimeFacts => {
+              //add a new regex to the DateTime parser, parse the time,
+              //and just write the triple; it will get parsed depending on its type
+              getDBpediaSameasProperties(property).foreach { dbProp =>
+                val dateParser = new DateTimeParser(context, dbProp.range.asInstanceOf[Datatype])
+                dateParser.parse(new TextNode(fact, 0)) match {
+                  case Some(date) => quads += new Quad(context.language, DBpediaDatasets.WikidataFacts, subjectUri, dbProp, date.toString, page.sourceUri)
+                  case None =>
+                }
+              }
+            }
+            case _ =>
+          }
+        }
+
+//        if(node.NodeType == SimpleNode.Facts || node.NodeType == SimpleNode.MappedFacts){
+//
 //
 //          val valueFacts = node.getValueTriples(property)
+//
 //          for( fact <- valueFacts.keys)
 //          {
-//            quads += new Quad(Language.apply("en"), DBpediaDatasets.WikidataFacts, subjectUri, property ,fact , page.sourceUri, context.ontology.datatypes("xsd:string"))
+//            //String WikiValues
+//            if(valueFacts(fact)=="")
+//              quads += new Quad(null , DBpediaDatasets.WikidataFacts, subjectUri, property ,fact , page.sourceUri, context.ontology.datatypes("xsd:string"))
+//            //CommonMedia Files WikiValues
+//            else if (valueFacts(fact) == "CommonMediaFile")
+//              quads += new Quad(context.language, DBpediaDatasets.WikidataFacts, subjectUri, property ,fact , page.sourceUri, null)
+//            else if (valueFacts(fact) == "xsd:date")
+//              quads += new Quad(context.language, DBpediaDatasets.WikidataFacts, subjectUri, property ,fact , page.sourceUri, context.ontology.datatypes(valueFacts(fact)))
 //          }
 //        }
       }
 
-      //Generating Quads for Uri
+      //Generating Quads for Uri and replace the Wikidata property with the DBpedia mapped one
       for (property <- node.getUriTriples.keys)
       {
-        //check for triples that doesn't contain Label or sameas properties only
-        if(property != "http://www.w3.org/2000/01/rdf-schema#label" && property != "http://www.w3.org/2002/07/owl#sameAs" ){
+        //check for triples that don't contain Label or sameAs properties only
+        if(node.NodeType == SimpleNode.Facts || node.NodeType == SimpleNode.MappedFacts){
 
           //labels are in the form of valuesTriples so SimpleNode.getValueTriples method is used which returns Map[String,String]
           val UriFacts = node.getUriTriples(property)
           for( fact <- UriFacts)
           {
-            //print(context.ontology.equivalentPropertiesMap.size)
-
-            context.ontology.equivalentPropertiesMap.foreach({map =>
-              if (map._1.toString.matches(property))
-              {
-                map._2.foreach{mappedProp =>
-                  quads += new Quad(Language.apply("en"), DBpediaDatasets.WikidataFacts, subjectUri, mappedProp.toString, fact, page.sourceUri, null)
-                }
-              }
-            })
+            getDBpediaSameasProperties(property).foreach({mappedProp =>
+              quads += new Quad(Language.apply("en"), DBpediaDatasets.WikidataFacts, subjectUri, mappedProp.toString, fact, page.sourceUri, null)
+            })
           }
         }
       }
@@ -93,6 +139,23 @@ class WikidataMappedFactsExtractor(
 
     quads
   }
+
+
+  def getDBpediaSameasProperties(property: String) : Set[OntologyProperty] =
+  {
+    var properties = Set[OntologyProperty]()
+    context.ontology.equivalentPropertiesMap.foreach({map =>
+      if (map._1.toString.matches(property))
+      {
+        map._2.foreach{mappedProp =>
+          properties += mappedProp
+        }
+      }
+    })
+
+    properties
+  }
+
 }
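A minimal, standalone sketch of the lookup getDBpediaSameasProperties performs, with plain strings standing in for OntologyProperty and a hypothetical equivalentPropertiesMap entry (P569 -> dbo:birthDate is sample data only):

object SameasLookupDemo {
  // hypothetical sample data: Wikidata property URI -> set of equivalent DBpedia properties
  val equivalentPropertiesMap: Map[String, Set[String]] = Map(
    "http://www.wikidata.org/entity/P569" -> Set("http://dbpedia.org/ontology/birthDate")
  )

  def getDBpediaSameasProperties(property: String): Set[String] = {
    var properties = Set[String]()
    equivalentPropertiesMap.foreach { map =>
      // like the extractor, the map key is matched against the property string as a regex
      if (map._1.matches(property))
        map._2.foreach { mappedProp => properties += mappedProp }
    }
    properties
  }

  def main(args: Array[String]): Unit = {
    println(getDBpediaSameasProperties("http://www.wikidata.org/entity/P569"))
    // Set(http://dbpedia.org/ontology/birthDate)
  }
}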

core/src/main/scala/org/dbpedia/extraction/wikiparser/SimpleNode.scala

Lines changed: 12 additions & 2 deletions
@@ -8,11 +8,21 @@ import org.dbpedia.extraction.sources.WikiPage
   */
 object SimpleNode {
   type NodeType = String
-  val Facts = "Facts"
-  val MappedFacts = "MappedFacts"
+
   val LanguageLinks = "LanguageLinks"
   val Labels = "Labels"
+
+  val Facts = "Facts" //for the normal Wikidata dump extractor; data weren't adapted for the MappedDBpedia dump
+
+  val MappedFacts = "MappedFacts"
+
+  val CoordinatesFacts = "MappedCoordinates"
+  val TimeFacts = "TimeFacts"
+  val StringFacts = "StringFacts"
+  val CommonMediaFacts = "CommonMediaFacts"
+
   val NotImportant = "NotImportant"
+
 }
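As a rough, self-contained sketch of how an extractor can dispatch on these NodeType strings (the constants mirror SimpleNode; the branch descriptions are illustrative only, not framework code):

object NodeTypeRoutingDemo {
  // local stand-ins for the constants defined in SimpleNode
  val CoordinatesFacts = "MappedCoordinates"
  val TimeFacts = "TimeFacts"
  val StringFacts = "StringFacts"
  val CommonMediaFacts = "CommonMediaFacts"

  def describe(nodeType: String): String = nodeType match {
    case CoordinatesFacts => "emit geo:lat / geo:long / georss:point quads"
    case TimeFacts        => "parse the value with DateTimeParser and emit a dated quad"
    case StringFacts      => "emit the literal value"
    case CommonMediaFacts => "emit the Commons file URL"
    case _                => "ignore"
  }

  def main(args: Array[String]): Unit = {
    println(describe(TimeFacts)) // parse the value with DateTimeParser and emit a dated quad
  }
}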

core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/json/JsonWikiParser.scala

Lines changed: 71 additions & 29 deletions
@@ -164,19 +164,29 @@ class JsonWikiParser {
 
 
   /**
-   * Main functionality is parsing the WikiData Json page and extract facts in different languages the form
+   * Main functionality is parsing the WikiData Json page and extracting facts with their Datatype
    *
-   * <http://www.w3.org/2000/01/rdf-schema#label> "New York City"@en
-   *                                              "New York "@fr
-   *                                              "New York"@co
-   * @param page
-   * @return SimpleObject that contains no UriTriples and it's valueTriples are filled with different labels on the form
-   *         Labelproperty ->
-   *           lang -> label
+   * time triple:        <http://wikidata.org/entity/P227> "+00000001931-03-03T00:00:00Z"^^xsd:date
+   * URI triple:         <http://wikidata.org/entity/P5> http://wikidata.org/entity/Q22552>
+   * String triple:      <http://wikidata.org/entity/P2> "anyString"
+   * coordinates triple: <http://wikidata.org/entity/P225> "122 2215"
+   *                     <http://www.w3.org/2003/01/geo/wgs84_pos#lat> "31.2167"
+   *                     <http://www.w3.org/2003/01/geo/wgs84_pos#geometry> "POINT(31.2167 30.0333)"^^<http://www.openlinksw.com/schemas/virtrdf#Geometry>
+   *                     <http://www.w3.org/2003/01/geo/wgs84_pos#long> "30.00333"
    *
-   * <http://www.w3.org/2000/01/rdf-schema#label> ->
-   *   "en" -> "New York City"
-   *   "fr" -> "New York"     "co" -> "New York"
+   * the scenario is as follows:
+   * 1- check that m (the claim) has "value", not somevalue or novalue
+   * 2- check that its "rank" is 1
+   * 3- check the third item in the claim
+   *    a- string > write it as it is
+   *    b- time > take the time property of the 4th item, "time":"+00000001931-03-03T00:00:00Z"; its type would be xsd:datetime
+   *    c- globe coordinate > - for unmapped facts change them to "lat long" without a datatype
+   *                          - for mapped facts create three triples (lat, long, gpoint) and return them as value types
+   *
+   *    d- common media > replace spaces with _ and add "http://commons.wikimedia.org/wiki/File:" to the beginning; its datatype is null
+   *    e- wikibase-entityid > get the entity id (numeric-id) and add "http://wikipeida.dbpedia.org/resource/Q" to it
+   *
+   * 4- depending on the output type decide whether to add it to the URITriples, ValuesTriples, MappedValueTriples or MappedURItriples
    */
   def getFacts(page: WikiPage) : List[Node] = {
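The string-shaped steps 3d and 3e above can be sketched in isolation; the helper names below are hypothetical, and the URL prefixes are taken verbatim from the scenario:

object ClaimValueDemo {
  // step 3d: Commons media file name -> Commons file URL
  def commonsFileUrl(fileName: String): String =
    "http://commons.wikimedia.org/wiki/File:" + fileName.replace(" ", "_")

  // step 3e: wikibase-entityid numeric-id -> resource URI (prefix as written in the scenario)
  def entityUri(numericId: Int): String =
    "http://wikipeida.dbpedia.org/resource/Q" + numericId

  def main(args: Array[String]): Unit = {
    println(commonsFileUrl("Some example file.jpg")) // http://commons.wikimedia.org/wiki/File:Some_example_file.jpg
    println(entityUri(22552))                        // http://wikipeida.dbpedia.org/resource/Q22552
  }
}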

@@ -194,24 +204,9 @@ class JsonWikiParser {
     /** get all nodes under json key "claims" which will be in the form
       * Json sample : http://pastebin.com/9H6s2Nid
       */
-
-    /** scenario is as following :
-      * 1- check that m has "value" not some value or no value
-      * 2- check that it's "rank":1
-      * 3- check for the third item in the claim
-      *    a- string > write as it is
-      *    b- time > take time property of the 4th item "time":"+00000001931-03-03T00:00:00Z" and it's type would be xsd:datetime
-      *    c- globe coordinate > change them to DBpedia point(lat long)
-      *    d- common media > relpace spaces with _ and add "http://commons.wikimedia.org/wiki/File:" to begining of it and it's datatype is null
-      *    e- wikibase-entityid : get entity id /numeric-id and add "http://wikipeida.dbpedia.org/resource/Q" to it
-      *
-      * 4- depending on the output type decide to add it to the URITriples or ValuesTriples
-      */
-
     var valueTriples = collection.mutable.Map[String, collection.mutable.Map[String,String]]()
     var URITriples = collection.mutable.Map[String, List[String]]()
 
-
     //get claims only whose are values and has rank ==1 in List[JObject]
 
     val claims = for {
@@ -226,7 +221,9 @@
     for (claim <- claims)
     {
       val values = collection.mutable.Map[String,String]()
+      val mappedValues = collection.mutable.Map[String,String]()
       var Uris = List[String]()
+      val mappedUris = List[String]()
       val propID = (claim \ "m")(1).extract[Int]
       val property = "http://www.wikidata.org/entity/P"+propID
 
@@ -246,11 +243,28 @@
             val value = "http://commons.wikimedia.org/wiki/File:" + (claim \ "m")(3).extract[String].replace(" ","_") // "" empty datatype means no datatype for URIs and URLs
             values += value -> "CommonMediaFile"
             valueTriples += property -> values
+
+            var commonMediaValues = collection.mutable.Map[String,String]()
+            var commonMediaValueTriples = collection.mutable.Map[String, collection.mutable.Map[String,String]]()
+
+            commonMediaValues += (claim \ "m")(3).extract[String] -> ""
+            commonMediaValueTriples += property -> commonMediaValues
+
+            nodes ::= new SimpleNode(null, commonMediaValueTriples, SimpleNode.CommonMediaFacts)
           }
           else
           {
             values += (claim \ "m")(3).extract[String] -> ""
             valueTriples += property -> values
+
+            var stringValues = collection.mutable.Map[String,String]()
+            var stringValueTriples = collection.mutable.Map[String, collection.mutable.Map[String,String]]()
+
+            stringValues += (claim \ "m")(3).extract[String] -> ""
+            stringValueTriples += property -> stringValues
+
+            nodes ::= new SimpleNode(null, stringValueTriples, SimpleNode.StringFacts)
           }
         }

@@ -259,14 +273,40 @@
           {
             values += ((claim \ "m")(3)\ "time").extract[String] -> "xsd:date"
             valueTriples += property -> values
+
+            var timeValues = collection.mutable.Map[String,String]()
+            var timeValueTriples = collection.mutable.Map[String, collection.mutable.Map[String,String]]()
+
+            timeValues += ((claim \ "m")(3)\ "time").extract[String] -> "xsd:date"
+            timeValueTriples += property -> timeValues
+
+            nodes ::= new SimpleNode(null, timeValueTriples, SimpleNode.TimeFacts)
           }
           case "globecoordinate" =>
           {
             val lat = ((claim \ "m")(3)\ "latitude").extract[Int]
             val long = ((claim \ "m")(3)\ "longitude").extract[Int]
 
+            //for the wikidata parser
             values += lat +" "+long -> ""
             valueTriples += property -> values
+
+            //for the mapped wikidata parser
+            //todo : add properties values in the wikidata mapped extractors
+            var coordinatesValueTriples = collection.mutable.Map[String, collection.mutable.Map[String,String]]()
+
+            val latValue = collection.mutable.Map[String,String](lat.toString -> "")
+            coordinatesValueTriples += "geo:lat" -> latValue
+
+            val longValue = collection.mutable.Map[String,String](long.toString -> "")
+            coordinatesValueTriples += "geo:long" -> longValue
+
+            val pointValue = collection.mutable.Map[String,String]("POINT("+ lat +" "+long+")" -> "http://www.openlinksw.com/schemas/virtrdf#Geometry")
+            coordinatesValueTriples += "georss:point" -> pointValue
+
+            nodes ::= new SimpleNode(null, coordinatesValueTriples, SimpleNode.CoordinatesFacts)
           }
           case _=>
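A standalone sketch (with hypothetical coordinate values) of the three value triples a single globe-coordinate claim is expanded into above:

object CoordinateTriplesDemo {
  def main(args: Array[String]): Unit = {
    val lat = 31   // hypothetical latitude (extracted as Int, as in the parser above)
    val long = 30  // hypothetical longitude

    val coordinatesValueTriples = collection.mutable.Map[String, collection.mutable.Map[String, String]]()
    coordinatesValueTriples += "geo:lat"  -> collection.mutable.Map(lat.toString -> "")
    coordinatesValueTriples += "geo:long" -> collection.mutable.Map(long.toString -> "")
    coordinatesValueTriples += "georss:point" -> collection.mutable.Map(
      "POINT(" + lat + " " + long + ")" -> "http://www.openlinksw.com/schemas/virtrdf#Geometry")

    coordinatesValueTriples.foreach(println) // prints the three property -> value maps (order may vary)
  }
}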

@@ -280,7 +320,10 @@
         }
 
 
-  //helper function for checking the type of property , used in getFacts method
+
+
+
+  //helper function for checking the type of property, used in the getFacts method
   def isCommonMediaFiles(prop:String) :Boolean = {
     val commonMediaFilesProperties = List("P10","P109","P117","P14","P15","P154","P158","P18","P181","P207","P242","P367","P368","P41","P443","P491","P51","P623","P692","P94")
     commonMediaFilesProperties.contains(prop)
@@ -289,5 +332,4 @@
 
 
 
-}
-
+}
