Skip to content

Commit d4c5bd2

Browse files
committed
added datatypes to SimpleNode for each extractor to know type of data returned from parser
1 parent 72f6436 commit d4c5bd2

File tree

6 files changed

+49
-101
lines changed

6 files changed

+49
-101
lines changed

core/src/main/scala/org/dbpedia/extraction/mappings/WikidataFactsExtractor.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ class WikidataFactsExtractor(
5151
{
5252

5353
//process only fact nodes, i.e. triples that are neither label nor sameAs properties
54-
if(property != "http://www.w3.org/2000/01/rdf-schema#label" && property != "http://www.w3.org/2002/07/owl#sameAs" ){
54+
if(node.NodeType == SimpleNode.Facts){
5555

5656
val valueFacts = node.getValueTriples(property)
5757
for( fact <- valueFacts.keys)
@@ -70,7 +70,7 @@ class WikidataFactsExtractor(
7070
for (property <- node.getUriTriples.keys)
7171
{
7272
//process only fact nodes, i.e. triples that are neither label nor sameAs properties
73-
if(property != "http://www.w3.org/2000/01/rdf-schema#label" && property != "http://www.w3.org/2002/07/owl#sameAs" ){
73+
if(node.NodeType == SimpleNode.Facts){
7474

7575
//URI facts are stored as uriTriples, so SimpleNode.getUriTriples is used, which yields a List[String] for the property
7676
val UriFacts = node.getUriTriples(property)

core/src/main/scala/org/dbpedia/extraction/mappings/WikidataLLExtractor.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ class WikidataLLExtractor(
4040
for (property <- node.getUriTriples.keys)
4141
{
4242
//handle only triples that contain sameAs properties, i.e. those that represent language links
43-
property match {
44-
case "http://www.w3.org/2002/07/owl#sameAs" => {
43+
node.NodeType match {
44+
case SimpleNode.LanguageLinks => {
4545

4646
//make combinations for each language and write Quads in the form :
4747
// fr.dbpedia:New_york owl:sameas en.dbpedia:New_york_City

core/src/main/scala/org/dbpedia/extraction/mappings/WikidataLabelExtractor.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ class WikidataLabelExtractor(
4949
{
5050

5151
//handle only triples that contain label properties (rdfs:label)
52-
property match {
53-
case "http://www.w3.org/2000/01/rdf-schema#label" => {
52+
node.NodeType match {
53+
case SimpleNode.Labels => {
5454

5555
//labels are in the form of valuesTriples so SimpleNode.getValueTriples method is used which returns Map[String,String]
5656
val labelsMap = node.getValueTriples(property)

core/src/main/scala/org/dbpedia/extraction/ontology/io/OntologyReader.scala

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -329,14 +329,6 @@ class OntologyReader
329329
val propertyMap = properties.map( property => (property.name, property) ).toMap
330330
val typeMap = datatypes.map( datatype => (datatype.name, datatype) ).toMap
331331

332-
// //change propertyBuilder to ontologyproperty class
333-
// var equivalentOntologyPropertiesMap = Map[OntologyProperty,Set[OntologyProperty]]()
334-
//
335-
// equivalentPropertiesBuilderMap.foreach({m=>
336-
// equivalentOntologyPropertiesMap += m._1 -> m._2.flatMap(_.build(classMap, typeMap))
337-
// })
338-
339-
//filling ontology class with properties , classes , sameas properties , sameas classes
340332
new Ontology( classes.flatMap(_.build(classMap)).map(c => (c.name, c)).toMap,
341333
properties.flatMap(_.build(classMap, typeMap)).map(p => (p.name, p)).toMap,
342334
datatypes.map(t => (t.name, t)).toMap,

core/src/main/scala/org/dbpedia/extraction/wikiparser/SimpleNode.scala

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,23 @@ package org.dbpedia.extraction.wikiparser
22

33
import org.dbpedia.extraction.sources.WikiPage
44

5+
/* Enum-like constants describing the type of node the parser returns to the extractors,
6+
so that each extractor can handle only the node type intended for it.
7+
todo : implement this using the AST
8+
*/
9+
object SimpleNode {
10+
type NodeType = String
11+
val Facts = "Facts"
12+
val MappedFacts = "MappedFacts"
13+
val LanguageLinks = "LanguageLinks"
14+
val Labels = "Labels"
15+
val NotImportant = "NotImportant"
16+
}
17+
18+
19+
20+
21+
522
/**
623
* A node class that holds string triples in Maps, to work around the fact that the AST cannot describe all triples
724
*
@@ -21,15 +38,30 @@ import org.dbpedia.extraction.sources.WikiPage
2138
* @param children The contents of this page
2239
*/
2340

24-
class SimpleNode (
25-
val uriTriples: collection.mutable.Map[String, List[String]] = collection.mutable.Map.empty,
26-
val valueTriples: collection.mutable.Map[String, collection.mutable.Map[String,String]] = collection.mutable.Map.empty,
27-
children: List[Node] = List.empty
41+
class SimpleNode (
42+
val uriTriples: collection.mutable.Map[String, List[String]],
43+
val valueTriples: collection.mutable.Map[String, collection.mutable.Map[String,String]],
44+
val NodeType : SimpleNode.NodeType,
45+
children: List[Node]
2846
)
2947
extends Node(children, 0)
3048
{
3149
def getUriTriples : collection.mutable.Map[String,List[String]] = uriTriples
3250
def getValueTriples : collection.mutable.Map[String,collection.mutable.Map[String,String]] = valueTriples
3351
def toPlainText : String = getUriTriples.mkString+getValueTriples.mkString
3452
def toWikiText : String = getUriTriples.mkString+getValueTriples.mkString
53+
54+
55+
def this (
56+
uriTriples: collection.mutable.Map[String, List[String]] = collection.mutable.Map.empty,
57+
valueTriples: collection.mutable.Map[String, collection.mutable.Map[String,String]]= collection.mutable.Map.empty,
58+
NodeType : SimpleNode.NodeType = SimpleNode.NotImportant
59+
) = this(
60+
if(uriTriples==null) collection.mutable.Map.empty else uriTriples,
61+
if(valueTriples==null) collection.mutable.Map.empty else valueTriples,
62+
if(NodeType == null) SimpleNode.NotImportant else NodeType,
63+
List.empty
64+
)
3565
}
66+
67+

core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/json/JsonWikiParser.scala

Lines changed: 7 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -37,56 +37,11 @@ class JsonWikiParser {
3737
var nodes = getLanguageLinks(page)
3838
nodes = nodes ::: getLabels(page)
3939
nodes = nodes ::: getFacts(page)
40+
4041
// Return page node
4142
new PageNode(page.title, page.id, page.revision, page.timestamp, page.contributorID, page.contributorName, false, false, nodes)
4243
}
4344

44-
def collectInterLanguageLinks(page: WikiPage) : List[Node] = {
45-
46-
var nodes = List[Node]()
47-
val json = page.source
48-
49-
val parsedText = parseOpt(json)
50-
51-
val jsonObjMap = parsedText match {
52-
case Some(map) => map
53-
case _ => throw new IllegalStateException("Invalid JSON representation!")
54-
}
55-
56-
val interLinks = (jsonObjMap \ "links") match {
57-
case JObject(links) => links
58-
case _ => List()
59-
}
60-
61-
val interLinksMap = collection.mutable.Map[String, String]()
62-
63-
interLinks.foreach { interLink : JField =>
64-
interLink.name match {
65-
case WikiLanguageRegex(lang) => interLinksMap += lang -> interLink.value.extract[String]
66-
case _ =>
67-
}
68-
}
69-
70-
if (! interLinksMap.contains("en")) return nodes
71-
72-
val sourceTitle = WikiTitle.parse(interLinksMap.get("en").get, Language.English)
73-
// Do not generate a link to the default language itself
74-
interLinksMap -= "en"
75-
76-
interLinksMap.foreach {
77-
case (key, value) =>
78-
Language.map.get(key) match {
79-
case Some(lang) =>
80-
val destinationTitle = WikiTitle.parse(key + ":" + value, lang)
81-
nodes ::= WikidataInterWikiLinkNode(sourceTitle, destinationTitle)
82-
case _ =>
83-
}
84-
case _ =>
85-
}
86-
87-
nodes
88-
}
89-
9045

9146
/**
9247
* Main functionality is parsing the WikiData JSON page and extracting language-link-related triples in the form
@@ -140,15 +95,13 @@ class JsonWikiParser {
14095

14196
interLinksMap += "http://www.w3.org/2002/07/owl#sameAs" -> values
14297

143-
nodes::= new SimpleNode(interLinksMap)
98+
nodes::= new SimpleNode(interLinksMap,null,SimpleNode.LanguageLinks)
14499

145100
nodes
146101
}
147102

148-
149103
/**
150104
* Main functionality is parsing the WikiData JSON page and extracting labels in different languages in the form
151-
*
152105
* <http://www.w3.org/2000/01/rdf-schema#label> "New York City"@en
153106
* "New York "@fr
154107
* "New York"@co
@@ -182,10 +135,6 @@ class JsonWikiParser {
182135
// "fr": "New York",
183136
// "it": "New York",
184137
// "pl": "Nowy Jork",
185-
// "de": "New York",
186-
// "nl": "New York",
187-
// "be-tarask": "Нью-Ёрк",
188-
// "nan": "New York Chhī"
189138
// }
190139
// Json sample : http://pastebin.com/zygpzhJK
191140

@@ -206,10 +155,9 @@ class JsonWikiParser {
206155

207156
}
208157

209-
210158
labelsTriples += "http://www.w3.org/2000/01/rdf-schema#label" -> labelsMap
211159

212-
nodes::= new SimpleNode(collection.mutable.Map.empty,labelsTriples)
160+
nodes::= new SimpleNode(collection.mutable.Map.empty,labelsTriples,SimpleNode.Labels)
213161

214162
nodes
215163
}
@@ -221,15 +169,14 @@ class JsonWikiParser {
221169
* <http://www.w3.org/2000/01/rdf-schema#label> "New York City"@en
222170
* "New York "@fr
223171
* "New York"@co
224-
*@param page
172+
* @param page
225173
* @return SimpleObject that contains no UriTriples and it's valueTriples are filled with different labels on the form
226174
* Labelproperty ->
227175
* lang -> label
228176
*
229177
* <http://www.w3.org/2000/01/rdf-schema#label> ->
230178
* "en" -> "New York City"
231-
* "fr" -> "New York"
232-
* "co" -> "New York"
179+
* "fr" -> "New York" "co" -> "New York"
233180
*/
234181
def getFacts(page: WikiPage) : List[Node] = {
235182

@@ -245,29 +192,9 @@ class JsonWikiParser {
245192

246193

247194
/** get all nodes under json key "claims" which will be in the form
248-
[
249-
{
250-
"m":[
251-
"value",
252-
30,
253-
"wikibase-entityid",
254-
{
255-
"entity-type":"item",
256-
"numeric-id":49
257-
}
258-
],
259-
"q":[
260-
261-
],
262-
"g":"q30$F21480EF-73A9-44B7-922A-CEE9DE3FA3AC",
263-
"rank":1,
264-
"refs":[
265-
]
266-
}]
267195
*Json sample : http://pastebin.com/9H6s2Nid
268196
*/
269197

270-
271198
/** scenario is as following :
272199
* 1- check that m has "value" not some value or no value
273200
* 2- check that it's "rank":1
@@ -279,16 +206,12 @@ class JsonWikiParser {
279206
* e- wikibase-entityid : get entity id /numeric-id and add "http://wikipeida.dbpedia.org/resource/Q" to it
280207
*
281208
* 4- depending on the output type decide to add it to the URITriples or ValuesTriples
282-
*
283209
*/
284210

285-
//println("inside the parser")
286-
287211
var valueTriples = collection.mutable.Map[String, collection.mutable.Map[String,String]]()
288212
var URITriples = collection.mutable.Map[String, List[String]]()
289213

290214

291-
292215
//get only the claims that are values and have rank == 1, as a List[JObject]
293216

294217
val claims = for {
@@ -351,7 +274,8 @@ class JsonWikiParser {
351274

352275
}
353276

354-
nodes::= new SimpleNode(URITriples,valueTriples)
277+
//nodes::= new SimpleNode()
278+
nodes::= new SimpleNode(URITriples,valueTriples,SimpleNode.Facts)
355279
nodes
356280
}
357281

0 commit comments

Comments
 (0)