Skip to content

Commit 7227a67

Browse files
committed
Merge pull request dbpedia#155 from hadyelsahar/wikidata
Wikidata integration + Refactoring Core to accept new formats
2 parents bf400f9 + cd0cb6d commit 7227a67

File tree

66 files changed

+1520
-1062
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+1520
-1062
lines changed

core/src/main/scala/org/dbpedia/extraction/destinations/DBpediaDatasets.scala

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,5 +63,36 @@ object DBpediaDatasets
6363
val PageLinks = new Dataset("page_links")
6464
val DisambiguationLinks = new Dataset("disambiguations")
6565
val Homepages = new Dataset("homepages")
66+
67+
68+
69+
/**
70+
* Wikidata output datasets
71+
*/
72+
73+
//for the dummy wikidata Extractor skeleton file
74+
val Wikidata = new Dataset("wikidata")
75+
76+
//language links dump in the form of
77+
//<http://L1.dbpedia.org/resource/X2> <http://www.w3.org/2002/07/owl#sameAs> <http://L2.dbpedia.org/resource/X2> .
78+
val WikidataLL = new Dataset("wikidata-ll")
79+
80+
//Multilingual label triples
81+
//<http://wikidata.dbpedia.org/resource/Q549> <http://www.w3.org/2000/01/rdf-schema#label> "Bojovnica pestrá"@sk .
82+
val WikidataLabels = new Dataset("wikidata-labels")
83+
84+
//mappings between Wikidata entities inside DBpedia and DBpedia entities using the owl:sameAs property
85+
//<http://wikidata.dbpedia.org/resource/Q1934> <http://www.w3.org/2002/07/owl#sameAs> <http://ar.dbpedia.org/resource/سيدني_غوفو> .
86+
val WikidataSameAs = new Dataset("wikidata-sameas")
87+
88+
//Mapping between Wikidata entity URIs and their equivalent URIs used in DBpedia
89+
//<http://wikidata.dbpedia.org/resource/Q18> <http://www.w3.org/2002/07/owl#sameAs> <http://wikidata.org/entity/Q18> .
90+
val WikidataNameSpaceSameAs = new Dataset("wikidata-namespace-sameas")
91+
92+
// wikidata facts triples — NOTE(review): this Dataset reuses the name "wikidata", which collides with the Wikidata dataset declared above (two datasets would write to the same output file); confirm whether "wikidata-facts" was intended
93+
val WikidataFacts = new Dataset("wikidata")
94+
//wikidata facts triples with mapped properties to DBpedia ones
95+
val WikidataMappedFacts = new Dataset("wikidata-mapped")
96+
6697

6798
}

core/src/main/scala/org/dbpedia/extraction/destinations/Quad.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class Quad(
3838
extends Ordered[Quad]
3939
with Equals
4040
{
41+
//updated to allow adding Wikidata string properties with an unknown language
4142
def this(
4243
language: Language,
4344
dataset: Dataset,
@@ -47,7 +48,7 @@ with Equals
4748
context: String,
4849
datatype: Datatype
4950
) = this(
50-
language.isoCode,
51+
if (language == null) null else language.isoCode,
5152
dataset.name,
5253
subject,
5354
predicate,
@@ -74,6 +75,7 @@ with Equals
7475
findType(datatype, predicate.range)
7576
)
7677

78+
7779
// Validate input
7880
if (subject == null) throw new NullPointerException("subject")
7981
if (predicate == null) throw new NullPointerException("predicate")
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.destinations.{DBpediaDatasets, Quad}
import org.dbpedia.extraction.wikiparser._
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.util.Language
import scala.collection.mutable.ArrayBuffer
import scala.language.reflectiveCalls

/**
 * Extracts links between a DBpedia resource and the Wikipedia article it was derived from.
 */
class ArticlePageExtractor(
  context : {
    def ontology : Ontology
    def language : Language
  }
)
extends PageNodeExtractor
{
  // We used foaf:page here, but foaf:isPrimaryTopicOf is probably better.
  private val isPrimaryTopicOf = context.ontology.properties("foaf:isPrimaryTopicOf")
  private val primaryTopic = context.ontology.properties("foaf:primaryTopic")
  private val dcLanguage = context.ontology.properties("dc:language")
  private val typeOntProperty = context.ontology.properties("rdf:type")
  private val foafDocument = context.ontology.classes("foaf:Document")

  override val datasets = Set(DBpediaDatasets.LinksToWikipediaArticle)

  override def extract(page : PageNode, subjectUri : String, pageContext : PageContext): Seq[Quad] =
  {
    // Only pages in the main namespace are linked to their Wikipedia article.
    if (page.title.namespace != Namespace.Main) Seq.empty
    else {
      val articleIri = page.title.pageIri
      val source = page.sourceUri

      Seq(
        new Quad(context.language, DBpediaDatasets.LinksToWikipediaArticle, subjectUri, isPrimaryTopicOf, articleIri, source),
        new Quad(context.language, DBpediaDatasets.LinksToWikipediaArticle, articleIri, primaryTopic, subjectUri, source),
        new Quad(context.language, DBpediaDatasets.LinksToWikipediaArticle, articleIri, dcLanguage, context.language.wikiCode, source),
        new Quad(context.language, DBpediaDatasets.LinksToWikipediaArticle, articleIri, typeOntProperty, foafDocument.uri, source)
      )
    }
  }
}

core/src/main/scala/org/dbpedia/extraction/mappings/CategoryLabelExtractor.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,22 @@ import org.dbpedia.extraction.destinations.{DBpediaDatasets,Quad,QuadBuilder}
66
import org.dbpedia.extraction.ontology.Ontology
77
import org.dbpedia.extraction.util.Language
88
import scala.language.reflectiveCalls
9+
import org.dbpedia.extraction.sources.WikiPage
910

1011
/**
1112
* Extracts labels for Categories.
1213
*/
1314
class CategoryLabelExtractor( context : {
1415
def ontology : Ontology
15-
def language : Language } ) extends PageNodeExtractor
16+
def language : Language } ) extends WikiPageExtractor
1617
{
1718
private val labelProperty = context.ontology.properties("rdfs:label")
1819

1920
private val quad = QuadBuilder(context.language, DBpediaDatasets.CategoryLabels, labelProperty, new Datatype("xsd:string")) _
2021

2122
override val datasets = Set(DBpediaDatasets.CategoryLabels)
2223

23-
override def extract(node : PageNode, subjectUri : String, pageContext : PageContext) : Seq[Quad] =
24+
override def extract(node : WikiPage, subjectUri : String, pageContext : PageContext) : Seq[Quad] =
2425
{
2526
if(node.title.namespace != Namespace.Category) Seq.empty
2627
else Seq(quad(subjectUri, node.title.decoded, node.sourceUri))

core/src/main/scala/org/dbpedia/extraction/mappings/CompositeExtractor.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
package org.dbpedia.extraction.mappings
22

3-
import org.dbpedia.extraction.destinations.{Dataset,Quad}
3+
import org.dbpedia.extraction.destinations.Dataset
4+
import org.dbpedia.extraction.destinations.Quad
45

56
/**
67
* TODO: generic type may not be optimal.
78
*/
8-
class CompositeExtractor[N](mappings: Extractor[N] *) extends Extractor[N]
9+
class CompositeExtractor[N](mappings: Extractor[N]*) extends Extractor[N]
910
{
1011
override val datasets: Set[Dataset] = mappings.flatMap(_.datasets).toSet
1112

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
package org.dbpedia.extraction.mappings
import org.dbpedia.extraction.wikiparser.JsonNode

/**
 * Combines several JsonNode extractors into a single JsonNodeExtractor.
 */
class CompositeJsonNodeExtractor(extractors: Extractor[JsonNode]*)
extends CompositeExtractor[JsonNode](extractors: _*)
with JsonNodeExtractor

/**
 * Creates new extractors.
 */
object CompositeJsonNodeExtractor
{
  /**
   * Instantiates the given extractor classes and combines them into one extractor.
   *
   * TODO: using reflection here loses compile-time type safety.
   *
   * @param classes List of extractor classes to be instantiated
   * @param context Any type of object that implements the required parameter methods for the extractors
   */
  def load(classes: Seq[Class[_ <: JsonNodeExtractor]], context: AnyRef): JsonNodeExtractor =
  {
    // Every extractor class is expected to expose a single-argument constructor taking the context.
    val instances = classes.map(_.getConstructor(classOf[AnyRef]).newInstance(context))
    new CompositeJsonNodeExtractor(instances: _*)
  }
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.destinations.Dataset
import org.dbpedia.extraction.destinations.Quad
import org.dbpedia.extraction.sources.WikiPage
import scala.collection.mutable.ArrayBuffer

/**
 * Dispatches a WikiPage to extractors of different input types: extractors that
 * consume the raw WikiPage directly, PageNode extractors (wrapped in a
 * WikiParseExtractor), and JsonNode extractors (wrapped in a JsonParseExtractor).
 *
 * TODO: generic type may not be optimal.
 */
class CompositeParseExtractor(extractors: Extractor[_]*)
extends WikiPageExtractor
{
  override val datasets: Set[Dataset] = extractors.flatMap(_.datasets).toSet

  // Partition the given extractors by the input type they accept and wrap each
  // group in its matching parse extractor. This classification does not depend
  // on the page being extracted, so it is done once at construction time
  // instead of on every extract() call (the original rebuilt everything per page).
  private val finalExtractors: Seq[Extractor[WikiPage]] = {
    val wikiPageExtractors = new ArrayBuffer[Extractor[WikiPage]]()
    val pageNodeExtractors = new ArrayBuffer[PageNodeExtractor]()
    val jsonNodeExtractors = new ArrayBuffer[JsonNodeExtractor]()
    //to do: add json extractors

    extractors foreach {
      case e: PageNodeExtractor => pageNodeExtractors += e // takes a parsed PageNode: wrap in WikiParseExtractor below
      case e: JsonNodeExtractor => jsonNodeExtractors += e // takes a parsed JsonNode: wrap in JsonParseExtractor below
      case e: WikiPageExtractor => wikiPageExtractors += e // already accepts a WikiPage as input
      case _ => // unknown extractor type: ignored, as before
    }

    val result = new ArrayBuffer[Extractor[WikiPage]]()

    if (!wikiPageExtractors.isEmpty)
      result += new CompositeWikiPageExtractor(wikiPageExtractors: _*)

    //create and load WikiParseExtractor here
    if (!pageNodeExtractors.isEmpty)
      result += new WikiParseExtractor(new CompositePageNodeExtractor(pageNodeExtractors: _*))

    //create and load JsonParseExtractor here
    if (!jsonNodeExtractors.isEmpty)
      result += new JsonParseExtractor(new CompositeJsonNodeExtractor(jsonNodeExtractors: _*))

    result.toList
  }

  /**
   * Runs all wrapped extractors on the given page and concatenates their quads.
   */
  override def extract(input: WikiPage, subjectUri: String, context: PageContext): Seq[Quad] = {
    if (finalExtractors.isEmpty)
      Seq.empty
    else
      new CompositeExtractor[WikiPage](finalExtractors: _*).extract(input, subjectUri, context)
  }
}

/**
 * Creates new extractors.
 */
object CompositeParseExtractor
{
  /**
   * Creates a new CompositeExtractor loaded with same type of Extractors[T]
   *
   * TODO: using reflection here loses compile-time type safety.
   *
   * @param classes List of extractor classes to be instantiated
   * @param context Any type of object that implements the required parameter methods for the extractors
   */
  def load(classes: Seq[Class[_ <: Extractor[_]]], context: AnyRef): WikiPageExtractor =
  {
    // Every extractor class is expected to expose a single-argument constructor taking the context.
    val extractors = classes.map(_.getConstructor(classOf[AnyRef]).newInstance(context))
    new CompositeParseExtractor(extractors: _*)
  }
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.sources.WikiPage

/**
 * Combines several WikiPage extractors into a single WikiPageExtractor.
 */
class CompositeWikiPageExtractor(extractors: Extractor[WikiPage]*)
extends CompositeExtractor[WikiPage](extractors: _*)
with WikiPageExtractor

core/src/main/scala/org/dbpedia/extraction/mappings/ContributorExtractor.scala

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import org.dbpedia.extraction.wikiparser._
66
import org.dbpedia.extraction.destinations.{DBpediaDatasets, Quad}
77
import java.net.{URLEncoder, URI}
88
import scala.language.reflectiveCalls
9+
import org.dbpedia.extraction.sources.WikiPage
910

1011
/**
1112
* Created by IntelliJ IDEA.
@@ -17,18 +18,18 @@ import scala.language.reflectiveCalls
1718

1819
class ContributorExtractor( context : {
1920
def ontology : Ontology
20-
def language : Language } ) extends PageNodeExtractor
21+
def language : Language } ) extends WikiPageExtractor
2122
{
2223

2324
override val datasets = Set(DBpediaDatasets.RevisionMeta)
2425

25-
override def extract(node : PageNode, subjectUri : String, pageContext : PageContext) : Seq[Quad] =
26+
override def extract(node : WikiPage, subjectUri : String, pageContext : PageContext) : Seq[Quad] =
2627
{
2728
if(node.title.namespace != Namespace.Main) return Seq.empty
2829

2930
if(node.contributorID <= 0) return Seq.empty
3031

31-
val pageURL = "http://" + context.language.wikiCode + ".wikipedia.org/wiki/" + node.root.title.encoded;
32+
val pageURL = "http://" + context.language.wikiCode + ".wikipedia.org/wiki/" + node.title.encoded;
3233

3334
//Required predicates
3435
val contributorPredicate = "http://dbpedia.org/meta/contributor";
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.destinations.Quad
import org.dbpedia.extraction.wikiparser._

/**
 * Extractors are mappings that extract data from a JsonNode.
 * Necessary to get some type safety in CompositeExtractor:
 * Class[_ <: Extractor] can be checked at runtime, but Class[_ <: Mapping[PageNode]] can not.
 */
trait JsonNodeExtractor extends Extractor[JsonNode]

0 commit comments

Comments
 (0)