Merged

Changes from all commits (40 commits)
908cfce
draft core refactoring for wikidata
hadyelsahar Dec 11, 2013
3124d7d
fixed type conflicts
hadyelsahar Dec 11, 2013
e668fff
added wikiPageExtractor trait to match extractors that accepts WikiPage
hadyelsahar Dec 11, 2013
16ee866
various changes
jimkont Dec 12, 2013
7ce3d27
Merge pull request #3 from jimkont/wikidata
hadyelsahar Dec 12, 2013
1b7f815
added wikipage extractor
hadyelsahar Dec 12, 2013
2811e1f
various changes by dimitris
hadyelsahar Dec 12, 2013
fed7493
changed updated ArticlePageExtractor in prop file
hadyelsahar Dec 18, 2013
dd6f3ca
fixed issues in Sever Module to deal with Parser -> Option
hadyelsahar Dec 18, 2013
b0671ab
fixed bugs in server module due to chainging Extractor[PageNode] to E…
hadyelsahar Dec 19, 2013
9d96186
import scala.language.reflectiveCalls
jimkont Jan 17, 2014
e9976d4
adapt for the new parser output
jimkont Jan 17, 2014
f5eb297
remove obsolete classes
jimkont Jan 17, 2014
909a741
throw exception on parsing error
jimkont Jan 20, 2014
edab2b6
merge with latest master
jimkont Jan 20, 2014
3c73aa8
skipped HomepageExtractor test on merge
jimkont Jan 20, 2014
d5b578f
comment unneeded constants (todo: remove all & reuse Namespace class)
jimkont Jan 20, 2014
6209a69
Adapt Live core to work with the refactored framework
jimkont Jan 20, 2014
563287b
Merge pull request #4 from jimkont/wikidata
hadyelsahar Jan 23, 2014
8984a53
rename Extractor with a more representative name
jimkont Jan 24, 2014
0724dbe
fix HomepageExtractorTest (regression)
jimkont Jan 24, 2014
848b195
Use revision URI as context in WikiPageExtractors
jimkont Jan 24, 2014
28ca4fe
Naming coherence (review)
jimkont Jan 24, 2014
e50eefe
whitespace consistency
jimkont Jan 24, 2014
115159e
fix comment
jimkont Jan 24, 2014
460f3d0
adjust extraction properties with the renamed ProvenanceExtractor
jimkont Jan 24, 2014
8f16d5f
Merge pull request #5 from jimkont/wikidata
hadyelsahar Jan 24, 2014
e535730
added comments to Datasets Destinations
hadyelsahar Jan 24, 2014
867d0aa
remove extra space after Wikipage
hadyelsahar Jan 24, 2014
6cf62af
remove extra space
hadyelsahar Jan 24, 2014
50da5c0
remove extra space after JsonNode
hadyelsahar Jan 24, 2014
eda16c1
removing extra spaces
hadyelsahar Jan 24, 2014
31bf421
removing extra spaces before WikiPage
hadyelsahar Jan 24, 2014
52b2f09
removal of extraspace
hadyelsahar Jan 24, 2014
8bdd1ef
deleted wikidataExtractor template file
hadyelsahar Feb 23, 2014
a67749b
aaa
hadyelsahar Feb 23, 2014
bdb4101
remove wikidataExtractor template
hadyelsahar Feb 23, 2014
54c6b7d
add owl:sameas as ontologyproperty instead of string
hadyelsahar Feb 23, 2014
bc2b269
added edited namespace of wikdata entities inside dbpedia to be wikid…
hadyelsahar Feb 23, 2014
6b661cd
page.wikiPage.sourceUri instead of page.wikiPage.title.pageIri
hadyelsahar Feb 23, 2014
@@ -63,5 +63,36 @@ object DBpediaDatasets
val PageLinks = new Dataset("page_links")
val DisambiguationLinks = new Dataset("disambiguations")
val Homepages = new Dataset("homepages")

/**
* Wikidata output datasets
*/

// for the dummy Wikidata extractor skeleton file
val Wikidata = new Dataset("wikidata")

// language links dump in the form of:
//<http://L1.dbpedia.org/resource/X2> <http://www.w3.org/2002/07/owl#sameAs> <http://L2.dbpedia.org/resource/X2> .
val WikidataLL = new Dataset("wikidata-ll")

[Review comment, Contributor] Can you add a comment to describe what this dataset is for?

// multilingual label triples
//<http://wikidata.dbpedia.org/resource/Q549> <http://www.w3.org/2000/01/rdf-schema#label> "Bojovnica pestrá"@sk .
val WikidataLabels = new Dataset("wikidata-labels")

//mappings between Wikidata entities inside DBpedia and DBpedia entities using the owl:sameAs property
//<http://wikidata.dbpedia.org/resource/Q1934> <http://www.w3.org/2002/07/owl#sameAs> <http://ar.dbpedia.org/resource/سيدني_غوفو> .
val WikidataSameAs = new Dataset("wikidata-sameas")

//mapping between Wikidata entity URIs and their equivalent URIs used in DBpedia
//<http://wikidata.dbpedia.org/resource/Q18> <http://www.w3.org/2002/07/owl#sameAs> <http://wikidata.org/entity/Q18> .
val WikidataNameSpaceSameAs = new Dataset("wikidata-namespace-sameas")

// Wikidata fact triples (note: reuses the name "wikidata", same as the dummy dataset above)
val WikidataFacts = new Dataset("wikidata")
// Wikidata fact triples with properties mapped to DBpedia ones
val WikidataMappedFacts = new Dataset("wikidata-mapped")

}
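
As a side note, a minimal sketch of how an extractor typically emits into one of these datasets; the concrete URIs and the choice of Language.English are illustrative assumptions, not part of this change:

import org.dbpedia.extraction.destinations.{DBpediaDatasets, Quad}
import org.dbpedia.extraction.util.Language

// Sketch: route an owl:sameAs language link into the wikidata-ll output.
// All concrete URIs below are invented for illustration.
def languageLinkQuad(subjectUri: String, objectUri: String, sourceUri: String): Quad =
  new Quad(
    Language.English,                         // context language of the link
    DBpediaDatasets.WikidataLL,               // written to wikidata-ll
    subjectUri,                               // e.g. http://en.dbpedia.org/resource/X
    "http://www.w3.org/2002/07/owl#sameAs",
    objectUri,                                // e.g. http://de.dbpedia.org/resource/X
    sourceUri,                                // revision URI used as provenance
    null)                                     // null datatype marks an object property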
@@ -38,6 +38,7 @@ class Quad(
extends Ordered[Quad]
with Equals
{
//updated to allow adding Wikidata string properties with an unknown language
def this(
language: Language,
dataset: Dataset,
@@ -47,7 +48,7 @@
context: String,
datatype: Datatype
) = this(
- language.isoCode,
+ if (language == null) null else language.isoCode,
dataset.name,
subject,
predicate,
@@ -74,6 +75,7 @@
findType(datatype, predicate.range)
)


// Validate input
if (subject == null) throw new NullPointerException("subject")
if (predicate == null) throw new NullPointerException("predicate")
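
A minimal sketch of the call the null-guard above enables, assuming the imports already used in this diff (Language, Datatype, DBpediaDatasets, Quad); every URI and value is a placeholder:

// Before this change, a null Language crashed on language.isoCode; now it
// yields a quad without a language tag, as needed for untagged Wikidata literals.
val lang: Language = null                      // unknown/absent language
val q = new Quad(
  lang,
  DBpediaDatasets.WikidataFacts,
  "http://wikidata.dbpedia.org/resource/Q42",  // invented subject
  "http://wikidata.dbpedia.org/property/p123", // invented predicate
  "42",                                        // untagged literal value
  "http://wikidata.org/entity/Q42",            // invented provenance context
  new Datatype("xsd:string"))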
@@ -0,0 +1,43 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.destinations.{DBpediaDatasets, Quad}
import org.dbpedia.extraction.wikiparser._
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.util.Language
import scala.collection.mutable.ArrayBuffer
import scala.language.reflectiveCalls

/**
* Extracts links to the corresponding article in Wikipedia.
*/
class ArticlePageExtractor(
context : {
def ontology : Ontology
def language : Language
}
)
extends PageNodeExtractor
{
// We used foaf:page here, but foaf:isPrimaryTopicOf is probably better.
private val isPrimaryTopicOf = context.ontology.properties("foaf:isPrimaryTopicOf")
private val primaryTopic = context.ontology.properties("foaf:primaryTopic")
private val dcLanguage = context.ontology.properties("dc:language")
private val typeOntProperty = context.ontology.properties("rdf:type")
private val foafDocument = context.ontology.classes("foaf:Document")

override val datasets = Set(DBpediaDatasets.LinksToWikipediaArticle)

override def extract(page : PageNode, subjectUri : String, pageContext : PageContext): Seq[Quad] =
{
if(page.title.namespace != Namespace.Main) return Seq.empty

val quads = new ArrayBuffer[Quad]()

quads += new Quad(context.language, DBpediaDatasets.LinksToWikipediaArticle, subjectUri, isPrimaryTopicOf, page.title.pageIri, page.sourceUri)
quads += new Quad(context.language, DBpediaDatasets.LinksToWikipediaArticle, page.title.pageIri, primaryTopic, subjectUri, page.sourceUri)
quads += new Quad(context.language, DBpediaDatasets.LinksToWikipediaArticle, page.title.pageIri, dcLanguage, context.language.wikiCode, page.sourceUri)
quads += new Quad(context.language, DBpediaDatasets.LinksToWikipediaArticle, page.title.pageIri, typeOntProperty, foafDocument.uri, page.sourceUri)

quads
}
}
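
For a sense of the output, the four quads above would serialize roughly as follows for a hypothetical English page, in the same comment style as the dataset examples above (URIs illustrative):

// <http://dbpedia.org/resource/Berlin> <http://xmlns.com/foaf/0.1/isPrimaryTopicOf> <http://en.wikipedia.org/wiki/Berlin> .
// <http://en.wikipedia.org/wiki/Berlin> <http://xmlns.com/foaf/0.1/primaryTopic> <http://dbpedia.org/resource/Berlin> .
// <http://en.wikipedia.org/wiki/Berlin> <http://purl.org/dc/elements/1.1/language> "en" .
// <http://en.wikipedia.org/wiki/Berlin> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/0.1/Document> .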
@@ -6,21 +6,22 @@ import org.dbpedia.extraction.destinations.{DBpediaDatasets,Quad,QuadBuilder}
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.util.Language
import scala.language.reflectiveCalls
import org.dbpedia.extraction.sources.WikiPage

/**
* Extracts labels for Categories.
*/
class CategoryLabelExtractor( context : {
def ontology : Ontology
- def language : Language } ) extends PageNodeExtractor
+ def language : Language } ) extends WikiPageExtractor
{
private val labelProperty = context.ontology.properties("rdfs:label")

private val quad = QuadBuilder(context.language, DBpediaDatasets.CategoryLabels, labelProperty, new Datatype("xsd:string")) _

override val datasets = Set(DBpediaDatasets.CategoryLabels)

- override def extract(node : PageNode, subjectUri : String, pageContext : PageContext) : Seq[Quad] =
+ override def extract(node : WikiPage, subjectUri : String, pageContext : PageContext) : Seq[Quad] =
{
if(node.title.namespace != Namespace.Category) Seq.empty
else Seq(quad(subjectUri, node.title.decoded, node.sourceUri))
@@ -1,11 +1,12 @@
package org.dbpedia.extraction.mappings

- import org.dbpedia.extraction.destinations.{Dataset,Quad}
+ import org.dbpedia.extraction.destinations.Dataset
+ import org.dbpedia.extraction.destinations.Quad

/**
* TODO: generic type may not be optimal.
*/
- class CompositeExtractor[N](mappings: Extractor[N] *) extends Extractor[N]
+ class CompositeExtractor[N](mappings: Extractor[N]*) extends Extractor[N]
{
override val datasets: Set[Dataset] = mappings.flatMap(_.datasets).toSet

@@ -0,0 +1,26 @@
package org.dbpedia.extraction.mappings
import org.dbpedia.extraction.wikiparser.JsonNode

class CompositeJsonNodeExtractor(extractors: Extractor[JsonNode]*)
extends CompositeExtractor[JsonNode](extractors: _*)
with JsonNodeExtractor

/**
* Creates new extractors.
*/
object CompositeJsonNodeExtractor
{
/**
* Creates a new extractor.
*
* TODO: using reflection here loses compile-time type safety.
*
* @param extractors List of extractor classes to be instantiated
* @param context Any type of object that implements the required parameter methods for the extractors
*/
def load(classes: Seq[Class[_ <: JsonNodeExtractor]], context: AnyRef): JsonNodeExtractor =
{
val extractors = classes.map(_.getConstructor(classOf[AnyRef]).newInstance(context))
new CompositeJsonNodeExtractor(extractors: _*)
}
}
@@ -0,0 +1,80 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.destinations.Dataset
import org.dbpedia.extraction.destinations.Quad
import org.dbpedia.extraction.sources.WikiPage
import scala.collection.mutable.ArrayBuffer

/**
* TODO: generic type may not be optimal.
*/
class CompositeParseExtractor(extractors: Extractor[_]*)
extends WikiPageExtractor
{
override val datasets: Set[Dataset] = extractors.flatMap(_.datasets).toSet

override def extract(input: WikiPage, subjectUri: String, context: PageContext): Seq[Quad] = {

//val extractors = classes.map(_.getConstructor(classOf[AnyRef]).newInstance(context))

//define different types of Extractors
val wikiPageExtractors = new ArrayBuffer[Extractor[WikiPage]]()
val pageNodeExtractors = new ArrayBuffer[PageNodeExtractor]()
val jsonNodeExtractors = new ArrayBuffer[JsonNodeExtractor]()
val finalExtractors = new ArrayBuffer[Extractor[WikiPage]]()

val quads = new ArrayBuffer[Quad]()

//dispatch each extractor on the input type it accepts: PageNode and JsonNode
//extractors are wrapped in their parse extractor below; anything matching
//WikiPageExtractor consumes the WikiPage directly
extractors foreach {
case e: PageNodeExtractor => pageNodeExtractors += e //wrapped in a WikiParseExtractor below
case e: JsonNodeExtractor => jsonNodeExtractors += e //wrapped in a JsonParseExtractor below
case e: WikiPageExtractor => wikiPageExtractors += e //wrapped in a CompositeWikiPageExtractor below
case _ =>
}

if (!wikiPageExtractors.isEmpty)
finalExtractors += new CompositeWikiPageExtractor(wikiPageExtractors :_*)

//create and load WikiParseExtractor here
if (!pageNodeExtractors.isEmpty)
finalExtractors += new WikiParseExtractor(new CompositePageNodeExtractor(pageNodeExtractors :_*))

//create and load JsonParseExtractor here
if (!jsonNodeExtractors.isEmpty)
finalExtractors += new JsonParseExtractor(new CompositeJsonNodeExtractor(jsonNodeExtractors :_*))

if (finalExtractors.isEmpty)
Seq.empty
else
new CompositeExtractor[WikiPage](finalExtractors :_*).extract(input, subjectUri, context)
}
}

/**
* Creates new extractors.
*/
object CompositeParseExtractor
{
/**
* Creates a new CompositeParseExtractor loaded with the given extractor classes, which may accept different input types
*
* TODO: using reflection here loses compile-time type safety.
*
* @param classes List of extractor classes to be instantiated
* @param context Any type of object that implements the required parameter methods for the extractors
*/
def load(classes: Seq[Class[_ <: Extractor[_]]], context: AnyRef): WikiPageExtractor =
{
val extractors = classes.map(_.getConstructor(classOf[AnyRef]).newInstance(context))
new CompositeParseExtractor(extractors: _*)
}
}
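
A hedged usage sketch of the factory above: the extractor classes are from this diff, while context, page, subjectUri and pageContext are placeholders the caller must supply:

// Sketch: load extractors of mixed input types; CompositeParseExtractor groups
// them and wraps PageNode/JsonNode extractors in the matching parse extractor.
val classes: Seq[Class[_ <: Extractor[_]]] =
  Seq(classOf[LabelExtractor], classOf[ArticlePageExtractor])
val extractor: WikiPageExtractor = CompositeParseExtractor.load(classes, context)
// extractor.extract(page, subjectUri, pageContext) now fans out to both:
// LabelExtractor sees the WikiPage directly, while ArticlePageExtractor gets
// the PageNode produced by the wrapped WikiParseExtractor.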
@@ -0,0 +1,9 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.sources.WikiPage

class CompositeWikiPageExtractor(extractors: Extractor[WikiPage]*)
extends CompositeExtractor[WikiPage](extractors: _*)
with WikiPageExtractor
@@ -6,6 +6,7 @@ import org.dbpedia.extraction.wikiparser._
import org.dbpedia.extraction.destinations.{DBpediaDatasets, Quad}
import java.net.{URLEncoder, URI}
import scala.language.reflectiveCalls
import org.dbpedia.extraction.sources.WikiPage

/**
* Created by IntelliJ IDEA.
@@ -17,18 +18,18 @@ import scala.language.reflectiveCalls

class ContributorExtractor( context : {
def ontology : Ontology
- def language : Language } ) extends PageNodeExtractor
+ def language : Language } ) extends WikiPageExtractor
{

override val datasets = Set(DBpediaDatasets.RevisionMeta)

- override def extract(node : PageNode, subjectUri : String, pageContext : PageContext) : Seq[Quad] =
+ override def extract(node : WikiPage, subjectUri : String, pageContext : PageContext) : Seq[Quad] =
{
if(node.title.namespace != Namespace.Main) return Seq.empty

if(node.contributorID <= 0) return Seq.empty

- val pageURL = "http://" + context.language.wikiCode + ".wikipedia.org/wiki/" + node.root.title.encoded;
+ val pageURL = "http://" + context.language.wikiCode + ".wikipedia.org/wiki/" + node.title.encoded;

//Required predicates
val contributorPredicate = "http://dbpedia.org/meta/contributor";
@@ -0,0 +1,11 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.destinations.Quad
import org.dbpedia.extraction.wikiparser._

/**
* Extractors are mappings that extract data from a JsonNode.
* Necessary to get some type safety in CompositeExtractor:
* Class[_ <: Extractor] can be checked at runtime, but Class[_ <: Mapping[PageNode]] cannot.
*/
trait JsonNodeExtractor extends Extractor[JsonNode]
@@ -0,0 +1,35 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.destinations.{Dataset, Quad}
import org.dbpedia.extraction.sources.WikiPage
import org.dbpedia.extraction.wikiparser.impl.json.JsonWikiParser

/**
* User: hadyelsahar
* Date: 11/19/13
* Time: 12:43 PM
*
* JsonParseExtractor as explained in the design: https://f.cloud.github.com/assets/607468/363286/1f8da62c-a1ff-11e2-99c3-bb5136accc07.png
*
* Sends the page to the JsonWikiParser; if the parser returns None, nothing is extracted.
* If the page parses correctly, the resulting JsonNode is passed on to the next-level extractors.
*
* @param extractors the CompositeJsonNodeExtractor to apply to parsed pages
*/
class JsonParseExtractor(extractors: CompositeJsonNodeExtractor) extends Extractor[WikiPage] {

override val datasets: Set[Dataset] = extractors.datasets

override def extract(input: WikiPage, subjectUri: String, context: PageContext): Seq[Quad] = {

val parser = new JsonWikiParser()
val node = parser(input)
node match {
case Some(n) => extractors.extract(n, subjectUri, context)
case None => Seq.empty
}

}

}
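
For illustration, how this wrapper slots into a pipeline; SomeJsonNodeExtractor stands in for any JsonNodeExtractor implementation and is not a class in this diff, and context, page, subjectUri and pageContext are placeholders:

// Sketch: JSON-level extractors become WikiPage-level via JsonParseExtractor.
val jsonSide: Extractor[WikiPage] = new JsonParseExtractor(
  new CompositeJsonNodeExtractor(new SomeJsonNodeExtractor(context)))
// A page whose text is not valid JSON yields no quads (the parser returns None).
val quads = jsonSide.extract(page, subjectUri, pageContext)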
@@ -5,6 +5,7 @@ import org.dbpedia.extraction.wikiparser._
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.util.Language
import scala.language.reflectiveCalls
import org.dbpedia.extraction.sources.WikiPage

/**
* Extracts labels to articles based on their title.
@@ -15,13 +16,13 @@ class LabelExtractor(
def language : Language
}
)
- extends PageNodeExtractor
+ extends WikiPageExtractor
{
val labelProperty = context.ontology.properties("rdfs:label")

override val datasets = Set(DBpediaDatasets.Labels)

- override def extract(page: PageNode, subjectUri: String, pageContext: PageContext) : Seq[Quad] =
+ override def extract(page: WikiPage, subjectUri: String, pageContext: PageContext) : Seq[Quad] =
{
if(page.title.namespace != Namespace.Main) return Seq.empty

@@ -10,6 +10,7 @@ import org.dbpedia.extraction.ontology.{Ontology, OntologyClass, OntologyPropert
import java.lang.IllegalArgumentException
import org.dbpedia.extraction.util.Language
import scala.language.reflectiveCalls
import org.dbpedia.extraction.sources.WikiPage

/**
* Loads the mappings from the configuration and builds a MappingExtractor instance.
@@ -24,14 +25,15 @@ object MappingsLoader
def ontology : Ontology
def language : Language
def redirects : Redirects
- def mappingPageSource : Traversable[PageNode] } ) : Mappings =
+ def mappingPageSource : Traversable[WikiPage] } ) : Mappings =
{
logger.info("Loading mappings ("+context.language.wikiCode+")")

val classMappings = new HashMap[String, Extractor[TemplateNode]]()
val tableMappings = new ArrayBuffer[TableMapping]()
val parser = WikiParser.getInstance()

- for ( page <- context.mappingPageSource;
+ for ( page <- context.mappingPageSource.map(parser).flatten;
node <- page.children if node.isInstanceOf[TemplateNode] )
{
val tnode = node.asInstanceOf[TemplateNode]
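
The switch to context.mappingPageSource.map(parser).flatten reflects the parser now returning Option[PageNode] (see the "Parser -> Option" commits above); a minimal sketch of the pattern, with stand-in values:

// WikiParser.apply returns Option[PageNode] after this refactoring, so
// mapping then flattening silently drops pages that fail to parse.
val parser = WikiParser.getInstance()
val parsed: Traversable[PageNode] = context.mappingPageSource.map(parser).flatten
// equivalent and arguably clearer:
val parsedAlt: Traversable[PageNode] = context.mappingPageSource.flatMap(parser(_))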