Commit 60890f2: implemented Lexeme Extractor
1 parent 0a7cf55

2 files changed: +243 −3 lines changed
Lines changed: 240 additions & 0 deletions
@@ -0,0 +1,240 @@
package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.mappings.JsonNodeExtractor
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.transform.Quad
import org.dbpedia.extraction.util.{Language, WikidataUtil}
import org.dbpedia.extraction.wikiparser.{JsonNode, Namespace}
import org.wikidata.wdtk.datamodel.interfaces._

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
import scala.language.reflectiveCalls

/**
 * Extracts data from Wikidata Lexeme pages: the lexical category, lemmas and
 * language of each lexeme, its direct statements, and the statements, glosses,
 * representations and grammatical features of its senses and forms.
 */
class WikidataLexemeExtractor(
  context: {
    def ontology: Ontology
    def language: Language
  }
)
  extends JsonNodeExtractor {

  private val lexicalCategoryProperty = "http://dbpedia.org/ontology/lexicalcategory"
  private val lemmaProperty = "http://dbpedia.org/ontology/lemma"
  private val language = "http://dbpedia.org/ontology/language"
  private val grammaticalFeatureProperty = "http://dbpedia.org/ontology/grammaticalfeature"
  private val labelProperty = context.ontology.properties("rdfs:label")
  // private val descriptionProperty = context.ontology.properties("description")
  // private val languageProperty = context.ontology.properties("rdf:language")

  override val datasets = Set(DBpediaDatasets.WikidataLexeme)

  override def extract(page: JsonNode, subjectUri: String): Seq[Quad] = {
    val quads = new ArrayBuffer[Quad]()

    val subject = WikidataUtil.getWikidataNamespace(subjectUri).replace("Lexeme:", "")

    // Each helper below checks that the page belongs to the Lexeme namespace,
    // so the extractor is a no-op for all other entity types.
    quads ++= getLexicalCategory(page, subject)
    quads ++= getLemmas(page, subject)
    quads ++= getLanguage(page, subject)
    quads ++= getStatements(page, subject)
    quads ++= getSenses(page, subject)
    quads ++= getForms(page, subject)

    quads
  }

  // One quad linking the lexeme to its lexical category (e.g. noun, verb).
  private def getLexicalCategory(document: JsonNode, subjectUri: String): Seq[Quad] = {
    val quads = new ArrayBuffer[Quad]()

    if (document.wikiPage.title.namespace == Namespace.WikidataLexeme) {
      val page = document.wikiDataDocument.deserializeLexemeDocument(document.wikiPage.source)
      page.getLexicalCategory match {
        case value: Value =>
          val objectValue = WikidataUtil.getValue(value)
          // val datatype = if (WikidataUtil.getDatatype(v) != null) context.ontology.datatypes(WikidataUtil.getDatatype(v)) else null
          quads += new Quad(context.language, DBpediaDatasets.WikidataLexeme, subjectUri, lexicalCategoryProperty, objectValue,
            document.wikiPage.sourceIri, null)
        case _ =>
      }
    }
    quads
  }

  // One language-tagged quad per lemma of the lexeme.
  private def getLemmas(document: JsonNode, subjectUri: String): Seq[Quad] = {
    val quads = new ArrayBuffer[Quad]()

    if (document.wikiPage.title.namespace == Namespace.WikidataLexeme) {
      val page = document.wikiDataDocument.deserializeLexemeDocument(document.wikiPage.source)
      for ((lang, value) <- page.getLemmas) {
        val lemma = WikidataUtil.replacePunctuation(value.toString, lang)
        Language.get(lang) match {
          case Some(dbpedia_lang) =>
            quads += new Quad(dbpedia_lang, DBpediaDatasets.WikidataLexeme, subjectUri, lemmaProperty, lemma,
              document.wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
          case _ =>
        }
      }
    }
    quads
  }

  // One quad linking the lexeme to the language it belongs to.
  private def getLanguage(document: JsonNode, subjectUri: String): Seq[Quad] = {
    val quads = new ArrayBuffer[Quad]()

    if (document.wikiPage.title.namespace == Namespace.WikidataLexeme) {
      val page = document.wikiDataDocument.deserializeLexemeDocument(document.wikiPage.source)
      page.getLanguage match {
        case value: Value =>
          val objectValue = WikidataUtil.getValue(value)
          quads += new Quad(context.language, DBpediaDatasets.WikidataLexeme, subjectUri, language, objectValue,
            document.wikiPage.sourceIri, null)
        case _ =>
      }
    }
    quads
  }

  // One quad per main-snak value of each statement attached directly to the
  // lexeme; the Wikidata property IRI serves as the predicate.
  private def getStatements(document: JsonNode, subjectUri: String): Seq[Quad] = {
    val quads = new ArrayBuffer[Quad]()

    if (document.wikiPage.title.namespace == Namespace.WikidataLexeme) {
      val page = document.wikiDataDocument.deserializeLexemeDocument(document.wikiPage.source)
      for (statementGroup <- page.getStatementGroups) {
        statementGroup.foreach { statement =>
          val claim = statement.getClaim
          val property = WikidataUtil.getWikidataNamespace(claim.getMainSnak.getPropertyId.getIri)

          claim.getMainSnak match {
            case mainSnak: ValueSnak =>
              val v = mainSnak.getValue
              val value = WikidataUtil.getValue(v).split(" ")(0)
              val datatype = if (WikidataUtil.getDatatype(v) != null) context.ontology.datatypes(WikidataUtil.getDatatype(v)) else null
              quads += new Quad(context.language, DBpediaDatasets.WikidataLexeme, subjectUri, property, value, document.wikiPage.sourceIri, datatype)
            case _ =>
          }
        }
      }
    }
    quads
  }

  // Per sense: one quad per statement main-snak value, plus one rdfs:label per gloss.
  private def getSenses(document: JsonNode, subjectUri: String): Seq[Quad] = {
    val quads = new ArrayBuffer[Quad]()

    if (document.wikiPage.title.namespace == Namespace.WikidataLexeme) {
      val page = document.wikiDataDocument.deserializeLexemeDocument(document.wikiPage.source)
      for (sense <- page.getSenses) {
        val senseId = sense.getEntityId.toString.split(" ", 2)(0)
        for (statementGroup <- sense.getStatementGroups) {
          statementGroup.foreach { statement =>
            val claim = statement.getClaim
            val property = WikidataUtil.getWikidataNamespace(claim.getMainSnak.getPropertyId.getIri)

            claim.getMainSnak match {
              case mainSnak: ValueSnak =>
                val v = mainSnak.getValue
                val value = WikidataUtil.getValue(v).split(" ")(0)
                val datatype = if (WikidataUtil.getDatatype(v) != null) context.ontology.datatypes(WikidataUtil.getDatatype(v)) else null
                quads += new Quad(context.language, DBpediaDatasets.WikidataLexeme, senseId, property, value, document.wikiPage.sourceIri, datatype)
              case _ =>
            }
          }
        }
        for ((lang, value) <- sense.getGlosses) {
          val gloss = WikidataUtil.replacePunctuation(value.toString, lang)
          Language.get(lang) match {
            case Some(dbpedia_lang) =>
              quads += new Quad(dbpedia_lang, DBpediaDatasets.WikidataLexeme, senseId, labelProperty, gloss,
                document.wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
            case _ =>
          }
        }
      }
    }
    quads
  }

  // Per form: one quad per statement main-snak value, one rdfs:label per
  // representation, and one quad per grammatical feature.
  private def getForms(document: JsonNode, subjectUri: String): Seq[Quad] = {
    val quads = new ArrayBuffer[Quad]()

    if (document.wikiPage.title.namespace == Namespace.WikidataLexeme) {
      val page = document.wikiDataDocument.deserializeLexemeDocument(document.wikiPage.source)
      for (form <- page.getForms) {
        val formId = form.getEntityId.toString.split(" ", 2)(0)
        for (statementGroup <- form.getStatementGroups) {
          statementGroup.foreach { statement =>
            val claim = statement.getClaim
            val property = WikidataUtil.getWikidataNamespace(claim.getMainSnak.getPropertyId.getIri)

            claim.getMainSnak match {
              case mainSnak: ValueSnak =>
                val v = mainSnak.getValue
                val value = WikidataUtil.getValue(v).split(" ")(0)
                val datatype = if (WikidataUtil.getDatatype(v) != null) context.ontology.datatypes(WikidataUtil.getDatatype(v)) else null
                quads += new Quad(context.language, DBpediaDatasets.WikidataLexeme, formId, property, value, document.wikiPage.sourceIri, datatype)
              case _ =>
            }
          }
        }
        for ((lang, value) <- form.getRepresentations) {
          val representation = WikidataUtil.replacePunctuation(value.toString, lang)
          Language.get(lang) match {
            case Some(dbpedia_lang) =>
              quads += new Quad(dbpedia_lang, DBpediaDatasets.WikidataLexeme, formId, labelProperty, representation,
                document.wikiPage.sourceIri, context.ontology.datatypes("rdf:langString"))
            case _ =>
          }
        }
        for (grammaticalFeature <- form.getGrammaticalFeatures) {
          grammaticalFeature match {
            case value: Value =>
              val objectValue = WikidataUtil.getValue(value)
              quads += new Quad(context.language, DBpediaDatasets.WikidataLexeme, formId, grammaticalFeatureProperty, objectValue,
                document.wikiPage.sourceIri, null)
            case _ =>
          }
        }
      }
    }
    quads
  }
}
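For orientation: the constructor takes a structural type, so any object exposing an `ontology` and a `language` member can serve as the context. A minimal wiring sketch follows (hypothetical, not part of the commit; in a real run the framework builds the context and instantiates the extractor from the `extractors.wikidata` list in the properties file below, and `ontologyFromConfig`, `node` and `subjectUri` are placeholders):

// Hypothetical usage sketch, under the assumptions stated above.
val extractionContext = new {
  def ontology: Ontology = ontologyFromConfig            // placeholder: the loaded DBpedia ontology
  def language: Language = Language.get("wikidata").get  // "wikidata" is configured as a language below
}
val extractor = new WikidataLexemeExtractor(extractionContext)

// For one parsed Lexeme page and its subject URI, extract() returns the
// quads destined for the WikidataLexeme dataset:
val quads: Seq[Quad] = extractor.extract(node, subjectUri)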

dump/extraction.wikidata.properties

Lines changed: 3 additions & 3 deletions
@@ -12,9 +12,9 @@
 
 # default:
 #source=# moved to $extraction-framework/core/src/main/resources/universal.properties
-
+#source=pages-meta-current19.xml-p19072452p19140743.bz2
 # use only directories that contain a 'download-complete' file? Default is false.
-require-download-complete=true
+require-download-complete=false
 
 # List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
 languages=wikidata
@@ -23,5 +23,5 @@ languages=wikidata
 
 extractors=
 
-extractors.wikidata=.WikidataR2RExtractor,.WikidataLLExtractor,.WikidataReferenceExtractor,.WikidataAliasExtractor,.WikidataLabelExtractor,.WikidataNameSpaceSameAsExtractor,.WikidataPropertyExtractor,.WikidataLabelExtractor,.WikidataDescriptionExtractor
+extractors.wikidata=.WikidataR2RExtractor,.WikidataLLExtractor,.WikidataReferenceExtractor,.WikidataAliasExtractor,.WikidataLabelExtractor,.WikidataNameSpaceSameAsExtractor,.WikidataPropertyExtractor,.WikidataLabelExtractor,.WikidataDescriptionExtractor,.WikidataLexemeExtractor
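A reading of the config change (framework convention, not stated in the diff): entries in `extractors.wikidata` that begin with a dot are resolved relative to the `org.dbpedia.extraction.mappings` package, so appending `.WikidataLexemeExtractor` registers the class added above for the wikidata extraction. Setting `require-download-complete=false` lets the run proceed against a dump directory that lacks a `download-complete` marker, which fits testing against a partial dump such as the commented-out `pages-meta-current19` chunk.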
