package org.dbpedia.extraction.mappings

import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.mappings.JsonNodeExtractor
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.transform.Quad
import org.dbpedia.extraction.util.{Language, WikidataUtil}
import org.dbpedia.extraction.wikiparser.{JsonNode, Namespace}
import org.wikidata.wdtk.datamodel.interfaces._

import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.language.reflectiveCalls
14+
/**
 * Extracts quads from Wikidata lexeme pages.
 *
 * For a page in the [[Namespace.WikidataLexeme]] namespace the extractor emits, in this order:
 * the lexical category, the lemmas, the language, the lexeme's own statements, then for every
 * sense its statements and glosses, and for every form its statements, representations and
 * grammatical features. Pages in any other namespace yield no quads.
 *
 * @param context supplies the ontology (used to look up `rdfs:label` and datatypes) and the
 *                language attached to every quad that is not a language-tagged literal
 */
class WikidataLexemeExtractor(
  context: {
    def ontology: Ontology
    def language: Language
  }
)
  extends JsonNodeExtractor {

  // Predicate IRIs are written to the quads verbatim, so they must not carry surrounding whitespace.
  private val lexicalCategoryProperty = "http://dbpedia.org/ontology/lexicalcategory"
  private val lemmaProperty = "http://dbpedia.org/ontology/lemma"
  private val languageProperty = "http://dbpedia.org/ontology/language"
  private val grammaticalFeatureProperty = "http://dbpedia.org/ontology/grammaticalfeature"
  private val labelProperty = context.ontology.properties("rdfs:label")

  override val datasets = Set(DBpediaDatasets.WikidataLexeme)

  /**
   * Builds all quads for one lexeme page.
   *
   * @param page       the JSON page; it is deserialized as a lexeme document exactly once
   * @param subjectUri URI of the page; the `Lexeme:` prefix is stripped to form the quad subject
   * @return the extracted quads, or an empty sequence if the page is not in the lexeme namespace
   */
  override def extract(page: JsonNode, subjectUri: String): Seq[Quad] = {
    // Checks that the extractor is used for the correct entity type before deserializing.
    if (page.wikiPage.title.namespace != Namespace.WikidataLexeme) {
      Seq.empty
    } else {
      val subject = WikidataUtil.getWikidataNamespace(subjectUri).replace("Lexeme:", "")
      val sourceIri = page.wikiPage.sourceIri
      val lexeme = page.wikiDataDocument.deserializeLexemeDocument(page.wikiPage.source)

      val quads = new ArrayBuffer[Quad]()

      quads ++= itemQuads(subject, lexicalCategoryProperty, lexeme.getLexicalCategory, sourceIri)
      quads ++= textQuads(lexeme.getLemmas) { (lang, lemma) =>
        new Quad(lang, DBpediaDatasets.WikidataLexeme, subject, lemmaProperty, lemma,
          sourceIri, langStringDatatype)
      }
      quads ++= itemQuads(subject, languageProperty, lexeme.getLanguage, sourceIri)
      quads ++= statementQuads(lexeme.getStatementGroups, subject, sourceIri)

      for (sense <- lexeme.getSenses.asScala) {
        // Only the part of the id's string form before the first space is used as the subject
        // (presumably the IRI, followed by a type suffix -- TODO confirm against WDTK).
        val senseId = sense.getEntityId.toString.split(" ", 2)(0)
        quads ++= statementQuads(sense.getStatementGroups, senseId, sourceIri)
        quads ++= labelQuads(sense.getGlosses, senseId, sourceIri)
      }

      for (form <- lexeme.getForms.asScala) {
        // Same id handling as for senses.
        val formId = form.getEntityId.toString.split(" ", 2)(0)
        quads ++= statementQuads(form.getStatementGroups, formId, sourceIri)
        quads ++= labelQuads(form.getRepresentations, formId, sourceIri)
        for (feature <- form.getGrammaticalFeatures.asScala) {
          quads ++= itemQuads(formId, grammaticalFeatureProperty, feature, sourceIri)
        }
      }

      quads
    }
  }

  /** Datatype used for every language-tagged literal; looked up on each use. */
  private def langStringDatatype = context.ontology.datatypes("rdf:langString")

  /** Emits one quad (without datatype) for `item`, or nothing when `item` is null. */
  private def itemQuads(subjectUri: String, predicate: String, item: Value, sourceIri: String): Seq[Quad] =
    Option(item).map { present =>
      new Quad(context.language, DBpediaDatasets.WikidataLexeme, subjectUri, predicate,
        WikidataUtil.getValue(present), sourceIri, null)
    }.toList

  /**
   * Turns a language-code -> text map into quads, skipping entries whose language code is not
   * known to [[Language]]. `toQuad` receives the resolved language and the cleaned-up text.
   */
  private def textQuads(texts: java.util.Map[String, MonolingualTextValue])
                       (toQuad: (Language, String) => Quad): Seq[Quad] =
    texts.asScala.toList.flatMap { case (langCode, text) =>
      val literal = WikidataUtil.replacePunctuation(text.toString, langCode)
      Language.get(langCode).map(lang => toQuad(lang, literal)).toList
    }

  /** Emits `rdfs:label` quads on `subjectUri` for every text whose language is known. */
  private def labelQuads(texts: java.util.Map[String, MonolingualTextValue],
                         subjectUri: String, sourceIri: String): Seq[Quad] =
    textQuads(texts) { (lang, label) =>
      new Quad(lang, DBpediaDatasets.WikidataLexeme, subjectUri, labelProperty, label,
        sourceIri, langStringDatatype)
    }

  /**
   * Emits one quad per statement whose main snak carries a value; other snaks are skipped.
   * The predicate is derived from the snak's property IRI, and the object is the value's string
   * form truncated at the first space.
   */
  private def statementQuads(groups: java.util.List[StatementGroup],
                             subjectUri: String, sourceIri: String): Seq[Quad] = {
    val quads = new ArrayBuffer[Quad]()
    for (group <- groups.asScala; statement <- group.asScala) {
      statement.getClaim.getMainSnak match {
        case snak: ValueSnak =>
          val predicate = WikidataUtil.getWikidataNamespace(snak.getPropertyId.getIri)
          val raw = snak.getValue
          val value = WikidataUtil.getValue(raw).split(" ")(0)
          // No datatype name for this value means the quad is written without a datatype.
          val datatype = Option(WikidataUtil.getDatatype(raw))
            .map(name => context.ontology.datatypes(name))
            .orNull
          quads += new Quad(context.language, DBpediaDatasets.WikidataLexeme, subjectUri,
            predicate, value, sourceIri, datatype)
        case _ =>
      }
    }
    quads
  }
}
0 commit comments