Skip to content

Commit 48bc96d

Browse files
committed
Added Scala script to extract the language-links (ll) master file and the language-specific ll files
1 parent 3c1ea98 commit 48bc96d

File tree

1 file changed

+167
-0
lines changed

1 file changed

+167
-0
lines changed
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
//package org.dbpedia.extraction.scripts
2+
3+
import java.io._
4+
import scala.collection.immutable.HashMap
5+
import scala.io.Source
6+
import scala.io._
7+
import sys.process._
8+
9+
10+
/**
11+
* User: hadyelsahar
12+
*
13+
*/
14+
15+
16+
/**
 * Command-line tool that derives inter-language link files from a Wikidata RDF dump.
 *
 * Usage: `LanguageSpecificLinksGenerator <option> <input file>`
 *
 *  - option `0`: reads the dump and, for every 4-token line that mentions a Wikipedia page
 *    followed by the `schema:about` predicate, writes
 *    `<object> owl:sameAs <subject rewritten to *.dbpedia.org/resource> .`
 *    to `./languagelinks.ttl`.
 *  - option `1`: reads such a master file and writes, per language, the links between
 *    resources sharing the same subject into
 *    `./llinkfiles/interlanguage_links_same_as_<lang>.ttl`.
 */
object LanguageSpecificLinksGenerator {

  /**
   * Open writers keyed by the file name they were created with, so that all of them can be
   * flushed and closed together once processing is done.
   */
  var filesWriters: HashMap[String, BufferedWriter] = HashMap.empty[String, BufferedWriter]

  /**
   * Master file produced by option 0. One constant is used both to create and to write the
   * file, so the writer lookup can no longer miss because of a diverging file extension.
   */
  private val MasterLinksFile = "./languagelinks.ttl"

  /** Folder that receives one link file per language (option 1). */
  private val LanguageLinksDir = "./llinkfiles"

  private val SameAs = "<http://www.w3.org/2002/07/owl#sameAs>"

  /** Finds `.wikipedia.org/wiki` followed later on the same line by the schema:about predicate. */
  private val WikipediaAbout = """.*\.wikipedia.org\/wiki.*<http:\/\/schema\.org\/about>""".r

  /** Captures the sub-domain (language code) of a `<http://LANG.dbpedia.org/resource/...>` URI. */
  private val DbpediaResource = """<http://([^/]+)\.dbpedia\.org/resource/.*>""".r

  /** One line of the master file: the shared subject, the language resource and its language. */
  private final case class Link(subject: String, resource: String, lang: String)

  /**
   * Helper function to split a triple line into subject, predicate, object and the full stop.
   * Splitting is done on single spaces; empty tokens are dropped.
   *
   * @param arg triple line
   * @return array of the non-empty, trimmed tokens of the line
   */
  private def split(arg: String): Array[String] =
    arg.split(" ").map(_.trim).filter(_.nonEmpty)

  /**
   * Creates (or truncates) a file and registers its writer in `filesWriters`.
   *
   * @param fileName path of the file; also the key under which the writer is stored
   */
  private def createFile(fileName: String): Unit = {
    // FileWriter creates the file itself, no separate createNewFile() is needed
    val writer = new BufferedWriter(new FileWriter(new File(fileName)))
    filesWriters += (fileName -> writer)
  }

  /**
   * Writes one line to a file previously opened with `createFile`.
   *
   * @param file name of the file exactly as passed to `createFile`
   * @param str  text of the line, without line terminator
   */
  private def logToFile(file: String, str: String): Unit = {
    val writer = filesWriters(file)
    writer.write(str)
    writer.newLine()
  }

  /**
   * Flushes and closes every registered writer and empties `filesWriters`.
   *
   * All writers are closed even if one of them fails; the first failure is rethrown at the
   * end. The map is reset so that a later run in the same JVM never reuses a closed writer.
   */
  private def closeWriters(): Unit = {
    var firstError: Option[IOException] = None
    for ((_, writer) <- filesWriters) {
      // close() flushes the buffer before releasing the file
      try writer.close()
      catch { case e: IOException => if (firstError.isEmpty) firstError = Some(e) }
    }
    filesWriters = HashMap.empty[String, BufferedWriter]
    firstError.foreach(e => throw e)
  }

  /** Runs `body` over the lines of `input` and always closes the underlying source. */
  private def withLines[A](input: File)(body: Iterator[String] => A): A = {
    val source = Source.fromFile(input)
    try body(source.getLines())
    finally source.close()
  }

  /**
   * Returns the name of the link file of a language, creating the output folder and the
   * file (with its writer) the first time the language is seen.
   */
  private def languageFile(lang: String): String = {
    val name = LanguageLinksDir + "/interlanguage_links_same_as_" + lang + ".ttl"
    if (!filesWriters.contains(name)) {
      new File(LanguageLinksDir).mkdir()
      createFile(name)
    }
    name
  }

  /**
   * Option 0: extracts the language-link triples from the Wikidata RDF dump file and saves
   * them in the separate master file `./languagelinks.ttl`.
   *
   * @param dump the Wikidata dump, one triple per line
   */
  private def extractLanguageLinks(dump: File): Unit = {
    createFile(MasterLinksFile)
    try {
      withLines(dump) { lines =>
        for (ln <- lines) {
          val triple = split(ln)
          // only well-formed triples (subject, predicate, object, full stop) are considered
          if (triple.length == 4 && WikipediaAbout.findFirstIn(ln).isDefined) {
            val resource = triple(0).replace(".wikipedia.org/wiki", ".dbpedia.org/resource")
            logToFile(MasterLinksFile, triple(2) + " " + SameAs + " " + resource + " .")
          }
        }
      }
    } finally closeWriters()
  }

  /**
   * Option 1: from the master file, writes for every language the links from its resources
   * to the resources that share the same subject in the other languages.
   *
   * The file is read once and grouped by subject instead of being rescanned for every line.
   * Lines are emitted in the same order as a line-by-line rescan would produce; the price is
   * that all links are held in memory. Lines that do not have exactly four tokens, or whose
   * object is not a `LANG.dbpedia.org/resource` URI, are skipped.
   *
   * @param master file of `<subject> owl:sameAs <language resource> .` lines
   */
  private def generateLanguageFiles(master: File): Unit = {
    val links: Vector[Link] = withLines(master) { lines =>
      lines.map(split).collect {
        case Array(subject, _, resource @ DbpediaResource(lang), _) => Link(subject, resource, lang)
      }.toVector
    }
    // groupBy keeps the file order inside every group
    val bySubject = links.groupBy(_.subject)

    try {
      for (link <- links) {
        val target = languageFile(link.lang)
        for (other <- bySubject(link.subject) if other.lang != link.lang)
          logToFile(target, link.resource + " " + SameAs + " " + other.resource + " .")
      }
    } finally closeWriters()
  }

  /**
   * Entry point.
   *
   * @param args `args(0)` is the option (`"0"` or `"1"`), `args(1)` the input file
   * @throws IllegalArgumentException if fewer than two arguments are given
   */
  def main(args: Array[String]): Unit = {
    val usage = "usage: LanguageSpecificLinksGenerator <option: 0|1> <input file>"
    require(args.length >= 2, usage)

    val input = new File(args(1))
    args(0) match {
      case "0" => extractLanguageLinks(input)
      case "1" => generateLanguageFiles(input)
      // unknown options were silently ignored before; keep not failing, but say why nothing ran
      case other => Console.err.println("unknown option '" + other + "'; " + usage)
    }
  }

}
167+

0 commit comments

Comments
 (0)