Skip to content

Commit 09c5305

Browse files
committed
generating language-specific LL files depending on chunk data
1 parent b15a0b7 commit 09c5305

File tree

1 file changed

+50
-68
lines changed

1 file changed

+50
-68
lines changed

scripts/src/main/scala/org/dbpedia/extraction/scripts/LanguageSpecificLinksGenerator.scala

Lines changed: 50 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import sys.process._
99

1010
/**
1111
* User: hadyelsahar
12-
*
1312
*/
1413

1514

@@ -66,39 +65,6 @@ object LanguageSpecificLinksGenerator {
6665
}
6766
}
6867

69-
70-
/**
71-
* util function to add paddings to all lines to make them of same length
72-
* @param fileIn
73-
* @param fileOut
74-
*/
75-
private def padFile (fileIn :String , fileOut:String)
76-
{
77-
val baseDir = new File(fileIn)
78-
79-
//finding maximum length of line in the triples file
80-
var maxLength=0
81-
82-
for(ln <-Source.fromFile(baseDir).getLines)
83-
{
84-
if(ln.length > maxLength) maxLength = ln.length
85-
}
86-
87-
CreateFile(fileOut)
88-
89-
//adding paddings
90-
for(ln <-Source.fromFile(baseDir).getLines)
91-
{
92-
val newline = ln+" "*(maxLength-ln.length)
93-
LogToFile(fileOut,newline)
94-
}
95-
96-
CloseWriters()
97-
}
98-
99-
100-
101-
10268
def main(args: Array[String]) {
10369
//todo : add some requires here to check for arguments
10470
//arg1 = 0
@@ -135,75 +101,91 @@ object LanguageSpecificLinksGenerator {
135101

136102
}
137103

138-
padFile("./languagelinks.ttl","./languagelinks_Padded.ttl")
139-
104+
CloseWriters
140105

141106
}
142107

143-
144108
/**
145-
* option 2:
109+
* option 1:
146110
* ---------
147111
* from the extracted languagelinks.nt file
148112
* extracting language links and save them in languagelinks folder
149113
*/
150114
if(option == "1")
151115
{
116+
val startTime = System.nanoTime
117+
118+
//opening master file for language links
152119
val baseDir = new File(args(1))
153120
val file = Source.fromFile(baseDir)
154121

122+
//creating folder for output files
123+
new File("./llinkfiles").mkdir()
155124

156-
for(ln <- file.getLines){
157-
var triple = split(ln);
125+
var Q = ""
126+
var oldQ = ""
127+
var triplesObjects = List[String]()
128+
val lines = file.getLines
129+
for(ln <- lines){
158130

159-
if(triple.length ==4){
160-
val Q1 = triple(0)
161-
val Obj1 = triple(2)
162-
163-
val langRegx = """<http:\/\/(.*).dbpedia.org\/resource\/.*>""".r
164-
val langRegx(lang) = triple(2)
131+
val triple = split(ln);
165132

166-
//make folder for ll files
167-
new File("./llinkfiles").mkdir()
133+
//gather all objects of triples until the subject changes
134+
oldQ = Q
135+
Q = triple(0)
136+
val tripleObj = triple(2)
168137

169-
//create languagefile for each language if doesn't exist before
170-
val LLFileName = "./llinkfiles/interlanguage_links_same_as_"+lang+".ttl"
138+
//for each chunk (the subject changed or it's the last line), make combinations and save to files
171139

172-
if(!filesWriters.contains(LLFileName))
140+
if((oldQ != Q && oldQ != "") || !lines.hasNext)
173141
{
174-
CreateFile(LLFileName)
175-
}
142+
//println(oldQ)
143+
for(obj <- triplesObjects)
144+
{
145+
//extracting language
146+
val langRegx = """<http:\/\/(.*).dbpedia.org\/resource\/.*>""".r
147+
val langRegx(lang) = obj
176148

177-
//iterate over all triples todo: change to more efficient way
178-
for(ln <- Source.fromFile(baseDir).getLines){
149+
//creating file for language if not exists
150+
val fileName = "./llinkfiles/interlanguage_links_same_as_"+lang+".ttl"
179151

180-
triple = split(ln);
152+
if(!filesWriters.contains(fileName))
153+
{
154+
CreateFile(fileName)
155+
}
181156

182-
if(triple.length ==4 ){
183-
val Q2= triple(0)
184-
val Obj2 = triple(2)
185-
val langRegx(innerLang) = triple(2)
157+
//creating combination string
158+
var LLString :String= ""
159+
//removing itself
160+
val innerTripleObjects = triplesObjects.diff(List(obj))
186161

187-
if(lang != innerLang && Q1 == Q2)
162+
163+
for(obj2 <- innerTripleObjects)
188164
{
189-
LogToFile(LLFileName,Obj1+" "+"<http://www.w3.org/2002/07/owl#sameAs>"+" "+Obj2+" .")
165+
//LLString += obj +" <http://www.w3.org/2002/07/owl#sameAs> " +obj2+" .\n"
166+
LogToFile(fileName,obj +" <http://www.w3.org/2002/07/owl#sameAs> " +obj2+" .\n")
190167
}
168+
169+
//LogToFile(fileName,LLString)
191170
}
171+
172+
//empty the Chunk container
173+
triplesObjects = List[String]()
192174
}
175+
176+
triplesObjects = triplesObjects :+ tripleObj
193177
}
194-
}
195-
CloseWriters()
196-
}
197178

179+
CloseWriters()
198180

199-
if(option =="test")
200-
{
181+
print("time taken: " + (System.nanoTime - startTime)/1000000000 +" secs" )
201182

202-
padFile(args(1),args(1).replace("test","testpadded"))
203183

204184
}
205185

186+
206187
}
207188

189+
208190
}
209191

0 commit comments

Comments
 (0)