Skip to content

Commit 21a402c

Browse files
committed
implicit conversions for readline , adding options for file names
1 parent eb97cee commit 21a402c

File tree

1 file changed

+44
-50
lines changed

1 file changed

+44
-50
lines changed

scripts/src/main/scala/org/dbpedia/extraction/scripts/LanguageSpecificLinksGenerator.scala

Lines changed: 44 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,14 @@ package org.dbpedia.extraction.scripts
22

33
import java.io._
44
import scala.collection.immutable.HashMap
5-
import scala.io.Source
65
import scala.io._
7-
import sys.process._
86
import org.dbpedia.util.text.uri._
9-
import org.dbpedia.extraction.util.RichFile
10-
import org.dbpedia.extraction.util.FileLike
7+
import org.dbpedia.extraction.util.RichFile.wrapFile
8+
import org.dbpedia.extraction.util.RichReader.wrapReader
119
import java.util.zip.{GZIPInputStream, GZIPOutputStream}
1210
import org.apache.commons.compress.compressors.bzip2.{BZip2CompressorInputStream, BZip2CompressorOutputStream}
13-
11+
import org.dbpedia.extraction.util.FileLike
12+
import java.nio.charset.Charset
1413

1514

1615
/**
@@ -60,15 +59,34 @@ object LanguageSpecificLinksGenerator {
6059
def inputStream(file: FileLike[_]): InputStream =
6160
open(file, _.inputStream(), unzippers)
6261

63-
}
62+
/**
63+
* open output stream, wrap in zipper stream if file suffix indicates compressed file,
64+
* wrap in writer.
65+
*/
66+
def writer(file: FileLike[_], charset: Charset = Codec.UTF8): Writer =
67+
new OutputStreamWriter(outputStream(file), charset)
68+
69+
70+
def reader(file: FileLike[_], charset: Charset = Codec.UTF8): Reader =
71+
new InputStreamReader(inputStream(file), charset)
72+
73+
def readLines(file: FileLike[_])(proc: String => Unit): Unit = {
74+
val reader = this.reader(file)
75+
try {
76+
for (line <- reader) {
77+
proc(line)
78+
}
79+
}
80+
finally reader.close()
81+
}
6482

83+
}
6584

6685
/**
6786
* HashMap keeping track of the BufferedWriters of all opened files, so that they can be flushed and closed
6887
* when needed
6988
*/
7089
var filesWriters = new HashMap[String,BufferedWriter]()
71-
7290
/**
7391
* Helper function to split a line of an .nt file into subject, predicate, object and the full stop
7492
* @param arg triple line
@@ -77,7 +95,6 @@ object LanguageSpecificLinksGenerator {
7795
private def split(arg: String): Array[String] = {
7896
arg.split(" ").map(_.trim).filter(_.nonEmpty)
7997
}
80-
8198
/**
8299
* helper function to write line by line in a file
83100
* creates the file if it doesn't exist
@@ -89,9 +106,7 @@ object LanguageSpecificLinksGenerator {
89106
if(!filesWriters.contains(fileName))
90107
{
91108
val file = new File(fileName)
92-
val richFile = new RichFile(file)
93-
val outputStream = IOUtils.outputStream(richFile)
94-
val outputStreamWriter = new OutputStreamWriter(outputStream)
109+
val outputStreamWriter = IOUtils.writer(file)
95110
val bufferedWriter:BufferedWriter = new BufferedWriter(outputStreamWriter)
96111

97112
filesWriters += (fileName->bufferedWriter)
@@ -101,7 +116,6 @@ object LanguageSpecificLinksGenerator {
101116
writer.write(str)
102117
writer.newLine()
103118
}
104-
105119
/**
106120
* destructive function to flush and close all opened buffered writers
107121
*/
@@ -125,18 +139,16 @@ object LanguageSpecificLinksGenerator {
125139
*/
126140
if(option == "0")
127141
{
128-
val inFile = new File(args(1))
129-
val inRichFile = new RichFile(inFile)
130-
val in = IOUtils.inputStream(inRichFile)
131-
val lines = Source.fromInputStream(in,"UTF-8").getLines()
132-
142+
val outFileName = args(2)
133143

134144
//the language-link triples needed are those that contain a schema:about predicate and a subject that indicates a Wikipedia page
135145
val cond1 = "wikipedia.org/wiki"
136146
val cond2 = "<http://schema.org/about>"
137147

148+
val inFile = new File(args(1))
138149

139-
for(ln <- lines){
150+
//iterating over the dump file -- readLines accepts an argument of type File through the implicit conversion RichFile.wrapFile
151+
IOUtils.readLines(inFile){ln =>
140152
val triple = split(ln);
141153

142154
//check if the triple is in the correct .ttl format
@@ -147,7 +159,7 @@ object LanguageSpecificLinksGenerator {
147159
triple(0) = triple(0).replace(".wikipedia.org/wiki",".dbpedia.org/resource")
148160
val sub = UriDecoder.decode(triple(2))
149161
val obj = UriDecoder.decode(triple(0))
150-
logToFile("./languagelinks.ttl.gz",sub+" "+"<http://www.w3.org/2002/07/owl#sameAs>"+" "+obj+" .")
162+
logToFile(outFileName,sub+" "+"<http://www.w3.org/2002/07/owl#sameAs>"+" "+obj+" .")
151163
}
152164
}
153165
}
@@ -165,21 +177,24 @@ object LanguageSpecificLinksGenerator {
165177
if(option == "1")
166178
{
167179

168-
//opening master file for language links
169-
val inFile = new File(args(1))
170-
val inRichFile = new RichFile(inFile)
171-
val in = IOUtils.inputStream(inRichFile)
172-
val lines = Source.fromInputStream(in,"UTF-8").getLines()
173-
174-
175180
//creating folder for output files
176-
new File("./llinkfiles").mkdir()
181+
val baseDir = args(2)
182+
new File(baseDir).mkdir()
183+
//output file extension, e.g. .ttl, .ttl.gz or .ttl.bz2
184+
val outFileExtension = args(3)
185+
177186

178187
var Q = ""
179188
var oldQ = ""
180189
var triplesObjects = List[String]()
181190

182191
val langRegx = """<http:\/\/(.*).dbpedia.*>""".r
192+
193+
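//iterating over the triples of the LL master file -- IOUtils.inputStream accepts a File through the implicit conversion RichFile.wrapFile (NOTE(review): no charset is passed to Source.fromInputStream here, unlike IOUtils.reader which defaults to UTF-8 -- confirm)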
194+
val inFile = new File(args(1))
195+
val inStream= IOUtils.inputStream(inFile)
196+
val lines = Source.fromInputStream(inStream).getLines
197+
183198
for(ln <- lines){
184199

185200
val triple = split(ln);
@@ -198,15 +213,11 @@ object LanguageSpecificLinksGenerator {
198213
{
199214
//extracting language
200215
val langRegx(lang) = obj
201-
202216
//initializing file name
203-
val outFileName = "./llinkfiles/interlanguage_links_same_as_"+lang+".ttl"
204-
217+
val outFileName = baseDir+"/interlanguage_links_same_as_"+lang+outFileExtension
205218
//removing itself
206219
val innerTripleObjects = triplesObjects.diff(List(obj))
207-
208-
209-
//logtofile funciton includes creating files if not exist
220+
//logToFile creates the file if it does not exist
210221
for(obj2 <- innerTripleObjects)
211222
{
212223
logToFile(outFileName,obj +" <http://www.w3.org/2002/07/owl#sameAs> " +obj2+" .")
@@ -224,23 +235,6 @@ object LanguageSpecificLinksGenerator {
224235
closeWriters()
225236
}
226237

227-
228-
if(option =="test")
229-
{
230-
val fileName = new File(args(1))
231-
val file = new RichFile(fileName)
232-
val in = IOUtils.inputStream(file)
233-
val lines = Source.fromInputStream(in,"UTF-8").getLines()
234-
235-
for(ln <- lines)
236-
{
237-
println(ln)
238-
}
239-
240-
241-
}
242-
243-
244238
print("time taken: " + (System.nanoTime - startTime)/1000000000 +" secs\n" )
245239

246240
}

0 commit comments

Comments
 (0)