@@ -2,15 +2,14 @@ package org.dbpedia.extraction.scripts
22
33import java .io ._
44import scala .collection .immutable .HashMap
5- import scala .io .Source
65import scala .io ._
7- import sys .process ._
86import org .dbpedia .util .text .uri ._
9- import org .dbpedia .extraction .util .RichFile
10- import org .dbpedia .extraction .util .FileLike
7+ import org .dbpedia .extraction .util .RichFile . wrapFile
8+ import org .dbpedia .extraction .util .RichReader . wrapReader
119import java .util .zip .{GZIPInputStream , GZIPOutputStream }
1210import org .apache .commons .compress .compressors .bzip2 .{BZip2CompressorInputStream , BZip2CompressorOutputStream }
13-
11+ import org .dbpedia .extraction .util .FileLike
12+ import java .nio .charset .Charset
1413
1514
1615/**
@@ -60,15 +59,34 @@ object LanguageSpecificLinksGenerator {
6059 def inputStream (file : FileLike [_]): InputStream =
6160 open(file, _.inputStream(), unzippers)
6261
63- }
/**
 * Opens an output stream for the given file (wrapped in a zipper stream when the
 * file suffix indicates a compressed file) and wraps it in a writer that encodes
 * text with the given charset.
 *
 * @param file    file to write to
 * @param charset charset used to encode the written text, UTF-8 by default
 * @return a writer on top of the file's output stream
 */
def writer(file: FileLike[_], charset: Charset = Codec.UTF8): Writer = {
  val out = outputStream(file)
  new OutputStreamWriter(out, charset)
}
68+
69+
/**
 * Opens an input stream for the given file through `inputStream` and wraps it in
 * a reader that decodes text with the given charset.
 *
 * @param file    file to read from
 * @param charset charset used to decode the file's bytes, UTF-8 by default
 * @return a reader on top of the file's input stream
 */
def reader(file: FileLike[_], charset: Charset = Codec.UTF8): Reader = {
  val in = inputStream(file)
  new InputStreamReader(in, charset)
}
72+
/**
 * Passes the content of the given file to `proc`, one element at a time, and
 * closes the underlying reader afterwards, even when `proc` throws.
 *
 * Iteration over the reader is provided by the imported RichReader.wrapReader
 * conversion (presumably line by line, as the method name suggests).
 *
 * @param file file to read, opened with the default charset of `reader`
 * @param proc callback invoked for every element produced by the reader
 */
def readLines(file: FileLike[_])(proc: String => Unit): Unit = {
  val lineSource = this.reader(file)
  try lineSource.foreach(line => proc(line))
  finally lineSource.close()
}
6482
83+ }
6584
/**
 * Registry of the BufferedWriter opened for each output file name, kept so that
 * every writer can be flushed and closed when needed.
 */
var filesWriters = HashMap.empty[String, BufferedWriter]
71-
7290 /**
7391 * Helper function to split .nt file lines and extract subject , pred , object and the fullstop
7492 * @param arg triple line
@@ -77,7 +95,6 @@ object LanguageSpecificLinksGenerator {
private def split(arg: String): Array[String] = {
  // cut on single spaces, then drop the empty pieces left behind by consecutive spaces
  val rawTokens = arg.split(" ")
  val trimmedTokens = rawTokens.map(_.trim)
  trimmedTokens.filterNot(_.isEmpty)
}
80-
8198 /**
8299 * helper function to write line by line in a file
83100 * create file if doesn't exist
@@ -89,9 +106,7 @@ object LanguageSpecificLinksGenerator {
89106 if (! filesWriters.contains(fileName))
90107 {
91108 val file = new File (fileName)
92- val richFile = new RichFile (file)
93- val outputStream = IOUtils .outputStream(richFile)
94- val outputStreamWriter = new OutputStreamWriter (outputStream)
109+ val outputStreamWriter = IOUtils .writer(file)
95110 val bufferedWriter : BufferedWriter = new BufferedWriter (outputStreamWriter)
96111
97112 filesWriters += (fileName-> bufferedWriter)
@@ -101,7 +116,6 @@ object LanguageSpecificLinksGenerator {
101116 writer.write(str)
102117 writer.newLine()
103118 }
104-
105119 /**
106120 * destructive function to flush and close all opened buffered writers
107121 */
@@ -125,18 +139,16 @@ object LanguageSpecificLinksGenerator {
125139 */
126140 if (option == " 0" )
127141 {
128- val inFile = new File (args(1 ))
129- val inRichFile = new RichFile (inFile)
130- val in = IOUtils .inputStream(inRichFile)
131- val lines = Source .fromInputStream(in," UTF-8" ).getLines()
132-
142+ val outFileName = args(2 )
133143
134144 // languagelinks triples needed are those contain schema:about predicates and wikipediapages subjects which indicated wikipedia page
135145 val cond1 = " wikipedia.org/wiki"
136146 val cond2 = " <http://schema.org/about>"
137147
148+ val inFile = new File (args(1 ))
138149
139- for (ln <- lines){
150+ // iterating over dump files -- readlines accept arg of type File implicitly through RichFile.wrapFile
151+ IOUtils .readLines(inFile){ln =>
140152 val triple = split(ln);
141153
142154 // check if the triple is in the correct .ttl format
@@ -147,7 +159,7 @@ object LanguageSpecificLinksGenerator {
147159 triple(0 ) = triple(0 ).replace(" .wikipedia.org/wiki" ," .dbpedia.org/resource" )
148160 val sub = UriDecoder .decode(triple(2 ))
149161 val obj = UriDecoder .decode(triple(0 ))
150- logToFile(" ./languagelinks.ttl.gz " ,sub+ " " + " <http://www.w3.org/2002/07/owl#sameAs>" + " " + obj+ " ." )
162+ logToFile(outFileName ,sub+ " " + " <http://www.w3.org/2002/07/owl#sameAs>" + " " + obj+ " ." )
151163 }
152164 }
153165 }
@@ -165,21 +177,24 @@ object LanguageSpecificLinksGenerator {
165177 if (option == " 1" )
166178 {
167179
168- // opening master file for language links
169- val inFile = new File (args(1 ))
170- val inRichFile = new RichFile (inFile)
171- val in = IOUtils .inputStream(inRichFile)
172- val lines = Source .fromInputStream(in," UTF-8" ).getLines()
173-
174-
175180 // creating folder for output files
176- new File (" ./llinkfiles" ).mkdir()
181+ val baseDir = args(2 )
182+ new File (baseDir).mkdir()
183+ // file extension .ttl .ttl.gz .ttl.bz2
184+ val outFileExtension = args(3 )
185+
177186
178187 var Q = " "
179188 var oldQ = " "
180189 var triplesObjects = List [String ]()
181190
182191 val langRegx = """ <http:\/\/(.*).dbpedia.*>""" .r
192+
// iterating over LLmasterfile triples -- IOUtils.inputStream accepts a File implicitly through RichFile.wrapFile
val inFile = new File(args(1))
val inStream = IOUtils.inputStream(inFile)
// decode explicitly as UTF-8 (the default of IOUtils.reader) instead of relying on the
// platform default codec, which would corrupt non-ASCII characters on non-UTF-8 systems
val lines = Source.fromInputStream(inStream, "UTF-8").getLines()
183198 for (ln <- lines){
184199
185200 val triple = split(ln);
@@ -198,15 +213,11 @@ object LanguageSpecificLinksGenerator {
198213 {
199214 // extracting language
200215 val langRegx(lang) = obj
201-
202216 // initializing file name
203- val outFileName = " ./llinkfiles/interlanguage_links_same_as_" + lang+ " .ttl"
204-
217+ val outFileName = baseDir+ " /interlanguage_links_same_as_" + lang+ outFileExtension
205218 // removing itself
206219 val innerTripleObjects = triplesObjects.diff(List (obj))
207-
208-
209- // logtofile funciton includes creating files if not exist
220+ // logtofile function includes creating files if not exist
210221 for (obj2 <- innerTripleObjects)
211222 {
212223 logToFile(outFileName,obj + " <http://www.w3.org/2002/07/owl#sameAs> " + obj2+ " ." )
@@ -224,23 +235,6 @@ object LanguageSpecificLinksGenerator {
224235 closeWriters()
225236 }
226237
227-
228- if (option == " test" )
229- {
230- val fileName = new File (args(1 ))
231- val file = new RichFile (fileName)
232- val in = IOUtils .inputStream(file)
233- val lines = Source .fromInputStream(in," UTF-8" ).getLines()
234-
235- for (ln <- lines)
236- {
237- println(ln)
238- }
239-
240-
241- }
242-
243-
244238 print(" time taken: " + (System .nanoTime - startTime)/ 1000000000 + " secs\n " )
245239
246240 }
0 commit comments