Skip to content

Commit 6f36efd

Browse files
committed
Replaced regex match with plain string matching for faster performance
1 parent 80dc928 commit 6f36efd

File tree

1 file changed

+14
-8
lines changed

1 file changed

+14
-8
lines changed

scripts/src/main/scala/org/dbpedia/extraction/scripts/LanguageSpecificLinksGenerator.scala

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import org.apache.commons.compress.compressors.bzip2.{BZip2CompressorInputStream
2121
object LanguageSpecificLinksGenerator {
2222

2323
//Todo: remove and include from org.dbpedia.extraction.util.IOUtils when merging Dump branch to main
24-
object IOUtils {
24+
object IOUtils {
2525

2626
/**
2727
* Map from file suffix (without "." dot) to output stream wrapper
@@ -89,7 +89,8 @@ object LanguageSpecificLinksGenerator {
8989
if(!filesWriters.contains(fileName))
9090
{
9191
val file = new File(fileName)
92-
val outputStream = new FileOutputStream(file)
92+
val richFile = new RichFile(file)
93+
val outputStream = IOUtils.outputStream(richFile)
9394
val outputStreamWriter = new OutputStreamWriter(outputStream)
9495
val bufferedWriter:BufferedWriter = new BufferedWriter(outputStreamWriter)
9596

@@ -124,24 +125,30 @@ object LanguageSpecificLinksGenerator {
124125
*/
125126
if(option == "0")
126127
{
128+
127129
val inFile = new File(args(1))
128-
val file = Source.fromFile(inFile)
130+
val inRichFile = new RichFile(inFile)
131+
val in = IOUtils.inputStream(inRichFile)
132+
val lines = Source.fromInputStream(in,"UTF-8").getLines()
133+
129134

130135
//the language-link triples needed are those that contain the schema:about predicate and whose subject is a Wikipedia page URL
131-
val regx = """.*\.wikipedia.org\/wiki.*<http:\/\/schema\.org\/about>""".r
136+
val cond1 = "wikipedia.org/wiki"
137+
val cond2 = "<http://schema.org/about>"
132138

133139

134-
for(ln <- file.getLines){
140+
for(ln <- lines){
135141
val triple = split(ln);
136142

137143
//check if the triple is in the correct .ttl format
138144
if(triple.length ==4){
139145

140-
if(regx.findFirstIn(ln) != None ){
146+
if(ln.contains(cond1)&&ln.contains(cond2))
147+
{
141148
triple(0) = triple(0).replace(".wikipedia.org/wiki",".dbpedia.org/resource")
142149
val sub = UriDecoder.decode(triple(2))
143150
val obj = UriDecoder.decode(triple(0))
144-
logToFile("./languagelinks.ttl",sub+" "+"<http://www.w3.org/2002/07/owl#sameAs>"+" "+obj+" .")
151+
logToFile("./languagelinks.ttl.gz",sub+" "+"<http://www.w3.org/2002/07/owl#sameAs>"+" "+obj+" .")
145152
}
146153

147154
}
@@ -222,7 +229,6 @@ object LanguageSpecificLinksGenerator {
222229
if(option =="test")
223230
{
224231
val fileName = new File(args(1))
225-
226232
val file = new RichFile(fileName)
227233
val in = IOUtils.inputStream(file)
228234
val lines = Source.fromInputStream(in,"UTF-8").getLines()

0 commit comments

Comments
 (0)