Skip to content

Commit fcbf365

Browse files
committed
Merge pull request dbpedia#94 from ninniuz/80_quote_scales_with_regex
dbpedia#80: Fix escaping of special chars in regex
2 parents d63aab2 + 9615635 commit fcbf365

File tree

2 files changed

+33
-30
lines changed

2 files changed

+33
-30
lines changed

core/src/main/scala/org/dbpedia/extraction/dataparser/ParserUtils.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import org.dbpedia.extraction.config.dataparser.ParserUtilsConfig
44
import java.text.{NumberFormat,DecimalFormatSymbols}
55
import org.dbpedia.extraction.util.Language
66
import java.util.Locale
7+
import java.util.regex.Pattern
78

89
/**
910
* Utility functions used by the data parsers.
@@ -30,7 +31,7 @@ class ParserUtils( context : { def language : Language } )
3031

3132
// TODO: use "\s+" instead of "\s?" between number and scale?
3233
// TODO: in some Asian languages, digits are grouped by ten-thousands (or other sizes) rather than by thousands
33-
private val regex = ("""(?i)([\D]*)([0-9]+(?:\""" + groupingSeparator + """[0-9]{3})*)(""" + decimalSeparatorsRegex + """[0-9]+)?\s?\[?\[?(""" + scales.keySet.mkString("|") + """)\]?\]?(.*)""").r
34+
// Case-insensitive pattern for a number followed by a scale word. Capture groups:
//   1: any leading non-digit text
//   2: integer digits, optionally followed by groups of three digits, each preceded by the grouping separator
//   3: optional decimal separator followed by fraction digits
//   4: the scale word, one of the keys of `scales`; each key goes through Pattern.quote
//      so that any regex metacharacters it contains are matched literally
//   5: the remaining text
// Between the number and the scale word the pattern allows at most one whitespace character
// and up to two '['; up to two ']' may follow the scale word.
private val regex = ("""(?i)([\D]*)([0-9]+(?:\""" + groupingSeparator + """[0-9]{3})*)(""" + decimalSeparatorsRegex + """[0-9]+)?\s?\[?\[?(""" + scales.keySet.map(Pattern.quote).mkString("|") + """)\]?\]?(.*)""").r
3435

3536
def parse(str: String): Number = {
3637
// space is sometimes used as grouping separator
Lines changed: 31 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,33 @@
11
package org.dbpedia.extraction.dataparser
22

3-
//import junit.framework.TestCase
4-
//import junit.framework.Assert._
5-
//import org.dbpedia.extraction.util.Language
6-
//
7-
//class ParserUtilsTest extends TestCase
8-
//{
9-
// def testConvertLargeNumbers() : Unit =
10-
// {
11-
// testConvert("en", "100.5 million", "100500000")
12-
// testConvert("de", "100,5 million", "100500000")
13-
//
14-
// testConvert("de", "1.234,5 mrd", "1234500000000")
15-
// // FIXME: this should fail, mrd is not English
16-
// testConvert("en", "1,234.5 mrd", "1234500000000")
17-
//
18-
// testConvert("en", "1,234.5 billion", "1234500000000")
19-
// // FIXME: this should work, billion is 10^12 in German
20-
// // testConvert("de", "100,5 billion", "100500000000000")
21-
//
22-
// testConvert("en", "1,234.5 trillion", "1234500000000000")
23-
// // FIXME: this should work, trillion is 10^18 in German
24-
// // testConvert("de", "1.234,5 trillion", "1234500000000000000000")
25-
// }
26-
//
27-
// private def testConvert( lang : String, value : String, expect : String ) : Unit =
28-
// {
29-
// assertEquals(expect, ParserUtils.convertLargeNumbers(value, Language(lang)))
30-
// }
31-
//}
3+
import junit.framework.TestCase
4+
import junit.framework.Assert._
5+
import org.dbpedia.extraction.util.Language
6+
7+
class ParserUtilsTest extends TestCase
8+
{
9+
def testConvertLargeNumbers() : Unit =
10+
{
11+
testConvert("en", "100.5 million", "100500000")
12+
testConvert("de", "100,5 million", "100500000")
13+
14+
// testConvert("de", "1.234,5 mrd", "1234500000000")
15+
// FIXME: this should fail: "mrd" is not an English scale word
16+
// testConvert("en", "1,234.5 mrd", "1234500000000")
17+
18+
// testConvert("en", "1,234.5 billion", "1234500000000")
19+
testConvert("de", "100,5 billion", "100500000000000")
20+
21+
// testConvert("en", "1,234.5 trillion", "1234500000000000")
22+
// FIXME: this should work: "trillion" is 10^18 in German
23+
// testConvert("de", "1.234,5 trillion", "1234500000000000000000")
24+
testConvert("nl", "123 milja", "123 milja")
25+
testConvert("nl", "123 milj.", "123000000000")
26+
}
27+
28+
private def testConvert( lang : String, value : String, expect : String ) : Unit =
29+
{
30+
val parser = new ParserUtils(new { def language = Language(lang) })
31+
assertEquals(expect, parser.convertLargeNumbers(value))
32+
}
33+
}

0 commit comments

Comments
 (0)