Skip to content

Commit 35de582

Browse files
committed
fix and imp
1 parent 3b5188f commit 35de582

File tree

3 files changed

+194
-131
lines changed

3 files changed

+194
-131
lines changed

core/src/main/scala/org/dbpedia/extraction/util/DataQualityMonitor.scala

Lines changed: 8 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -5,56 +5,30 @@ import java.util.concurrent.atomic.AtomicLong
55
import scala.collection.concurrent.TrieMap
66

77
/**
8-
* Centralized monitoring and logging system for extraction quality and errors.
9-
*
10-
* Features:
11-
* - Structured logging with context (extractor, page, error type)
12-
* - Metrics collection for error rates
13-
* - Thread-safe error counting
14-
* - Export capabilities for failed extractions
15-
*
16-
* Usage:
17-
* {{{
18-
* val monitor = DataQualityMonitor.forExtractor("HomepageExtractor")
19-
* monitor.logInvalidData("Einstein", "Invalid IRI: malformed URL", exception)
20-
* monitor.getMetrics() // Get error statistics
21-
* }}}
8+
* Monitors data quality issues during extraction.
9+
* Tracks errors per extractor and provides export capabilities.
2210
*/
2311
object DataQualityMonitor {
2412

2513
private val logger = Logger.getLogger(classOf[DataQualityMonitor].getName)
26-
27-
// Global metrics storage (thread-safe)
2814
private val errorCounts = new TrieMap[String, AtomicLong]()
2915
private val errorDetails = new TrieMap[String, collection.mutable.ListBuffer[ExtractionError]]()
3016

31-
/**
32-
* Create a monitor for a specific extractor
33-
*/
3417
/** Creates a new [[DataQualityMonitor]] instance bound to the given extractor name. */
def forExtractor(extractorName: String): DataQualityMonitor =
  new DataQualityMonitor(extractorName)
3720

38-
/**
39-
* Get global extraction statistics
40-
*/
4121
/**
 * Returns the current value of every error counter, across all extractors.
 *
 * @return an immutable map from counter key to its count at the time of the call
 */
def getGlobalMetrics(): Map[String, Long] =
  errorCounts.iterator
    .map { case (errorType, counter) => errorType -> counter.get() }
    .toMap
4424

45-
/**
46-
* Get detailed errors for analysis
47-
*/
4825
/**
 * Returns up to `limit` recorded errors for `errorType`, oldest first.
 *
 * The per-type buffers are plain `ListBuffer`s, which are not safe to read while another
 * thread appends. `recordError` appends while synchronized on `errorDetails`, so the copy
 * is taken under the same lock and the caller receives a consistent, immutable snapshot.
 *
 * @param errorType counter key to look up
 * @param limit     maximum number of errors to return (default 100)
 * @return the recorded errors, or an empty list when nothing is stored for `errorType`
 */
def getErrorDetails(errorType: String, limit: Int = 100): List[ExtractionError] =
  errorDetails.synchronized {
    errorDetails
      .get(errorType)
      .fold(List.empty[ExtractionError])(_.take(limit).toList)
  }
5431

55-
/**
56-
* Export errors to CSV format for analysis
57-
*/
5832
def exportToCsv(errorType: String, limit: Int = 1000): String = {
5933
val errors = getErrorDetails(errorType, limit)
6034
val header = "Extractor,PageTitle,ErrorMessage,Timestamp\n"
@@ -64,80 +38,46 @@ object DataQualityMonitor {
6438
header + rows
6539
}
6640

67-
/**
68-
* Reset all metrics (useful for testing)
69-
*/
7041
/** Clears all error counters and all stored error details. */
def reset(): Unit = {
  errorDetails.clear()
  errorCounts.clear()
}
7445
}
7546

76-
/**
77-
* Monitor instance for a specific extractor
78-
*/
7947
class DataQualityMonitor(val extractorName: String) {
8048

8149
private val logger = Logger.getLogger(s"org.dbpedia.extraction.monitor.$extractorName")
8250

83-
/**
84-
* Log invalid data with context
85-
*
86-
* @param pageTitle The Wikipedia page being processed
87-
* @param reason Description of why the data is invalid
88-
* @param exception Optional exception that caused the error
89-
* @param data Optional invalid data for debugging
90-
*/
9151
/**
 * Logs an invalid-data warning for a page and records the error in the shared metrics.
 *
 * @param pageTitle title of the page being processed
 * @param reason    description of why the data is invalid
 * @param exception optional cause; when present it is attached to the log record
 * @param data      optional offending data, handed to the message builder
 */
def logInvalidData(
  pageTitle: String,
  reason: String,
  exception: Option[Throwable] = None,
  data: Option[String] = None
): Unit = {
  val logLine = buildMessage(pageTitle, reason, data)

  // Same WARNING level either way; only the variant with a cause carries the throwable.
  exception.fold(logger.warning(logLine)) { cause =>
    logger.log(Level.WARNING, logLine, cause)
  }

  recordError(pageTitle, reason, exception)
}
10864

109-
/**
110-
* Log skipped extraction with reason
111-
*/
11265
/**
 * Logs, at FINE level, that extraction was skipped for a page.
 *
 * @param pageTitle title of the skipped page
 * @param reason    why the page was skipped
 */
def logSkipped(pageTitle: String, reason: String): Unit = {
  // The interpolated message is only built when FINE is actually enabled, so the
  // common case (FINE disabled) does no string work.
  if (logger.isLoggable(Level.FINE)) {
    logger.fine(s"[$extractorName] Skipped '$pageTitle': $reason")
  }
}
11568

116-
/**
117-
* Log successful extraction with statistics
118-
*/
11969
/**
 * Logs, at FINE level, how many triples were extracted from a page.
 *
 * @param pageTitle    title of the processed page
 * @param triplesCount number of triples extracted from it
 */
def logSuccess(pageTitle: String, triplesCount: Int): Unit = {
  // The interpolated message is only built when FINE is actually enabled, so the
  // common case (FINE disabled) does no string work.
  if (logger.isLoggable(Level.FINE)) {
    logger.fine(s"[$extractorName] Extracted $triplesCount triples from '$pageTitle'")
  }
}
12272

123-
/**
124-
* Get metrics for this extractor
125-
*/
12673
/**
 * Returns the error counters that belong to this extractor.
 *
 * @return an immutable map of every counter whose key starts with "extractorName:"
 */
def getMetrics(): Map[String, Long] = {
  val ownPrefix = s"$extractorName:"
  DataQualityMonitor.errorCounts.collect {
    case (errorType, counter) if errorType.startsWith(ownPrefix) => errorType -> counter.get()
  }.toMap
}
13279

133-
/**
134-
* Get total error count for this extractor
135-
*/
136-
def getTotalErrors(): Long = {
137-
getMetrics().values.sum
138-
}
139-
140-
// Private helper methods
80+
/** Returns the sum of all error counters reported by `getMetrics()` for this extractor. */
def getTotalErrors(): Long =
  getMetrics().valuesIterator.foldLeft(0L)(_ + _)
14181

14282
private def buildMessage(pageTitle: String, reason: String, data: Option[String]): String = {
14383
val dataStr = data.map(d => s" | Data: ${truncate(d, 200)}").getOrElse("")
@@ -147,12 +87,10 @@ class DataQualityMonitor(val extractorName: String) {
14787
private def recordError(pageTitle: String, reason: String, exception: Option[Throwable]): Unit = {
14888
val errorType = s"$extractorName:${categorizeError(reason, exception)}"
14989

150-
// Increment counter
15190
DataQualityMonitor.errorCounts
15291
.getOrElseUpdate(errorType, new AtomicLong(0))
15392
.incrementAndGet()
15493

155-
// Store details (limit to prevent memory issues)
15694
val errorDetail = ExtractionError(
15795
extractorName = extractorName,
15896
pageTitle = pageTitle,
@@ -161,14 +99,11 @@ class DataQualityMonitor(val extractorName: String) {
16199
timestamp = System.currentTimeMillis()
162100
)
163101

164-
DataQualityMonitor.errorDetails
165-
.getOrElseUpdate(errorType, collection.mutable.ListBuffer.empty)
166-
.synchronized {
167-
val buffer = DataQualityMonitor.errorDetails(errorType)
168-
if (buffer.size < 10000) { // Limit storage
169-
buffer += errorDetail
170-
}
171-
}
102+
DataQualityMonitor.errorDetails.synchronized {
103+
val buffer = DataQualityMonitor.errorDetails
104+
.getOrElseUpdate(errorType, collection.mutable.ListBuffer.empty)
105+
if (buffer.size < 10000) buffer += errorDetail
106+
}
172107
}
173108

174109
private def categorizeError(reason: String, exception: Option[Throwable]): String = {
@@ -187,9 +122,6 @@ class DataQualityMonitor(val extractorName: String) {
187122
}
188123
}
189124

190-
/**
191-
* Case class representing an extraction error
192-
*/
193125
case class ExtractionError(
194126
extractorName: String,
195127
pageTitle: String,

core/src/test/scala/org/dbpedia/extraction/util/DataQualityMonitorTest.scala

Lines changed: 25 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2,72 +2,63 @@ package org.dbpedia.extraction.util
22

33
import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}
44

5-
/**
6-
* Test suite for DataQualityMonitor
7-
*/
85
class DataQualityMonitorTest extends FlatSpec with Matchers with BeforeAndAfter {
96

107
before {
11-
// Reset metrics before each test
128
DataQualityMonitor.reset()
139
}
1410

15-
"DataQualityMonitor" should "create a monitor for an extractor" in {
11+
"DataQualityMonitor" should "create monitor" in {
1612
val monitor = DataQualityMonitor.forExtractor("TestExtractor")
1713
monitor should not be null
1814
monitor.extractorName should be("TestExtractor")
1915
}
2016

21-
it should "record invalid data errors" in {
17+
// A single logged error must show up in the per-extractor metrics and in the total.
it should "record errors" in {
  val monitor = DataQualityMonitor.forExtractor("TestExtractor")
  val cause = new IllegalArgumentException("Test exception")

  monitor.logInvalidData(
    pageTitle = "Albert_Einstein",
    reason = "Invalid IRI syntax",
    exception = Some(cause),
    data = Some("http://malformed url")
  )

  val metrics = monitor.getMetrics()
  metrics should not be empty
  monitor.getTotalErrors() should be(1)
}
3529

36-
it should "categorize errors correctly" in {
30+
// Three errors with different reasons: all are counted, under at least one category.
it should "categorize errors" in {
  val monitor = DataQualityMonitor.forExtractor("TestExtractor")

  val samples: Seq[(String, String, Option[Throwable])] = Seq(
    ("Page1", "Invalid data", Some(new IllegalArgumentException())),
    ("Page2", "Malformed URL", None),
    ("Page3", "Missing property", None)
  )
  samples.foreach { case (page, reason, cause) =>
    monitor.logInvalidData(page, reason, exception = cause)
  }

  val categories = monitor.getMetrics()
  categories.size should be >= 1
  monitor.getTotalErrors() should be(3)
}
4840

49-
it should "track multiple extractors independently" in {
50-
val monitor1 = DataQualityMonitor.forExtractor("ExtractorA")
51-
val monitor2 = DataQualityMonitor.forExtractor("ExtractorB")
41+
// Errors logged through one extractor's monitor must not leak into another's totals.
it should "track extractors independently" in {
  val monitorA = DataQualityMonitor.forExtractor("ExtractorA")
  val monitorB = DataQualityMonitor.forExtractor("ExtractorB")

  Seq("Page1" -> "Error in A", "Page2" -> "Another error in A").foreach {
    case (page, reason) => monitorA.logInvalidData(page, reason)
  }
  monitorB.logInvalidData("Page3", "Error in B")

  monitorA.getTotalErrors() should be(2)
  monitorB.getTotalErrors() should be(1)
}
6052

6153
// The global view aggregates the counters of every extractor.
it should "provide global metrics" in {
  val monitorA = DataQualityMonitor.forExtractor("ExtractorA")
  val monitorB = DataQualityMonitor.forExtractor("ExtractorB")

  monitorA.logInvalidData("Page1", "Error 1")
  Seq("Page2" -> "Error 2", "Page3" -> "Error 3").foreach {
    case (page, reason) => monitorB.logInvalidData(page, reason)
  }

  val grandTotal = DataQualityMonitor.getGlobalMetrics().values.sum
  grandTotal should be(3)
}
7263

7364
it should "retrieve error details" in {
@@ -81,60 +72,41 @@ class DataQualityMonitorTest extends FlatSpec with Matchers with BeforeAndAfter
8172
details.head.extractorName should be("TestExtractor")
8273
}
8374

84-
it should "export errors to CSV format" in {
75+
// The export must start with the full header row and contain the logged rows.
it should "export to CSV" in {
  val monitor = DataQualityMonitor.forExtractor("TestExtractor")

  monitor.logInvalidData("Einstein", "Invalid IRI")
  monitor.logInvalidData("Tesla", "Malformed URL")

  val csv = DataQualityMonitor.exportToCsv("TestExtractor:InvalidData", limit = 100)

  // Pin the complete header line, including the Timestamp column, so a dropped or
  // renamed column is caught; a partial-prefix check would let that through.
  csv should startWith("Extractor,PageTitle,ErrorMessage,Timestamp\n")
  // The header only contains "Extractor", so this match has to come from a data row.
  csv should include("TestExtractor")
  csv should include("Einstein")
}
9685

97-
it should "limit stored error details to prevent memory issues" in {
86+
// Counters keep counting past the storage cap, but stored details stop at 10000 per type.
it should "limit stored details" in {
  val monitor = DataQualityMonitor.forExtractor("TestExtractor")
  (1 to 11000).foreach(i => monitor.logInvalidData(s"Page$i", "Error"))

  monitor.getTotalErrors() should be(11000)

  // Take the error-type key from the recorded metrics instead of hard-coding a category:
  // with a hard-coded key that matches nothing, an upper-bound check on an empty list
  // would pass without testing the cap at all.
  // Every call uses the same reason and no exception, so one error type is expected.
  val metrics = monitor.getMetrics()
  metrics.size should be(1)
  val errorType = metrics.keys.head

  val details = DataQualityMonitor.getErrorDetails(errorType, limit = 20000)
  details.size should be(10000)
}
11394

11495
// Logging a skipped page must not be counted as an error.
it should "log skipped extractions" in {
  val monitor = DataQualityMonitor.forExtractor("TestExtractor")

  monitor.logSkipped(pageTitle = "TestPage", reason = "Not in main namespace")

  val errorsAfterSkip = monitor.getTotalErrors()
  errorsAfterSkip should be(0)
}
123100

124101
// Logging a successful extraction must not be counted as an error.
it should "log successful extractions" in {
  val monitor = DataQualityMonitor.forExtractor("TestExtractor")

  monitor.logSuccess(pageTitle = "Einstein", triplesCount = 5)

  val errorsAfterSuccess = monitor.getTotalErrors()
  errorsAfterSuccess should be(0)
}
133106

134-
it should "handle concurrent logging safely" in {
107+
it should "handle concurrent logging" in {
135108
val monitor = DataQualityMonitor.forExtractor("ConcurrentExtractor")
136109

137-
// Simulate concurrent logging from multiple threads
138110
val threads = (1 to 10).map { i =>
139111
new Thread {
140112
override def run(): Unit = {
@@ -147,12 +119,10 @@ class DataQualityMonitorTest extends FlatSpec with Matchers with BeforeAndAfter
147119

148120
threads.foreach(_.start())
149121
threads.foreach(_.join())
150-
151-
// Should have logged all errors without data corruption
152122
monitor.getTotalErrors() should be(1000)
153123
}
154124

155-
"Error details" should "include all required fields" in {
125+
"Error details" should "include required fields" in {
156126
val monitor = DataQualityMonitor.forExtractor("TestExtractor")
157127

158128
monitor.logInvalidData(

0 commit comments

Comments
 (0)