-
Notifications
You must be signed in to change notification settings - Fork 291
Add Data Quality Monitoring for Extractors #808
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,4 +6,4 @@ target/ | |
| *.lck | ||
| *.tmp | ||
| java_pid* | ||
| dump/test-basedir | ||
| dump/test-basedir | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,35 @@ | ||
| <?xml version="1.0" encoding="UTF-8"?> | ||
| <projectDescription> | ||
| <name>core</name> | ||
| <buildSpec> | ||
| <buildCommand> | ||
| <name>org.scala-ide.sdt.core.scalabuilder</name> | ||
| </buildCommand> | ||
| </buildSpec> | ||
| <natures> | ||
| <nature>org.scala-ide.sdt.core.scalanature</nature> | ||
| <nature>org.eclipse.jdt.core.javanature</nature> | ||
| </natures> | ||
| </projectDescription> | ||
| <name>core</name> | ||
| <comment></comment> | ||
| <projects> | ||
| </projects> | ||
| <buildSpec> | ||
| <buildCommand> | ||
| <name>org.scala-ide.sdt.core.scalabuilder</name> | ||
| <arguments> | ||
| </arguments> | ||
| </buildCommand> | ||
| <buildCommand> | ||
| <name>org.eclipse.m2e.core.maven2Builder</name> | ||
| <arguments> | ||
| </arguments> | ||
| </buildCommand> | ||
| </buildSpec> | ||
| <natures> | ||
| <nature>org.eclipse.m2e.core.maven2Nature</nature> | ||
| <nature>org.scala-ide.sdt.core.scalanature</nature> | ||
| <nature>org.eclipse.jdt.core.javanature</nature> | ||
| </natures> | ||
| <filteredResources> | ||
| <filter> | ||
| <id>1765562241624</id> | ||
| <name></name> | ||
| <type>30</type> | ||
| <matcher> | ||
| <id>org.eclipse.core.resources.regexFilterMatcher</id> | ||
| <arguments>node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__</arguments> | ||
| </matcher> | ||
| </filter> | ||
| </filteredResources> | ||
| </projectDescription> |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,131 @@ | ||
| package org.dbpedia.extraction.util | ||
|
|
||
| import java.util.logging.{Level, Logger} | ||
| import java.util.concurrent.atomic.AtomicLong | ||
| import scala.collection.concurrent.TrieMap | ||
|
|
||
| /** | ||
| * Monitors data quality issues during extraction. | ||
| * Tracks errors per extractor and provides export capabilities. | ||
| */ | ||
| object DataQualityMonitor { | ||
|
|
||
| private val logger = Logger.getLogger(classOf[DataQualityMonitor].getName) | ||
| private val errorCounts = new TrieMap[String, AtomicLong]() | ||
| private val errorDetails = new TrieMap[String, collection.mutable.ListBuffer[ExtractionError]]() | ||
|
|
||
| def forExtractor(extractorName: String): DataQualityMonitor = { | ||
| new DataQualityMonitor(extractorName) | ||
| } | ||
|
|
||
| def getGlobalMetrics(): Map[String, Long] = { | ||
| errorCounts.map { case (key, counter) => (key, counter.get()) }.toMap | ||
| } | ||
|
|
||
| def getErrorDetails(errorType: String, limit: Int = 100): List[ExtractionError] = { | ||
| errorDetails.get(errorType) match { | ||
| case Some(errors) => errors.take(limit).toList | ||
| case None => List.empty | ||
| } | ||
| } | ||
|
|
||
| def exportToCsv(errorType: String, limit: Int = 1000): String = { | ||
| val errors = getErrorDetails(errorType, limit) | ||
| val header = "Extractor,PageTitle,ErrorMessage,Timestamp\n" | ||
| val rows = errors.map(e => | ||
| s"${e.extractorName},${e.pageTitle},${e.message.replaceAll(",", ";")},${e.timestamp}" | ||
| ).mkString("\n") | ||
| header + rows | ||
| } | ||
|
Comment on lines
+32
to
+39
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. CSV export is vulnerable to malformed output. Only Consider proper RFC 4180 CSV escaping: def exportToCsv(errorType: String, limit: Int = 1000): String = {
val errors = getErrorDetails(errorType, limit)
val header = "Extractor,PageTitle,ErrorMessage,Timestamp\n"
+ def escapeCsvField(field: String): String = {
+ if (field.contains(",") || field.contains("\"") || field.contains("\n"))
+ "\"" + field.replace("\"", "\"\"") + "\""
+ else field
+ }
val rows = errors.map(e =>
- s"${e.extractorName},${e.pageTitle},${e.message.replaceAll(",", ";")},${e.timestamp}"
+ s"${escapeCsvField(e.extractorName)},${escapeCsvField(e.pageTitle)},${escapeCsvField(e.message)},${e.timestamp}"
).mkString("\n")
header + rows
}🤖 Prompt for AI Agents |
||
|
|
||
| def reset(): Unit = { | ||
| errorCounts.clear() | ||
| errorDetails.clear() | ||
| } | ||
| } | ||
|
|
||
| class DataQualityMonitor(val extractorName: String) { | ||
|
|
||
| private val logger = Logger.getLogger(s"org.dbpedia.extraction.monitor.$extractorName") | ||
|
|
||
| def logInvalidData( | ||
| pageTitle: String, | ||
| reason: String, | ||
| exception: Option[Throwable] = None, | ||
| data: Option[String] = None | ||
| ): Unit = { | ||
| val message = buildMessage(pageTitle, reason, data) | ||
| exception match { | ||
| case Some(ex) => logger.log(Level.WARNING, message, ex) | ||
| case None => logger.warning(message) | ||
| } | ||
| recordError(pageTitle, reason, exception) | ||
| } | ||
|
|
||
| def logSkipped(pageTitle: String, reason: String): Unit = { | ||
| logger.fine(s"[$extractorName] Skipped '$pageTitle': $reason") | ||
| } | ||
|
|
||
| def logSuccess(pageTitle: String, triplesCount: Int): Unit = { | ||
| logger.fine(s"[$extractorName] Extracted $triplesCount triples from '$pageTitle'") | ||
| } | ||
|
|
||
| def getMetrics(): Map[String, Long] = { | ||
| DataQualityMonitor.errorCounts | ||
| .filter { case (key, _) => key.startsWith(s"$extractorName:") } | ||
| .map { case (key, counter) => (key, counter.get()) } | ||
| .toMap | ||
| } | ||
|
|
||
| def getTotalErrors(): Long = getMetrics().values.sum | ||
|
|
||
| private def buildMessage(pageTitle: String, reason: String, data: Option[String]): String = { | ||
| val dataStr = data.map(d => s" | Data: ${truncate(d, 200)}").getOrElse("") | ||
| s"[$extractorName] Invalid data in '$pageTitle': $reason$dataStr" | ||
| } | ||
|
|
||
| private def recordError(pageTitle: String, reason: String, exception: Option[Throwable]): Unit = { | ||
| val errorType = s"$extractorName:${categorizeError(reason, exception)}" | ||
|
|
||
| DataQualityMonitor.errorCounts | ||
| .getOrElseUpdate(errorType, new AtomicLong(0)) | ||
| .incrementAndGet() | ||
|
|
||
| val errorDetail = ExtractionError( | ||
| extractorName = extractorName, | ||
| pageTitle = pageTitle, | ||
| message = reason, | ||
| exceptionType = exception.map(_.getClass.getSimpleName), | ||
| timestamp = System.currentTimeMillis() | ||
| ) | ||
|
|
||
| DataQualityMonitor.errorDetails.synchronized { | ||
| val buffer = DataQualityMonitor.errorDetails | ||
| .getOrElseUpdate(errorType, collection.mutable.ListBuffer.empty) | ||
| if (buffer.size < 10000) buffer += errorDetail | ||
| } | ||
| } | ||
|
|
||
| private def categorizeError(reason: String, exception: Option[Throwable]): String = { | ||
| exception match { | ||
| case Some(ex) => ex.getClass.getSimpleName | ||
| case None if reason.toLowerCase.contains("invalid") => "InvalidData" | ||
| case None if reason.toLowerCase.contains("malformed") => "MalformedData" | ||
| case None if reason.toLowerCase.contains("missing") => "MissingData" | ||
| case None => "Other" | ||
| } | ||
| } | ||
|
|
||
| private def truncate(str: String, maxLength: Int): String = { | ||
| if (str.length <= maxLength) str | ||
| else str.substring(0, maxLength) + "..." | ||
| } | ||
| } | ||
|
|
||
| case class ExtractionError( | ||
| extractorName: String, | ||
| pageTitle: String, | ||
| message: String, | ||
| exceptionType: Option[String], | ||
| timestamp: Long | ||
| ) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Potential thread-safety issue when reading error details.
errors.take(limit)operates on a mutableListBufferthat may be concurrently modified byrecordError. Consider synchronizing the read or returning a snapshot.def getErrorDetails(errorType: String, limit: Int = 100): List[ExtractionError] = { errorDetails.get(errorType) match { - case Some(errors) => errors.take(limit).toList + case Some(errors) => errors.synchronized { errors.take(limit).toList } case None => List.empty } }📝 Committable suggestion
🤖 Prompt for AI Agents