Merged
28 commits
c9dfb7d
log better messages
jimkont Dec 6, 2013
acaee86
change Live cache SQL schema
jimkont Dec 3, 2013
1cd6cad
avoid Virtuoso transaction deadlocks on CHECKPOINT
jimkont Dec 6, 2013
9d095db
move feeder configuration to property file
jimkont Dec 10, 2013
0392cd6
Fix according to review: getMessage() may return null, use toString()…
jimkont Dec 11, 2013
aaa04dd
Fix HomepageExtractor issues
ninniuz Nov 28, 2013
0089903
Merge pull request #126 from ninniuz/homepage_extractor_fixes
jimkont Dec 17, 2013
5baeeed
Issue #144: Deduplicate quads in the extraction server
ninniuz Dec 18, 2013
b1f30b4
Merge pull request #146 from ninniuz/144_duplicate_quads
jimkont Dec 18, 2013
4a5dd9e
bugfix provided by @ninniuz
jimkont Dec 22, 2013
b900b09
manually merge pull request #95 / fixes issue #23
jimkont Dec 23, 2013
a5ce415
bugfix provided by @ninniuz
jimkont Dec 22, 2013
56f3b5e
Merge pull request #142 from jimkont/live_features
ninniuz Jan 7, 2014
3bd5124
Issue #149: Read stats generation property from JVM system property
ninniuz Jan 13, 2014
42012e8
Merge pull request #150 from ninniuz/149_stats_parameter
jimkont Jan 13, 2014
e12fc26
Missing Italian string for HomepageExtractor.
mammadori Jan 15, 2014
fffbe4c
Issue #151: URL encode wikititle in sample page extraction
ninniuz Jan 17, 2014
9d96186
import scala.language.reflectiveCalls
jimkont Jan 17, 2014
e9976d4
adapt for the new parser output
jimkont Jan 17, 2014
ec63949
HomepageExtractor config
ninniuz Jan 17, 2014
f5eb297
remove obsolete classes
jimkont Jan 17, 2014
67bdd2d
Merge pull request #153 from ninniuz/homepage_config_missing_lang
jimkont Jan 20, 2014
9330afd
Merge pull request #152 from ninniuz/151_server_extraction_ampersand
jimkont Jan 20, 2014
909a741
throw exception on parsing error
jimkont Jan 20, 2014
edab2b6
merge with latest master
jimkont Jan 20, 2014
3c73aa8
skipped HomepageExtractor test on merge
jimkont Jan 20, 2014
d5b578f
comment unneeded constants (todo: remove all & reuse Namespace class)
jimkont Jan 20, 2014
6209a69
Adapt Live core to work with the refactored framework
jimkont Jan 20, 2014
12 changes: 6 additions & 6 deletions core/pom.xml
@@ -105,15 +105,15 @@
    <repositories>

        <repository>
-         <id>osr-public-releases</id>
-         <name>OSR Public Releases</name>
-         <url>http://mojo.informatik.uni-erlangen.de/nexus/content/repositories/public-releases</url>
+           <id>osr-public-releases</id>
+           <name>OSR Public Releases</name>
+           <url>http://mojo.informatik.uni-erlangen.de/nexus/content/repositories/public-releases</url>
        </repository>

        <repository>
-         <id>osr-public-snapshots</id>
-         <name>OSR Public snapshots</name>
-         <url>http://mojo.informatik.uni-erlangen.de/nexus/content/repositories/public-snapshots</url>
+           <id>osr-public-snapshots</id>
+           <name>OSR Public snapshots</name>
+           <url>http://mojo.informatik.uni-erlangen.de/nexus/content/repositories/public-snapshots</url>
        </repository>

    </repositories>
@@ -8,7 +8,7 @@ object HomepageExtractorConfig
    // For the "ar" configuration, rendering right-to-left may seem like a bug, but it's not.
    // Don't change this unless you know how it is done.

-   val propertyNamesMap = Map(
+   private val propertyNamesMap = Map(
        "ar" -> Set("الموقع", "الصفحة الرسمية", "موقع", "الصفحة الرئيسية", "صفحة ويب", "موقع ويب"),
        "ca" -> Set("pàgina", "web", "lloc"),
        "de" -> Set("website", "homepage", "webpräsenz", "web", "site", "siteweb", "site web"),/*cleanup*/
@@ -26,9 +26,13 @@ object HomepageExtractorConfig
        "ru" -> Set("сайт")
    )

+   def propertyNames(lang : String) : Set[String] = {
+       propertyNamesMap.getOrElse(lang, Set())
+   }
+
    val supportedLanguages = propertyNamesMap.keySet

-   val externalLinkSectionsMap = Map(
+   private val externalLinkSectionsMap = Map(
        "ar" -> "وصلات خارجية",
        "ca" -> "(?:Enllaços externs|Enllaço extern)",
        "de" -> "Weblinks?",
@@ -46,7 +50,11 @@
        "ru" -> "Ссылки"
    )

-   val officialMap = Map(
+   def externalLinkSections(lang : String) : String = {
+       externalLinkSectionsMap.getOrElse(lang, "")
+   }
+
+   private val officialMap = Map(
        "ar" -> "رسمي",
        "ca" -> "oficial",
        "de" -> "offizielle",
@@ -64,4 +72,26 @@
        "ru" -> "официальный"
    )

+   def official(lang : String) : String = {
+       officialMap.getOrElse(lang, "")
+   }
+
+   // Map(language -> Map(templateName -> templatePropertyKey))
+   private val templateOfficialWebsiteMap = Map(
+       "ca" -> Map("Oficial" -> "1"),
+       /* "it" -> Map("Sito Ufficiale" -> "1"), This does not exist, yet */
+       "el" -> Map("Επίσημη ιστοσελίδα" -> "1"),
+       "en" -> Map("Official website" -> "1"),
+       "eo" -> Map("Oficiala_retejo" -> "1"),
+       "es" -> Map("Página_web" -> "1"),
+       "fr" -> Map("Site_officiel" -> "url"),
+       "ga" -> Map("Páxina_web" -> "1"),
+       "pt" -> Map("Oficial" -> "1"),
+       "ru" -> Map("Официальный сайт" -> "1")
+   )
+
+   def templateOfficialWebsite(lang : String) : Map[String, String] = {
+       templateOfficialWebsiteMap.getOrElse(lang, Map())
+   }
+
}
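The new accessors wrap each map in getOrElse, so a lookup for a language without a configured entry now yields an empty default instead of throwing a NoSuchElementException. A minimal sketch of the resulting behavior (the language codes are only illustrative):

    // Configured language: returns the configured values.
    HomepageExtractorConfig.propertyNames("de")
    // Set("website", "homepage", "webpräsenz", "web", "site", "siteweb", "site web")

    // Unconfigured language: empty defaults, no exception.
    HomepageExtractorConfig.propertyNames("xx")           // Set()
    HomepageExtractorConfig.externalLinkSections("xx")    // ""
    HomepageExtractorConfig.templateOfficialWebsite("xx") // Map()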
@@ -27,9 +27,13 @@ object InfoboxExtractorConfig

    // When you generate statistics, set the following to true. To get full coverage, you should
    // probably set most other parameters here to zero or empty values.
-   val extractTemplateStatistics = false
+   val extractTemplateStatistics =
+       try {
+           System.getProperty("extract.template.stats", "false").toBoolean
+       } catch {
+           case ex : Exception => false
+       }

    val minPropertyCount = 2

    val minRatioOfExplicitPropertyKeys = 0.75
}
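Since the flag is now read from a JVM system property (issue #149), statistics generation can be toggled at launch with -Dextract.template.stats=true instead of editing the source. The same read-with-fallback pattern, shown standalone (the helper name is illustrative, not part of this PR):

    // Read a boolean JVM system property; absence or an unparsable
    // value (e.g. "yes") falls back to the default.
    def boolProperty(name: String, default: Boolean = false): Boolean =
        try System.getProperty(name, default.toString).toBoolean
        catch { case _: Exception => default }

    val extractStats = boolProperty("extract.template.stats")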
@@ -22,11 +22,13 @@ extends PageNodeExtractor
{
    private val language = context.language.wikiCode

-   private val propertyNames = HomepageExtractorConfig.propertyNamesMap(language)
+   private val propertyNames = HomepageExtractorConfig.propertyNames(language)

-   private val official = HomepageExtractorConfig.officialMap(language)
+   private val official = HomepageExtractorConfig.official(language)

-   private val externalLinkSections = HomepageExtractorConfig.externalLinkSectionsMap(language)
+   private val externalLinkSections = HomepageExtractorConfig.externalLinkSections(language)
+
+   private val templateOfficialWebsite = HomepageExtractorConfig.templateOfficialWebsite(language)

    private val homepageProperty = context.ontology.properties("foaf:homepage")

@@ -35,37 +35,49 @@ extends PageNodeExtractor
    private val officialAndLineEndRegex = ("""(?msiu)[^$]*\b""" + official + """\b.*$.*""").r
    private val officialAndNoLineEndRegex = ("""(?msiu)[^$]*\b""" + official + """\b[^$]*""").r
    private val lineEndRegex = "(?ms).*$.+".r
+   // Similar to org.dbpedia.extraction.config.dataparser.DataParserConfig.splitPropertyNodeRegexLink - without '/' and ';'
+   private val splitPropertyNodeLinkStrict = """<br\s*\/?>|\n| and | or |,| """

    override val datasets = Set(DBpediaDatasets.Homepages)

    override def extract(page: PageNode, subjectUri: String, pageContext: PageContext): Seq[Quad] =
    {
        if(page.title.namespace != Namespace.Main) return Seq.empty

-       val list = collectProperties(page).filter(p => propertyNames.contains(p.key.toLowerCase))
-       list.foreach((property) => {
-           property.children match
-           {
-               case (textNode @ TextNode(text, _)) :: _ =>
-               {
-                   val url = if (!text.startsWith("http")) "http://" + text else text
-                   val graph = generateStatement(subjectUri, pageContext, url, textNode)
-                   if (!graph.isEmpty)
-                   {
-                       return graph
-                   }
-               }
-               case (linkNode @ ExternalLinkNode(destination, _, _, _)) :: _ =>
-               {
-                   val graph = generateStatement(subjectUri, pageContext, destination.toString, linkNode)
-                   if (!graph.isEmpty)
-                   {
-                       return graph
-                   }
-               }
-               case _ =>
-           }
-       })
+       val list = collectProperties(page).filter(p => propertyNames.contains(p.key.toLowerCase)).flatMap {
+           NodeUtil.splitPropertyNode(_, splitPropertyNodeLinkStrict, true)
+       }
+
+       list.foreach((property) =>
+
+           // Find among children
+           for (child <- property.children) {
+               child match
+               {
+                   case (textNode @ TextNode(text, _)) =>
+                   {
+                       val cleaned = cleanProperty(text)
+                       if (cleaned.nonEmpty) { // do not proceed if the property value is not a valid candidate
+                           val url = if (!cleaned.startsWith("http")) "http://" + cleaned else cleaned
+                           val graph = generateStatement(subjectUri, pageContext, url, textNode)
+                           if (!graph.isEmpty)
+                           {
+                               return graph
+                           }
+                       }
+                   }
+                   case (linkNode @ ExternalLinkNode(destination, _, _, _)) =>
+                   {
+                       val graph = generateStatement(subjectUri, pageContext, destination.toString, linkNode)
+                       if (!graph.isEmpty)
+                       {
+                           return graph
+                       }
+                   }
+                   case _ =>
+               }
+           }
+       )

        for(externalLinkSectionChildren <- collectExternalLinkSection(page.children))
        {
@@ -84,6 +98,18 @@ extends PageNodeExtractor
        Seq.empty
    }

+   private def cleanProperty(text: String) : String = {
+
+       val candidateUrl = text.stripLineEnd.trim // remove trailing new line
+
+       // While it is perfectly legal to have hostnames without dots in URLs,
+       // it is very unlikely that such URLs will be present in Wikipedia.
+       // Most of the time such values represent text inserted by editors
+       // to convey "missing homepage" info, such as None, N/A, missing, down etc.
+       if (candidateUrl.matches(""".*\w\.\w.*""")) candidateUrl
+       else ""
+   }
+
    private def generateStatement(subjectUri: String, pageContext: PageContext, url: String, node: Node): Seq[Quad] =
    {
        try
@@ -101,20 +127,50 @@
        Seq.empty
    }

+   private def extractUrlFromProperty(node: PropertyNode): Option[String] = {
+
+       /*
+       It could be:
+       1) {{template | key = example.com }}
+       2) {{template | key = http://example.com }}
+
+       In 1) => PropertyNode("key", List(TextNode("example.com", _)))
+       In 2) => PropertyNode("key", List(ExternalLinkNode(URI("http://example.com"), ...)))
+       */
+       val url = node.children.collect {
+           case TextNode(t, _) => t
+           case ExternalLinkNode(destination, _, _, _) => destination.toString
+       }.mkString.trim
+
+       if (url.isEmpty) {
+           None
+       } else {
+           try {
+               val uri = new URI(url)
+               if (uri.getScheme == null) Some("http://" + uri.toString)
+               else Some(uri.toString)
+           } catch {
+               case _ : Exception => None
+           }
+       }
+   }
+
    private def findLinkTemplateInSection(nodes: List[Node]): Option[(String, Node)] =
    {
        // TODO: use for-loop instead of recursion
        nodes match
        {
-           // TODO: use language-specific name
-           case (templateNode @ TemplateNode(title, _, _, _)) :: _
-               if ((title.decoded == "Official") || ((context.redirects.map.contains(title.decoded)) && (context.redirects.map(title.decoded) == "Official"))) =>
-           {
-               templateNode.property("1") match
-               {
-                   case Some(propertyNode) => propertyNode.retrieveText.map(url => (url, propertyNode))
-                   case _ => None
-               }
-           }
+           case (templateNode @ TemplateNode(title, _, _, _)) :: tail =>
+           {
+               val templateRedirect = context.redirects.resolve(title).decoded
+               if (templateOfficialWebsite.contains(templateRedirect)) {
+                   templateNode.property(templateOfficialWebsite(templateRedirect)) match
+                   {
+                       case Some(propertyNode) => extractUrlFromProperty(propertyNode).map(url => (url, propertyNode))
+                       case None => findLinkTemplateInSection(tail) // do not stop the recursion - there might be other templates
+                   }
+               }
+               else findLinkTemplateInSection(tail)
+           }
            case head :: tail => findLinkTemplateInSection(tail)
            case Nil => None
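The cleanProperty heuristic above keeps a candidate only if it contains a word character on each side of a dot, i.e. something that plausibly contains a hostname; placeholder values editors use for missing homepages fail the check. A standalone restatement of the same test (the inputs are examples):

    def looksLikeUrl(text: String): Boolean =
        text.stripLineEnd.trim.matches(""".*\w\.\w.*""")

    looksLikeUrl("example.com")        // true  -> later prefixed with "http://"
    looksLikeUrl("http://example.com") // true  -> used as-is
    looksLikeUrl("N/A")                // false -> dropped
    looksLikeUrl("none")               // false -> dropped (no dot at all)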
@@ -5,6 +5,7 @@ import org.dbpedia.extraction.util.Language
 import org.dbpedia.extraction.destinations.{Quad, DBpediaDatasets}
 import org.dbpedia.extraction.wikiparser.{JsonNode, Namespace, PageNode}
 import collection.mutable.ArrayBuffer
+import scala.language.reflectiveCalls

 /**
  * Extracts data from Wikidata sources.
@@ -5,6 +5,7 @@ import org.dbpedia.extraction.util.Language
 import org.dbpedia.extraction.destinations.{Quad, DBpediaDatasets}
 import org.dbpedia.extraction.wikiparser.{JsonNode, PageNode}
 import collection.mutable.ArrayBuffer
+import scala.language.reflectiveCalls

 /**
  * Extracts Wikidata claims
@@ -5,6 +5,7 @@ import org.dbpedia.extraction.util.Language
 import org.dbpedia.extraction.destinations.{Quad, DBpediaDatasets}
 import org.dbpedia.extraction.wikiparser.{JsonNode, Namespace, PageNode}
 import collection.mutable.ArrayBuffer
+import scala.language.reflectiveCalls

 /**
  * Extracts data from Wikidata sources.
@@ -6,7 +6,7 @@ import org.dbpedia.extraction.util.Language
 import org.dbpedia.extraction.destinations.{Quad, DBpediaDatasets}
 import org.dbpedia.extraction.wikiparser.{JsonNode, PageNode}
 import collection.mutable.ArrayBuffer
-import org.dbpedia.extraction.wikiparser.Namespace
+import scala.language.reflectiveCalls

 /**
  * Extracts labels triples from Wikidata sources
@@ -8,6 +8,7 @@ import collection.mutable.ArrayBuffer
 import org.dbpedia.extraction.ontology.io.OntologyReader
 import org.dbpedia.extraction.dataparser.{DataParser, DateTimeParser}
 import org.dbpedia.extraction.ontology.datatypes.Datatype
+import scala.language.reflectiveCalls

 /**
  * Extracts Wikidata claims
@@ -5,6 +5,7 @@ import org.dbpedia.extraction.util.Language
 import org.dbpedia.extraction.destinations.{Quad, DBpediaDatasets}
 import org.dbpedia.extraction.wikiparser.{JsonNode, PageNode}
 import collection.mutable.ArrayBuffer
+import scala.language.reflectiveCalls

 /**
  * Extracts mappings between Wikidata URIs and the WikiData URIs inside DBpedia, in the form of:
@@ -5,6 +5,7 @@ import org.dbpedia.extraction.util.Language
 import org.dbpedia.extraction.destinations.{Quad, DBpediaDatasets}
 import org.dbpedia.extraction.wikiparser.{JsonNode, PageNode}
 import collection.mutable.ArrayBuffer
+import scala.language.reflectiveCalls

 /**
  * Extracts sameAs data from DBpedia-WikiData, in the form of
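Background on the repeated scala.language.reflectiveCalls imports: since Scala 2.10, invoking a member through a structural type is compiled to a reflective call and raises a feature warning unless this import is in scope. A minimal example of the kind of code that triggers it (the structural type here is illustrative, not taken from these extractors):

    import scala.language.reflectiveCalls

    // Any value with a close(): Unit member satisfies this structural type;
    // calling close() goes through runtime reflection, hence the import.
    def using[A <: { def close(): Unit }, B](resource: A)(body: A => B): B =
        try body(resource) finally resource.close()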
4 changes: 2 additions & 2 deletions core/src/main/scala/org/dbpedia/extraction/util/WikiApi.scala
@@ -53,7 +53,7 @@ class WikiApi(url: URL, language: Language)
        // -> "generator" instead of "list" and "gapnamespace" instead of "apnamespace" ("gap" is for "generator all pages")

        //Retrieve list of pages
-       val response = query("?action=query&format=xml&list=allpages&apfrom=" + fromPage + "&aplimit=" + pageListLimit + "&apnamespace=" + namespace.code)
+       val response = query("?action=query&format=xml&list=allpages&apfrom=" + URLEncoder.encode(fromPage, "UTF-8") + "&aplimit=" + pageListLimit + "&apnamespace=" + namespace.code)

        //Extract page ids
        val pageIds = for(p <- response \ "query" \ "allpages" \ "p") yield (p \ "@pageid").head.text.toLong
@@ -119,7 +119,7 @@
    {
        for(titleGroup <- titles.grouped(pageDownloadLimit))
        {
-           val response = query("?action=query&format=xml&prop=revisions&titles=" + titleGroup.map(_.encodedWithNamespace).mkString("|") + "&rvprop=ids|content|timestamp|user|userid")
+           val response = query("?action=query&format=xml&prop=revisions&titles=" + titleGroup.map(t => URLEncoder.encode(t.encodedWithNamespace, "UTF-8")).mkString("|") + "&rvprop=ids|content|timestamp|user|userid")
            processPages(response, proc)
        }
    }
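Both WikiApi changes guard against reserved characters in page titles (issue #151): an unencoded ampersand, for instance, would end the apfrom value early and leave the rest of the title as a bogus parameter. A quick illustration (the title is an example):

    import java.net.URLEncoder

    val title = "AT&T"
    "?action=query&apfrom=" + title
    // "?action=query&apfrom=AT&T"   -> API sees apfrom=AT plus a stray parameter
    "?action=query&apfrom=" + URLEncoder.encode(title, "UTF-8")
    // "?action=query&apfrom=AT%26T" -> title survives intact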