Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
5342469
Manage REST API answer link shape
Sep 14, 2022
7174c71
Not cleaning HTML before getJsoupDoc and clean it inside for managing…
Sep 14, 2022
d00cf7a
Create a WikipediaNifExtractor extension for REST API answer
Sep 14, 2022
89c6d5c
change connector
Sep 14, 2022
6461bce
add possibility to choose connector
Sep 14, 2022
9cdeb79
deprecate class
Sep 14, 2022
29329c0
Create a MediaWikiConnector Abstract class for gathering common params
Sep 14, 2022
06202d5
Create a new connector for the REST API
Sep 14, 2022
053f0ab
Create a new connector for the REST API
Sep 14, 2022
2050875
script for creating custom dump sample
Sep 14, 2022
7ad20ae
script for generating Minidump from uri list generated by create_cust…
Sep 14, 2022
f55d803
script for creating uri list randomly from id list
Sep 14, 2022
f7686ec
adapt property files to new possible APIS
Sep 14, 2022
782214b
add new param for MWC api
Sep 14, 2022
984b5d4
new Test for abstract benchmark
Sep 14, 2022
021ca01
Add new properties for API connectors
Sep 14, 2022
7001bcd
adapt for extension
Sep 14, 2022
78d91d6
Update core/src/main/scala/org/dbpedia/extraction/util/MediawikiConne…
datalogism Sep 16, 2022
d7929da
Update dump/src/test/scala/org/dbpedia/extraction/dump/ExtractionTest…
datalogism Sep 16, 2022
167b342
Update dump/src/test/scala/org/dbpedia/extraction/dump/ExtractionTest…
datalogism Sep 16, 2022
6112b94
Update dump/src/test/scala/org/dbpedia/extraction/dump/ExtractionTest…
datalogism Sep 16, 2022
e97dd88
Update core/src/main/scala/org/dbpedia/extraction/config/Config.scala
datalogism Sep 16, 2022
969f3b4
Update core/src/main/scala/org/dbpedia/extraction/config/Config.scala
datalogism Sep 16, 2022
e89f813
clear comments of API config and fix plain abstract API urls
Sep 19, 2022
2246f21
snake case to camel case
Sep 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 87 additions & 46 deletions core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,14 @@ public LinkExtractor(NifExtractorContext context) {

public void head(Node node, int depth) {

if(skipLevel>=0)
if(skipLevel>=0){
return;
}


if(paragraph == null)
paragraph = new Paragraph(0, "", "p");
if(paragraph == null) {
paragraph = new Paragraph(0, "", "p");
}
//ignore all content inside invisible tags
if(invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) {
invisible = true;
Expand All @@ -52,13 +55,15 @@ public void head(Node node, int depth) {

if(node.nodeName().equals("#text")) {
String tempText = node.toString();

//replace no-break spaces because unescape doesn't deal with them
tempText = StringEscapeUtils.unescapeHtml4(tempText);
tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars());
tempText = tempText.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "");

//this text node is the content of an <a> element: make a new nif:Word
if(inLink) {

if(!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) //not!
{
tempLink.setLinkText(tempText);
Expand All @@ -70,11 +75,15 @@ public void head(Node node, int depth) {
errors.add("found Template in resource: " + this.context.resource + ": " + tempText);
return;
}

}
else
paragraph.addText(tempText);

} else if(node.nodeName().equals("a")) {
}

else if(node.nodeName().equals("a")) {

String link = node.attr("href");
//TODO central string management
/**
Expand All @@ -84,41 +93,62 @@ public void head(Node node, int depth) {
* see Schopenhauer: https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400
*/
String linkPrefix = "/wiki/";
// standard wikilinks
if (link.contains(linkPrefix) && !link.contains(":")) {
tempLink = new Link();
String uri = cleanLink(node.attr("href"), false);
setUri(uri);

//simple example of Help:IPA
// <a href="/wiki/Help:IPA/Standard_German" title="Help:IPA/Standard German">[ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ]</a>
} else if (link.contains(linkPrefix) && link.contains(":")) {
/**
* TODO buggy
* Cleans up child nodes: difficult example
* <a href="/wiki/Help:IPA/English" title="Help:IPA/English">/<span style="border-bottom:1px dotted"><span title="/ˈ/: primary stress follows">ˈ</span><span title="/ʃ/: 'sh' in 'shy'">ʃ</span><span title="/oʊ/: 'o' in 'code'">oʊ</span><span title="'p' in 'pie'">p</span><span title="/ən/: 'on' in 'button'">ən</span><span title="'h' in 'hi'">h</span><span title="/aʊ/: 'ou' in 'mouth'">aʊ</span><span title="/./: syllable break">.</span><span title="/ər/: 'er' in 'letter'">ər</span></span>/</a>
*/
if (!node.childNodes().isEmpty()) {
if (node.childNode(0).nodeName().equals("#text") &&
node.childNode(0).toString().contains(":") &&
!node.childNode(0).toString().contains("http")) {
tempLink = new Link();
String uri = cleanLink(node.attr("href"), false);
setUri(uri);
}
} else {
skipLevel = depth;
}
//TODO add example
} else if (node.attr("class").equals("external text")) {
//don't skip external links
tempLink = new Link();
String uri = cleanLink(node.attr("href"), true);
setUri(uri);

} else {
skipLevel = depth;
}
// SPECIAL CASE FOR RESTAPI PARSING

if(node.hasAttr("rel")){

String relType = node.attr("rel");
if(relType.equals("mw:WikiLink")){

tempLink = new Link();
String uri = cleanLink(node.attr("href"), false);
setUri(uri);


} else if (relType.equals("mw:ExtLink")) {
tempLink = new Link();
String uri = cleanLink(node.attr("href"), true);
setUri(uri);
}
}else{
// standard wikilinks
if (link.contains(linkPrefix) && !link.contains(":")) {
tempLink = new Link();
String uri = cleanLink(node.attr("href"), false);
setUri(uri);

//simple example of Help:IPA
// <a href="/wiki/Help:IPA/Standard_German" title="Help:IPA/Standard German">[ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ]</a>
} else if (link.contains(linkPrefix) && link.contains(":")) {
/**
* TODO buggy
* Cleans up child nodes: difficult example
* <a href="/wiki/Help:IPA/English" title="Help:IPA/English">/<span style="border-bottom:1px dotted"><span title="/ˈ/: primary stress follows">ˈ</span><span title="/ʃ/: 'sh' in 'shy'">ʃ</span><span title="/oʊ/: 'o' in 'code'">oʊ</span><span title="'p' in 'pie'">p</span><span title="/ən/: 'on' in 'button'">ən</span><span title="'h' in 'hi'">h</span><span title="/aʊ/: 'ou' in 'mouth'">aʊ</span><span title="/./: syllable break">.</span><span title="/ər/: 'er' in 'letter'">ər</span></span>/</a>
*/
if (!node.childNodes().isEmpty()) {
if (node.childNode(0).nodeName().equals("#text") &&
node.childNode(0).toString().contains(":") &&
!node.childNode(0).toString().contains("http")) {
tempLink = new Link();
String uri = cleanLink(node.attr("href"), false);
setUri(uri);
}
} else {
skipLevel = depth;
}
//TODO add example
} else if (node.attr("class").equals("external text")) {
//don't skip external links
tempLink = new Link();
String uri = cleanLink(node.attr("href"), true);
setUri(uri);

} else {
skipLevel = depth;
}
}


} else if(node.nodeName().equals("p")) {
if(paragraph != null) {
addParagraph("p");
Expand All @@ -136,6 +166,7 @@ public void head(Node node, int depth) {
skipLevel = depth;
} else if(node.nodeName().equals("span")) {
//denote notes

if(node.attr("class").contains("notebegin"))
addParagraph("note");

Expand All @@ -159,13 +190,21 @@ private void setUri(String uri) {

private String cleanLink(String uri, boolean external) {
if(!external) {

String linkPrefix = "/wiki/";
String linkPrefix2= "./";
if(uri.contains(linkPrefix)){
uri=uri.substring(uri.indexOf("?title=")+7);
} else if (uri.contains(linkPrefix2)) {
uri=uri.substring(uri.indexOf("?title=")+3);
}
//TODO central string management
if(!this.context.language.equals("en")) {

uri="http://"+this.context.language+".dbpedia.org/resource/"+uri.substring(uri.indexOf("?title=")+7);
uri="http://"+this.context.language+".dbpedia.org/resource/"+uri;

} else {
uri="http://dbpedia.org/resource/"+uri.substring(uri.indexOf("?title=")+7);
}
else {
uri="http://dbpedia.org/resource/"+uri;
}
uri = uri.replace("&action=edit&redlink=1", "");

Expand All @@ -183,22 +222,23 @@ private String cleanLink(String uri, boolean external) {
e.printStackTrace();
}
}

return UriUtils.uriToDbpediaIri(uri).toString();
}

public void tail(Node node, int depth) {



if(skipLevel>0) {
if(skipLevel==depth) {

skipLevel = -1;
return;
} else {
return;
}
}

if(node.nodeName().equals("a")&&inLink) {
if(node.nodeName().equals("a") && inLink) {
inLink = false;
paragraph.addLink(tempLink);
tempLink = new Link();
Expand All @@ -210,6 +250,7 @@ else if(node.nodeName().equals("p") && paragraph != null) {
addParagraph("p");
}
else if(node.nodeName().equals("sup") && inSup) {

inSup = false;
}
else if(node.nodeName().matches("h\\d")) {
Expand Down
44 changes: 33 additions & 11 deletions core/src/main/scala/org/dbpedia/extraction/config/Config.scala
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@ class Config(val configPath: String) extends
}

/**
* Number of parallel processes allowed. Depends on the number of cores, type of disk and IO speed
* Number of parallel processes allowed. Depends on the number of cores, type of disk, and IO speed
*
*/
lazy val parallelProcesses: Int = this.getProperty("parallel-processes", "4").trim.toInt
lazy val parallelProcesses: Int = this.getProperty("parallel-processes", "1").trim.toInt

lazy val sparkMaster: String = Option(getString(this, "spark-master")).getOrElse("local[*]")

Expand Down Expand Up @@ -259,18 +259,32 @@ class Config(val configPath: String) extends
}

lazy val mediawikiConnection: MediaWikiConnection = Try {

MediaWikiConnection(
apiUrl = this.getProperty("mwc-apiUrl", "").trim,
apiType = this.getProperty("mwc-type", "").trim,
apiUrl = this.getProperty("mwc-type").trim match {
case "rest" => this.getProperty("mwc-apiRestUrl", "").trim
case "mwc" => this.getProperty("mwc-apiMWCUrl", "").trim
case "local" => this.getProperty("mwc-apiLocalUrl", "").trim
},
maxRetries = this.getProperty("mwc-maxRetries", "4").trim.toInt,
connectMs = this.getProperty("mwc-connectMs", "2000").trim.toInt,
readMs = this.getProperty("mwc-readMs", "5000").trim.toInt,
sleepFactor = this.getProperty("mwc-sleepFactor", "1000").trim.toInt
sleepFactor = this.getProperty("mwc-sleepFactor", "1000").trim.toInt,
maxlag = this.getProperty("mwc-maxlag", "5").trim.toInt,
useragent = this.getProperty("mwc-useragent", "anonymous").trim,
gzip = this.getProperty("mwc-gzip","false").trim.toBoolean,
retryafter = this.getProperty("mwc-retryafter", "false").trim.toBoolean,
accept = this.getProperty("mwc-accept", "text/html").trim,
charset = this.getProperty("mwc-charset", "utf-8").trim,
profile = this.getProperty("mwc-profile", "https://www.mediawiki.org/wiki/Specs/HTML/2.1.0").trim
)
} match{
case Success(s) => s
case Failure(f) => throw new IllegalArgumentException("Not all necessary parameters for the 'MediaWikiConnection' class were provided or could not be parsed to the expected type.", f)
case Failure(f) => throw new IllegalArgumentException("Some parameters necessary for the 'MediaWikiConnection' class were not provided or could not be parsed to the expected type.", f)
}


lazy val abstractParameters: AbstractParameters = Try {
AbstractParameters(
abstractQuery = this.getProperty("abstract-query", "").trim,
Expand Down Expand Up @@ -364,12 +378,20 @@ object Config{
* @param sleepFactor
*/
case class MediaWikiConnection(
apiUrl: String,
maxRetries: Int,
connectMs: Int,
readMs: Int,
sleepFactor: Int
)
apiType: String,
apiUrl: String,
maxRetries: Int,
connectMs: Int,
readMs: Int,
sleepFactor: Int,
maxlag: Int,
useragent: String,
gzip: Boolean,
retryafter: Boolean,
accept : String,
charset: String,
profile: String
)

case class AbstractParameters(
abstractQuery: String,
Expand Down
30 changes: 20 additions & 10 deletions core/src/main/scala/org/dbpedia/extraction/mappings/NifExtractor.scala
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ package org.dbpedia.extraction.mappings
import org.dbpedia.extraction.annotations.ExtractorAnnotation
import org.dbpedia.extraction.config.Config
import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.nif.WikipediaNifExtractor
import org.dbpedia.extraction.nif.{WikipediaNifExtractorRest, WikipediaNifExtractor}
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.transform.Quad
import org.dbpedia.extraction.util.{Language, MediaWikiConnector}
import org.dbpedia.extraction.util.{Language, MediawikiConnectorConfigured, MediaWikiConnectorRest}
import org.dbpedia.extraction.wikiparser._

import scala.language.reflectiveCalls
Expand Down Expand Up @@ -41,12 +41,11 @@ class NifExtractor(
protected val writeLinkAnchors: Boolean = context.configFile.nifParameters.writeLinkAnchor
protected val writeStrings: Boolean = context.configFile.nifParameters.writeAnchor
protected val shortAbstractLength: Int = context.configFile.abstractParameters.shortAbstractMinLength

protected val abstractsOnly : Boolean = context.configFile.nifParameters.abstractsOnly
protected val dbpediaVersion: String = context.configFile.dbPediaVersion

override val datasets = Set(DBpediaDatasets.NifContext,DBpediaDatasets.NifPageStructure,DBpediaDatasets.NifTextLinks,DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts, DBpediaDatasets.RawTables, DBpediaDatasets.Equations)

private val mwConnector = new MediaWikiConnector(context.configFile.mediawikiConnection, context.configFile.nifParameters.nifTags.split(","))

override def extract(pageNode : WikiPage, subjectUri : String): Seq[Quad] =
{
Expand All @@ -56,13 +55,24 @@ class NifExtractor(
//Don't extract abstracts from redirect and disambiguation pages
if(pageNode.isRedirect || pageNode.isDisambiguation) return Seq.empty

//Retrieve page text
val html = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match{
case Some(t) => NifExtractor.postProcessExtractedHtml(pageNode.title, t)
case None => return Seq.empty
}
var html = ""
val mwcType = context.configFile.mediawikiConnection.apiType

new WikipediaNifExtractor(context, pageNode).extractNif(html)(err => pageNode.addExtractionRecord(err))
if (mwcType == "rest") {
val mwConnector = new MediaWikiConnectorRest(context.configFile.mediawikiConnection, context.configFile.nifParameters.nifTags.split(","))
html = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match {
case Some(t) => NifExtractor.postProcessExtractedHtml(pageNode.title, t)
case None => return Seq.empty
}
new WikipediaNifExtractorRest(context, pageNode).extractNif(html)(err => pageNode.addExtractionRecord(err))
} else {
val mwConnector = new MediawikiConnectorConfigured(context.configFile.mediawikiConnection, context.configFile.nifParameters.nifTags.split(","))
html = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match {
case Some(t) => NifExtractor.postProcessExtractedHtml(pageNode.title, t)
case None => return Seq.empty
}
new WikipediaNifExtractor(context, pageNode).extractNif(html)(err => pageNode.addExtractionRecord(err))
}
}

}
Expand Down
6 changes: 3 additions & 3 deletions core/src/main/scala/org/dbpedia/extraction/mappings/PlainAbstractExtractor.scala
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.transform.{Quad, QuadBuilder}
import org.dbpedia.extraction.util.abstracts.AbstractUtils
import org.dbpedia.extraction.util.{Language, MediaWikiConnector, WikiUtil}
import org.dbpedia.extraction.util.{Language, MediawikiConnectorConfigured}
import org.dbpedia.extraction.wikiparser._

import scala.language.reflectiveCalls
Expand Down Expand Up @@ -63,7 +63,6 @@ extends WikiPageExtractor

override val datasets = Set(DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts)

private val mwConnector = new MediaWikiConnector(context.configFile.mediawikiConnection, context.configFile.abstractParameters.abstractTags.split(","))

override def extract(pageNode : WikiPage, subjectUri: String): Seq[Quad] =
{
Expand All @@ -79,7 +78,8 @@ extends WikiPageExtractor
//val abstractWikiText = getAbstractWikiText(pageNode)
// if(abstractWikiText == "") return Seq.empty

//Retrieve page text

val mwConnector = new MediawikiConnectorConfigured(context.configFile.mediawikiConnection, context.configFile.abstractParameters.abstractTags.split(","))
val text = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match {
case Some(t) => PlainAbstractExtractor.postProcessExtractedHtml(pageNode.title, replacePatterns(t))
case None => return Seq.empty
Expand Down
Loading