dbpedia · jlareck · Sep 22, 2022 · Sep 14, 2022 · Sep 14, 2022 · Sep 14, 2022
diff --git a/core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java b/core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java
@@ -39,11 +39,14 @@ public LinkExtractor(NifExtractorContext context) {
 
 	public void head(Node node, int depth) {
 
-		if(skipLevel>=0)
+		if(skipLevel>=0){
 			return;
+		}
+
 
-        if(paragraph == null)
-            paragraph = new Paragraph(0, "", "p");
+        if(paragraph == null) {
+			paragraph = new Paragraph(0, "", "p");
+		}
 		//ignore all content inside invisible tags
 		if(invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) {
 			invisible = true;
@@ -52,13 +55,15 @@ public void head(Node node, int depth) {
 
 		if(node.nodeName().equals("#text")) {
 		  String tempText = node.toString();
+
 		  //replace no-break spaces because unescape doesn't deal with them
 		  tempText = StringEscapeUtils.unescapeHtml4(tempText);
           tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars());
 		  tempText = tempText.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "");
 
 		  //this text node is the content of an <a> element: make a new nif:Word
 		  if(inLink) {
+
               if(!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":"))  //not!
               {
                   tempLink.setLinkText(tempText);
@@ -70,11 +75,15 @@ public void head(Node node, int depth) {
 				  errors.add("found Template in resource: " + this.context.resource + ": " + tempText);
 				  return;
 			  }
+
 		  }
 		  else
 		    paragraph.addText(tempText);
 
-		} else if(node.nodeName().equals("a")) {
+		}
+
+		else if(node.nodeName().equals("a")) {
+
             String link = node.attr("href");
             //TODO central string management
 			/**
@@ -84,41 +93,62 @@ public void head(Node node, int depth) {
 			 * see Schopenhauer: https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400
 			 */
             String linkPrefix = "/wiki/";
-            // standard wikilinks
-            if (link.contains(linkPrefix) && !link.contains(":")) {
-                tempLink = new Link();
-                String uri = cleanLink(node.attr("href"), false);
-                setUri(uri);
-
-            //simple example of Help:IPA
-			// <a href="/wiki/Help:IPA/Standard_German" title="Help:IPA/Standard German">[ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ]</a>
-            } else if (link.contains(linkPrefix) && link.contains(":")) {
-				/**
-				 * TODO buggy
-				 * Cleans up child nodes: difficult example
-				 * <a href="/wiki/Help:IPA/English" title="Help:IPA/English">/<span style="border-bottom:1px dotted"><span title="/ˈ/: primary stress follows">ˈ</span><span title="/ʃ/: 'sh' in 'shy'">ʃ</span><span title="/oʊ/: 'o' in 'code'">oʊ</span><span title="'p' in 'pie'">p</span><span title="/ən/: 'on' in 'button'">ən</span><span title="'h' in 'hi'">h</span><span title="/aʊ/: 'ou' in 'mouth'">aʊ</span><span title="/./: syllable break">.</span><span title="/ər/: 'er' in 'letter'">ər</span></span>/</a>
-				 */
-                if (!node.childNodes().isEmpty()) {
-                    if (node.childNode(0).nodeName().equals("#text") &&
-                            node.childNode(0).toString().contains(":") &&
-                            !node.childNode(0).toString().contains("http")) {
-                        tempLink = new Link();
-                        String uri = cleanLink(node.attr("href"), false);
-                        setUri(uri);
-                    }
-                } else {
-                    skipLevel = depth;
-                }
-            //TODO add example
-            } else if (node.attr("class").equals("external text")) {
-                //don't skip external links
-                tempLink = new Link();
-                String uri = cleanLink(node.attr("href"), true);
-                setUri(uri);
-
-            } else {
-                skipLevel = depth;
-            }
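+			// Special case for REST API HTML: an <a> carrying a "rel" attribute is classified by that value (mw:WikiLink / mw:ExtLink) instead of by its href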
+
+			if(node.hasAttr("rel")){
+
+				String relType = node.attr("rel");
+				if(relType.equals("mw:WikiLink")){
+
+						tempLink = new Link();
+						String uri = cleanLink(node.attr("href"), false);
+						setUri(uri);
+
+
+				} else if (relType.equals("mw:ExtLink")) {
+						tempLink = new Link();
+						String uri = cleanLink(node.attr("href"), true);
+						setUri(uri);
+				}
+			}else{
+				// standard wikilinks
+				if (link.contains(linkPrefix) && !link.contains(":")) {
+					tempLink = new Link();
+					String uri = cleanLink(node.attr("href"), false);
+					setUri(uri);
+
+					//simple example of Help:IPA
+					// <a href="/wiki/Help:IPA/Standard_German" title="Help:IPA/Standard German">[ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ]</a>
+				} else if (link.contains(linkPrefix) && link.contains(":")) {
+					/**
+					 * TODO buggy
+					 * Cleans up child nodes: difficult example
+					 * <a href="/wiki/Help:IPA/English" title="Help:IPA/English">/<span style="border-bottom:1px dotted"><span title="/ˈ/: primary stress follows">ˈ</span><span title="/ʃ/: 'sh' in 'shy'">ʃ</span><span title="/oʊ/: 'o' in 'code'">oʊ</span><span title="'p' in 'pie'">p</span><span title="/ən/: 'on' in 'button'">ən</span><span title="'h' in 'hi'">h</span><span title="/aʊ/: 'ou' in 'mouth'">aʊ</span><span title="/./: syllable break">.</span><span title="/ər/: 'er' in 'letter'">ər</span></span>/</a>
+					 */
+					if (!node.childNodes().isEmpty()) {
+						if (node.childNode(0).nodeName().equals("#text") &&
+								node.childNode(0).toString().contains(":") &&
+								!node.childNode(0).toString().contains("http")) {
+							tempLink = new Link();
+							String uri = cleanLink(node.attr("href"), false);
+							setUri(uri);
+						}
+					} else {
+						skipLevel = depth;
+					}
+					//TODO add example
+				} else if (node.attr("class").equals("external text")) {
+					//don't skip external links
+					tempLink = new Link();
+					String uri = cleanLink(node.attr("href"), true);
+					setUri(uri);
+
+				} else {
+					skipLevel = depth;
+				}
+			}
+
+
         } else if(node.nodeName().equals("p")) {
             if(paragraph != null) {
                 addParagraph("p");
@@ -136,6 +166,7 @@ public void head(Node node, int depth) {
             skipLevel = depth;
         } else if(node.nodeName().equals("span")) {
 			//denote notes
+
 		    if(node.attr("class").contains("notebegin"))
                 addParagraph("note");
 
@@ -159,13 +190,21 @@ private void setUri(String uri) {
 
 	private String cleanLink(String uri, boolean external) {
 		if(!external) {
+
+			String linkPrefix = "/wiki/";
+			String linkPrefix2= "./";
+			if(uri.contains(linkPrefix)){
+				uri=uri.substring(uri.indexOf("?title=")+7);
+			} else if (uri.contains(linkPrefix2)) {
+				uri=uri.substring(uri.indexOf("?title=")+3);
+			}
 			//TODO central string management
 			if(!this.context.language.equals("en")) {
-
-				uri="http://"+this.context.language+".dbpedia.org/resource/"+uri.substring(uri.indexOf("?title=")+7);
+				uri="http://"+this.context.language+".dbpedia.org/resource/"+uri;
 
-			} else {
-				uri="http://dbpedia.org/resource/"+uri.substring(uri.indexOf("?title=")+7);
+			}
+			else {
+				uri="http://dbpedia.org/resource/"+uri;
 			}
 			uri = uri.replace("&action=edit&redlink=1", "");
 
@@ -183,22 +222,23 @@ private String cleanLink(String uri, boolean external) {
 				e.printStackTrace();
 			}
 		}
-
 		return UriUtils.uriToDbpediaIri(uri).toString();
 	}
 
 	public void tail(Node node, int depth) {
-
+
+
 		if(skipLevel>0) {
 			if(skipLevel==depth) {
+
 				skipLevel = -1;
 				return;
 			} else {
 				return;
 			}
 		}
 
-		if(node.nodeName().equals("a")&&inLink) {
+		if(node.nodeName().equals("a") && inLink) {
 			inLink = false;
 			paragraph.addLink(tempLink);
 			tempLink = new Link();
@@ -210,6 +250,7 @@ else if(node.nodeName().equals("p") && paragraph != null) {
             addParagraph("p");
         }
         else if(node.nodeName().equals("sup") && inSup) {
+
 			inSup = false;
 		}
         else if(node.nodeName().matches("h\\d")) {

diff --git a/core/src/main/scala/org/dbpedia/extraction/config/Config.scala b/core/src/main/scala/org/dbpedia/extraction/config/Config.scala
@@ -94,10 +94,10 @@ class Config(val configPath: String) extends
   }
 
   /**
-    * Number of parallel processes allowed. Depends on the number of cores, type of disk and IO speed
+    * Number of parallel processes allowed. Depends on the number of cores, type of disk, and IO speed
     *
     */
-  lazy val parallelProcesses: Int = this.getProperty("parallel-processes", "4").trim.toInt
+  lazy val parallelProcesses: Int = this.getProperty("parallel-processes", "1").trim.toInt
 
   lazy val sparkMaster: String = Option(getString(this, "spark-master")).getOrElse("local[*]")
 
@@ -259,18 +259,32 @@ class Config(val configPath: String) extends
   }
 
   lazy val mediawikiConnection: MediaWikiConnection = Try {
     MediaWikiConnection(
-      apiUrl = this.getProperty("mwc-apiUrl", "").trim,
+      apiType = this.getProperty("mwc-type", "").trim, // also selects which URL property is read for apiUrl
+      apiUrl = this.getProperty("mwc-type", "").trim match { // same default as apiType, so a missing key cannot yield null here
+        case "rest" =>  this.getProperty("mwc-apiRestUrl", "").trim
+        case "mwc" => this.getProperty("mwc-apiMWCUrl", "").trim
+        case "local" => this.getProperty("mwc-apiLocalUrl", "").trim
+        case other => throw new IllegalArgumentException(s"Unsupported 'mwc-type' value '$other'; expected 'rest', 'mwc' or 'local'.") // surfaces as the cause of the Failure below
+      },
       maxRetries = this.getProperty("mwc-maxRetries", "4").trim.toInt,
       connectMs = this.getProperty("mwc-connectMs", "2000").trim.toInt,
       readMs = this.getProperty("mwc-readMs", "5000").trim.toInt,
-      sleepFactor = this.getProperty("mwc-sleepFactor", "1000").trim.toInt
+      sleepFactor = this.getProperty("mwc-sleepFactor", "1000").trim.toInt,
+      maxlag = this.getProperty("mwc-maxlag", "5").trim.toInt,
+      useragent = this.getProperty("mwc-useragent", "anonymous").trim,
+      gzip = this.getProperty("mwc-gzip","false").trim.toBoolean,
+      retryafter = this.getProperty("mwc-retryafter", "false").trim.toBoolean,
+      accept = this.getProperty("mwc-accept", "text/html").trim,
+      charset = this.getProperty("mwc-charset", "utf-8").trim,
+      profile = this.getProperty("mwc-profile", "https://www.mediawiki.org/wiki/Specs/HTML/2.1.0").trim
     )
   } match{
     case Success(s) => s
-    case Failure(f) => throw new IllegalArgumentException("Not all necessary parameters for the 'MediaWikiConnection' class were provided or could not be parsed to the expected type.", f)
+    case Failure(f) => throw new IllegalArgumentException("Some parameters necessary for the 'MediaWikiConnection' class were not provided or could not be parsed to the expected type.", f)
   }
 
+
   lazy val abstractParameters: AbstractParameters = Try {
     AbstractParameters(
       abstractQuery = this.getProperty("abstract-query", "").trim,
@@ -364,12 +378,20 @@ object Config{
     * @param sleepFactor
     */
   case class MediaWikiConnection(
-    apiUrl: String,
-    maxRetries: Int,
-    connectMs: Int,
-    readMs: Int,
-    sleepFactor: Int
-  )
+                                  apiType: String, // Config fills this from the "mwc-type" property; Config matches it against "rest", "mwc" and "local"
+                                  apiUrl: String, // Config fills this from "mwc-apiRestUrl", "mwc-apiMWCUrl" or "mwc-apiLocalUrl", chosen by "mwc-type"
+                                  maxRetries: Int, // Config fills this from "mwc-maxRetries"
+                                  connectMs: Int, // Config fills this from "mwc-connectMs"; presumably a timeout in milliseconds (confirm in the connector)
+                                  readMs: Int, // Config fills this from "mwc-readMs"; presumably a timeout in milliseconds (confirm in the connector)
+                                  sleepFactor: Int, // Config fills this from "mwc-sleepFactor"
+                                  maxlag: Int, // Config fills this from "mwc-maxlag"
+                                  useragent: String, // Config fills this from "mwc-useragent"
+                                  gzip: Boolean, // Config fills this from "mwc-gzip"
+                                  retryafter: Boolean, // Config fills this from "mwc-retryafter"
+                                  accept  : String, // Config fills this from "mwc-accept"
+                                  charset: String, // Config fills this from "mwc-charset"
+                                  profile: String // Config fills this from "mwc-profile"
+                                )
 
   case class AbstractParameters(
                                  abstractQuery: String,

diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/NifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/NifExtractor.scala
@@ -3,10 +3,10 @@ package org.dbpedia.extraction.mappings
 import org.dbpedia.extraction.annotations.ExtractorAnnotation
 import org.dbpedia.extraction.config.Config
 import org.dbpedia.extraction.config.provenance.DBpediaDatasets
-import org.dbpedia.extraction.nif.WikipediaNifExtractor
+import org.dbpedia.extraction.nif.{WikipediaNifExtractorRest, WikipediaNifExtractor}
 import org.dbpedia.extraction.ontology.Ontology
 import org.dbpedia.extraction.transform.Quad
-import org.dbpedia.extraction.util.{Language, MediaWikiConnector}
+import org.dbpedia.extraction.util.{Language, MediawikiConnectorConfigured, MediaWikiConnectorRest}
 import org.dbpedia.extraction.wikiparser._
 
 import scala.language.reflectiveCalls
@@ -41,12 +41,11 @@ class NifExtractor(
   protected val writeLinkAnchors: Boolean = context.configFile.nifParameters.writeLinkAnchor
   protected val writeStrings: Boolean = context.configFile.nifParameters.writeAnchor
   protected val shortAbstractLength: Int = context.configFile.abstractParameters.shortAbstractMinLength
-
+  protected val abstractsOnly : Boolean =  context.configFile.nifParameters.abstractsOnly
   protected val dbpediaVersion: String = context.configFile.dbPediaVersion
 
   override val datasets = Set(DBpediaDatasets.NifContext,DBpediaDatasets.NifPageStructure,DBpediaDatasets.NifTextLinks,DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts, DBpediaDatasets.RawTables, DBpediaDatasets.Equations)
 
-  private val mwConnector = new MediaWikiConnector(context.configFile.mediawikiConnection, context.configFile.nifParameters.nifTags.split(","))
 
   override def extract(pageNode : WikiPage, subjectUri : String): Seq[Quad] =
   {
@@ -56,13 +55,24 @@ class NifExtractor(
     //Don't extract abstracts from redirect and disambiguation pages
     if(pageNode.isRedirect || pageNode.isDisambiguation) return Seq.empty
 
-    //Retrieve page text
-    val html = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match{
-      case Some(t) => NifExtractor.postProcessExtractedHtml(pageNode.title, t)
-      case None => return Seq.empty
-    }
+    var html = ""
+    val mwcType = context.configFile.mediawikiConnection.apiType
 
-    new WikipediaNifExtractor(context, pageNode).extractNif(html)(err => pageNode.addExtractionRecord(err))
+    if (mwcType == "rest") {
+      val mwConnector = new MediaWikiConnectorRest(context.configFile.mediawikiConnection, context.configFile.nifParameters.nifTags.split(","))
+      html = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match {
+        case Some(t) => NifExtractor.postProcessExtractedHtml(pageNode.title, t)
+        case None => return Seq.empty
+      }
+      new WikipediaNifExtractorRest(context, pageNode).extractNif(html)(err => pageNode.addExtractionRecord(err))
+    } else {
+      val mwConnector = new MediawikiConnectorConfigured(context.configFile.mediawikiConnection, context.configFile.nifParameters.nifTags.split(","))
+      html = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match {
+        case Some(t) => NifExtractor.postProcessExtractedHtml(pageNode.title, t)
+        case None => return Seq.empty
+      }
+      new WikipediaNifExtractor(context, pageNode).extractNif(html)(err => pageNode.addExtractionRecord(err))
+    }
   }
 
 }

diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/PlainAbstractExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/PlainAbstractExtractor.scala
@@ -7,7 +7,7 @@ import org.dbpedia.extraction.config.provenance.DBpediaDatasets
 import org.dbpedia.extraction.ontology.Ontology
 import org.dbpedia.extraction.transform.{Quad, QuadBuilder}
 import org.dbpedia.extraction.util.abstracts.AbstractUtils
-import org.dbpedia.extraction.util.{Language, MediaWikiConnector, WikiUtil}
+import org.dbpedia.extraction.util.{Language, MediawikiConnectorConfigured}
 import org.dbpedia.extraction.wikiparser._
 
 import scala.language.reflectiveCalls
@@ -63,7 +63,6 @@ extends WikiPageExtractor
 
   override val datasets = Set(DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts)
 
-  private val mwConnector = new MediaWikiConnector(context.configFile.mediawikiConnection, context.configFile.abstractParameters.abstractTags.split(","))
 
     override def extract(pageNode : WikiPage, subjectUri: String): Seq[Quad] =
     {
@@ -79,7 +78,8 @@ extends WikiPageExtractor
         //val abstractWikiText = getAbstractWikiText(pageNode)
         // if(abstractWikiText == "") return Seq.empty
 
-        //Retrieve page text
+
+        val mwConnector = new MediawikiConnectorConfigured(context.configFile.mediawikiConnection, context.configFile.abstractParameters.abstractTags.split(","))
         val text = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match {
           case Some(t) => PlainAbstractExtractor.postProcessExtractedHtml(pageNode.title, replacePatterns(t))
           case None => return Seq.empty