Skip to content

Commit 73a0992

Browse files
authored
Merge branch 'dbpedia:master' into sisterproject_extractor
2 parents 1f13fe4 + 4946db2 commit 73a0992

File tree

10 files changed

+55
-144
lines changed

10 files changed

+55
-144
lines changed

core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ public void head(Node node, int depth) {
4343
return;
4444
}
4545

46-
4746
if(paragraph == null) {
4847
paragraph = new Paragraph(0, "", "p");
4948
}
@@ -63,7 +62,6 @@ public void head(Node node, int depth) {
6362

6463
//this text node is the content of an <a> element: make a new nif:Word
6564
if(inLink) {
66-
6765
if(!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) //not!
6866
{
6967
tempLink.setLinkText(tempText);
@@ -75,11 +73,9 @@ public void head(Node node, int depth) {
7573
errors.add("found Template in resource: " + this.context.resource + ": " + tempText);
7674
return;
7775
}
78-
7976
}
8077
else
8178
paragraph.addText(tempText);
82-
8379
}
8480

8581
else if(node.nodeName().equals("a")) {
@@ -93,24 +89,20 @@ else if(node.nodeName().equals("a")) {
9389
* see Schopenhauer: https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400
9490
*/
9591
String linkPrefix = "/wiki/";
96-
// SPECIAL CASE FOR RESTAPI PARSING
97-
98-
if(node.hasAttr("rel")){
9992

93+
// SPECIAL CASE FOR RESTAPI PARSING https://en.wikipedia.org/api/rest_v1/
94+
if(node.hasAttr("rel")) {
10095
String relType = node.attr("rel");
10196
if(relType.equals("mw:WikiLink")){
102-
103-
tempLink = new Link();
104-
String uri = cleanLink(node.attr("href"), false);
105-
setUri(uri);
106-
107-
97+
tempLink = new Link();
98+
String uri = cleanLink(node.attr("href"), false);
99+
setUri(uri);
108100
} else if (relType.equals("mw:ExtLink")) {
109-
tempLink = new Link();
110-
String uri = cleanLink(node.attr("href"), true);
111-
setUri(uri);
101+
tempLink = new Link();
102+
String uri = cleanLink(node.attr("href"), true);
103+
setUri(uri);
112104
}
113-
}else{
105+
} else {
114106
// standard wikilinks
115107
if (link.contains(linkPrefix) && !link.contains(":")) {
116108
tempLink = new Link();
@@ -147,8 +139,6 @@ else if(node.nodeName().equals("a")) {
147139
skipLevel = depth;
148140
}
149141
}
150-
151-
152142
} else if(node.nodeName().equals("p")) {
153143
if(paragraph != null) {
154144
addParagraph("p");
@@ -201,17 +191,14 @@ private String cleanLink(String uri, boolean external) {
201191
//TODO central string management
202192
if(!this.context.language.equals("en")) {
203193
uri="http://"+this.context.language+".dbpedia.org/resource/"+uri;
204-
205-
}
206-
else {
194+
} else {
207195
uri="http://dbpedia.org/resource/"+uri;
208196
}
209197
uri = uri.replace("&action=edit&redlink=1", "");
210198

211199
} else {
212200
//there are links that contain illegal hostnames
213201
try {
214-
215202
if(uri.startsWith("//"))
216203
uri = "http:"+uri;
217204
uri = URLEncoder.encode(uri,"UTF-8");
@@ -226,11 +213,8 @@ private String cleanLink(String uri, boolean external) {
226213
}
227214

228215
public void tail(Node node, int depth) {
229-
230-
231216
if(skipLevel>0) {
232217
if(skipLevel==depth) {
233-
234218
skipLevel = -1;
235219
return;
236220
} else {

core/src/main/scala/org/dbpedia/extraction/config/Config.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ class Config(val configPath: String) extends
9797
* Number of parallel processes allowed. Depends on the number of cores, type of disk, and IO speed
9898
*
9999
*/
100-
lazy val parallelProcesses: Int = this.getProperty("parallel-processes", "1").trim.toInt
100+
lazy val parallelProcesses: Int = this.getProperty("parallel-processes", "4").trim.toInt
101101

102102
lazy val sparkMaster: String = Option(getString(this, "spark-master")).getOrElse("local[*]")
103103

core/src/main/scala/org/dbpedia/extraction/mappings/PlainAbstractExtractor.scala

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@ extends WikiPageExtractor
7878
//val abstractWikiText = getAbstractWikiText(pageNode)
7979
// if(abstractWikiText == "") return Seq.empty
8080

81-
8281
val mwConnector = new MediawikiConnectorConfigured(context.configFile.mediawikiConnection, context.configFile.abstractParameters.abstractTags.split(","))
8382
val text = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match {
8483
case Some(t) => PlainAbstractExtractor.postProcessExtractedHtml(pageNode.title, replacePatterns(t))

core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -59,38 +59,30 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
5959

6060
val sections = getRelevantParagraphs(html)
6161

62-
var context = ""
62+
var context = ""
6363
var offset = 0
64-
6564
val quads = for(section <- sections) yield {
6665
extractTextFromHtml(section, new NifExtractorContext(language, subjectIri, templateString)) match {
6766
case Success(extractionResults) => {
6867
sectionMap.put(section, extractionResults)
6968
sectionMap.put(extractionResults, extractionResults)
7069

71-
7270
if (context.length != 0) {
7371
context = context + "\n\n"
7472
offset += 2
7573
}
76-
7774
var quads = if(nifParameters.abstractsOnly)
7875
Seq()
7976
else
8077
makeStructureElements(extractionResults, nifContextIri, graphIri, offset)
8178

82-
8379
offset += extractionResults.getExtractedLength
8480
context += extractionResults.getExtractedText
8581

8682
//collect additional triples
8783
quads ++= extendSectionTriples(extractionResults, graphIri, subjectIri)
88-
89-
9084
//forward exceptions
9185
extractionResults.errors.foreach(exceptionHandle(_, RecordSeverity.Warning, null))
92-
93-
9486
quads
9587
}
9688
case Failure(e) => {
@@ -151,7 +143,6 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
151143
triples += nifStructure(p.getSectionIri(), RdfNamespace.NIF.append("nextSection"), sectionUri, sourceUrl, null)
152144
case None =>
153145
}
154-
155146
section.getTop match{
156147
case Some(p) =>
157148
triples += nifStructure(sectionUri, RdfNamespace.NIF.append("superString"), p.getSectionIri(), sourceUrl, null)
@@ -177,8 +168,6 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
177168
if (section.next.isEmpty)
178169
triples += nifStructure(section.getTop.get.getSectionIri(), RdfNamespace.NIF.append("lastSection"), sectionUri, sourceUrl, null)
179170
}
180-
181-
182171
}
183172

184173
//further specifying paragraphs of every section
@@ -355,9 +344,7 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
355344
}
356345

357346
protected def getJsoupDoc(html: String): Document = {
358-
359-
var html_clean=cleanHtml(html)
360-
val doc = Jsoup.parse( html_clean)
347+
val doc = Jsoup.parse(cleanHtml(html))
361348

362349
//delete queries
363350
for(query <- cssSelectorConfigMap.removeElements)

core/src/main/scala/org/dbpedia/extraction/nif/WikipediaNifExtractorRest.scala

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,13 @@ import scala.language.reflectiveCalls
1414
* Created by Chile on 1/19/2017.
1515
*/
1616
class WikipediaNifExtractorRest(
17-
context : {
18-
def ontology : Ontology
19-
def language : Language
20-
def configFile : Config
21-
},
22-
wikiPage: WikiPage
23-
)
24-
extends WikipediaNifExtractor ( context ,wikiPage) {
17+
context : {
18+
def ontology : Ontology
19+
def language : Language
20+
def configFile : Config
21+
},
22+
wikiPage: WikiPage
23+
) extends WikipediaNifExtractor(context ,wikiPage) {
2524

2625

2726
/**
@@ -52,7 +51,7 @@ class WikipediaNifExtractorRest(
5251

5352
title match {
5453

55-
case Some(t) if super.isWikiNextTitle(t) && !processEnd=>
54+
case Some(t) if super.isWikiNextTitle(t) && !processEnd =>
5655

5756
//calculate the section number by looking at the <h2> to <h4> tags
5857
val depth = Integer.parseInt(t.asInstanceOf[org.jsoup.nodes.Element].tagName().substring(1)) - 1
@@ -67,9 +66,7 @@ class WikipediaNifExtractorRest(
6766
if (currentSection.size == depth - 1)
6867
currentSection.append(zw + 1)
6968
}
70-
7169
subnodes = subnodes.drop(1)
72-
7370
val section = new PageSection(
7471
//previous section (if on same depth level
7572
prev = currentSection.last match {
@@ -89,7 +86,6 @@ class WikipediaNifExtractorRest(
8986
//take all following tags until you hit another title or end of content
9087
content = Seq(t) ++ subnodes.takeWhile(node => !node.nodeName().matches("h\\d") && !node.nodeName().matches("section"))
9188
)
92-
9389
tocMap.append(section)
9490
case None => processEnd=true
9591
case _ => processEnd=true
@@ -98,13 +94,10 @@ class WikipediaNifExtractorRest(
9894
getSection(subnodes)
9995
subnodes = subnodes.drop(1)
10096
}
101-
10297
subnodes = subnodes.dropWhile(node => !node.nodeName().matches("h\\d") && !node.nodeName().matches("section"))
10398
}
104-
10599
}
106100

107-
108101
val abstractSect=doc.select("body").select("section").first.childNodes.asScala //get first section
109102
val ab = abstractSect.filter(node => node.nodeName() == "p") //move cursor to abstract
110103

@@ -131,7 +124,4 @@ class WikipediaNifExtractorRest(
131124
}
132125
tocMap
133126
}
134-
135-
136-
137127
}

core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnectorRest.scala

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ class MediaWikiConnectorRest(connectionConfig: MediaWikiConnection, xmlPath: Seq
2323
protected val apiProfile: String = connectionConfig.profile
2424
protected val userAgent: String = connectionConfig.useragent
2525

26-
2726
/**
2827
* Retrieves a Wikipedia page.
2928
*
@@ -52,11 +51,8 @@ class MediaWikiConnectorRest(connectionConfig: MediaWikiConnection, xmlPath: Seq
5251
val parameters = "redirect=true"
5352
val apiUrl: URL = new URL(url.concat(titleParam).concat("?"+parameters))
5453

55-
56-
5754
//println(s"mediawikiurl: $apiUrl")
5855

59-
6056
for (counter <- 1 to maxRetries) {
6157

6258
val conn = apiUrl.openConnection
@@ -77,8 +73,6 @@ class MediaWikiConnectorRest(connectionConfig: MediaWikiConnection, xmlPath: Seq
7773
val answerClean = answerHeader.asScala.filterKeys(_ != null)
7874

7975
if(conn.getHeaderField(null).contains("HTTP/1.1 200 OK") ){
80-
81-
8276
val end = java.time.LocalTime.now()
8377
conn match {
8478
case connection: HttpURLConnection =>
@@ -106,15 +100,12 @@ class MediaWikiConnectorRest(connectionConfig: MediaWikiConnection, xmlPath: Seq
106100
//println("WITH EXPONENTIAL BACK OFF" + counter)
107101
//println("Sleeping time double >>>>>>>>>>>" + pow(waiting_time, counter))
108102
//println("Sleeping time int >>>>>>>>>>>" + sleepMs)
109-
110103
}
111104
if (counter < maxRetries)
112105
Thread.sleep(sleepMs)
113106
else
114107
throw new Exception("Timeout error retrieving abstract of " + pageTitle + " in " + counter + " tries.")
115108
} else {
116-
117-
118109
//println(s"mediawikiurl: $apiUrl?$parameters")
119110
return parsedAnswer match {
120111
case Success(str) => Option(str)

0 commit comments

Comments
 (0)