Skip to content

Commit 022a693

Browse files
committed
Adding YAGO NounGroup parser and relevant changes
1 parent 756b45c commit 022a693

File tree

3 files changed

+345
-2
lines changed

3 files changed

+345
-2
lines changed

WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/D.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ protected static void i() {
7676
/** Prints some Objects, returns them */
7777
public static Object p(Object... a) {
7878
pl(a);
79-
System.out.println("");
8079
if (a == null || a.length == 0) return (null);
8180
if (a.length == 1) return (a[0]);
8281
return (a);
@@ -87,8 +86,11 @@ public static Object println(Object... a) {
8786
return (p(a));
8887
}
8988

89+
90+
9091
/** Prints some Objects on one line */
91-
public static void pl(Object... a) {
92+
public static void pl(Object... a) {
93+
//System.out.print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
9294
System.out.print(toString(a));
9395
}
9496

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* To change this template, choose Tools | Templates
3+
* and open the template in the editor.
4+
*/
5+
/**
6+
* KarshaAnnotate- Annotation tool for financial documents
7+
*
8+
* Copyright (C) 2013, Lanka Software Foundation and University of Maryland.
9+
*
10+
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General
11+
* Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any
12+
* later version.
13+
*
14+
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
15+
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
16+
* details.
17+
*
18+
* You should have received a copy of the GNU Affero General Public License along with this program. If not, see
19+
* <http://www.gnu.org/licenses/>.
20+
*
21+
* Date Author Changes Sep 4, 2013 Kasun Perera Created
22+
*
23+
*/
24+
package org.yago.javatools.administrative;
25+
26+
27+
import org.yago.javatools.parsers.NounGroup;
28+
29+
/**
30+
* TODO- describe the purpose of the class
31+
*
32+
*/
33+
public class Elements
34+
{
35+
36+
public static void main( String[] args ) throws Exception
37+
{
38+
39+
System.out.println( getHead( "booooooooo" ) );
40+
41+
}
42+
43+
public static String getHead( String category )
44+
{
45+
46+
String elementList[] = splitObject( new NounGroup( category ).description() );
47+
if ( elementList == null || elementList.length == 0 )
48+
{
49+
return ( null );
50+
}
51+
/*
52+
* lelemnts of the elementList
53+
* [0]"NounGroup:
54+
* [1]Original: "+original+"
55+
* [2]Stemmed: "+stemmed()+"
56+
* [3]Determiner: "+determiner+"
57+
* [4]preModifiers: "+preModifier+"
58+
* [5]Head: "+head+"
59+
* [6]Adjective:"+adjective+"
60+
* [7]Preposition: "+preposition+"
61+
* [8]postModifier:\n"+(postModifier==null?"":postModifier.description()));
62+
*
63+
*/
64+
String head[] = elementList[5].split( ":" );
65+
if(head.length<1){
66+
return (null);
67+
}
68+
69+
return (head[1].trim());
70+
}
71+
72+
public static String[] splitObject( Object... a )
73+
{
74+
String objectlist[] = D.toString( a ).split( "\\n" );
75+
76+
return objectlist;
77+
}
78+
}
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
package org.yago.javatools.parsers;
2+
import java.util.Arrays;
3+
import java.util.List;
4+
import java.util.Set;
5+
6+
import org.yago.javatools.administrative.D;
7+
import org.yago.javatools.datatypes.FinalSet;
8+
9+
/**
10+
This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools).
11+
It is licensed under the Creative Commons Attribution License
12+
(see http://creativecommons.org/licenses/by/3.0) by
13+
the YAGO-NAGA team (see http://mpii.de/yago-naga).
14+
15+
16+
17+
18+
19+
The class NounGroup splits a noun group (given by a String) into its
20+
modifiers and its head.<BR>
21+
Example:
22+
<PRE>
23+
System.out.println(new NounGroup("the United States of America").description());
24+
->
25+
NounGroup:
26+
Original: the_United_States_of_America
27+
Determiner: the
28+
Head: State
29+
Plural: true
30+
preModifiers: United
31+
Adjective:
32+
Preposition: of
33+
postModifier:
34+
NounGroup:
35+
Original: America
36+
Determiner:
37+
Head: America
38+
Plural: false
39+
preModifiers:
40+
Preposition:
41+
postModifier:
42+
</PRE>
43+
*/
44+
public class NounGroup {
45+
46+
/** Defines just one function from a String to a boolean */
47+
public interface String2Boolean {
48+
/** Function from a String to a boolean */
49+
boolean apply(String s);
50+
}
51+
52+
/** Tells whether a word is an adjective (currently by a simple heuristics */
53+
public static String2Boolean isAdjective=new String2Boolean() {
54+
public boolean apply(String s) {
55+
return(s.length()>0 && Character.isLowerCase(s.charAt(0)) &&
56+
(s.endsWith("al") || s.endsWith("ed") || s.endsWith("ing")));
57+
}
58+
};
59+
60+
/** Contains determiners*/
61+
public static final Set<String> determiners=new FinalSet<String>(
62+
"the",
63+
"a",
64+
"an",
65+
"this",
66+
"these",
67+
"those"
68+
);
69+
70+
/** Holds prepositions (like "of" etc.) */
71+
public static final FinalSet<String> prepositions=new FinalSet<String>(
72+
",",
73+
"at",
74+
"about",
75+
"and",
76+
"by",
77+
"for",
78+
"from",
79+
"in",
80+
"of",
81+
"on",
82+
"to",
83+
"with",
84+
"who",
85+
"-",
86+
"\u2248",
87+
"under"
88+
);
89+
90+
/** Holds the original noun group */
91+
protected String original;
92+
93+
/** Holds the adjective */
94+
protected String adjective;
95+
96+
/** Holds the preposition */
97+
protected String preposition;
98+
99+
/** Holds the noun group after the preposition */
100+
protected NounGroup postModifier;
101+
102+
/** Holds the head of the noun group */
103+
protected String head;
104+
105+
/** Holds the modifiers before the head */
106+
protected String preModifier;
107+
108+
/** Holds the determiner (if any) */
109+
protected String determiner;
110+
111+
/** Returns the adjective. */
112+
public String adjective() {
113+
return adjective;
114+
}
115+
116+
/**Returns the determiner. */
117+
public String determiner() {
118+
return determiner;
119+
}
120+
121+
/** Returns the head (lowercased singular). */
122+
public String head() {
123+
return head;
124+
}
125+
126+
/**Returns the original. */
127+
public String original() {
128+
return original;
129+
}
130+
131+
/** Returns the postModifier. */
132+
public NounGroup postModifier() {
133+
return postModifier;
134+
}
135+
136+
/** Returns the preModifier. */
137+
public String preModifier() {
138+
return preModifier;
139+
}
140+
141+
/** Returns the preposition.*/
142+
public String preposition() {
143+
return preposition;
144+
}
145+
146+
/** Returns the full name with the head word stemmed */
147+
public String stemmed() {
148+
StringBuilder full=new StringBuilder();
149+
if(preModifier!=null) full.append(preModifier).append(' ');
150+
full.append(PlingStemmer.stem(head.toLowerCase()));
151+
if(adjective!=null) full.append(' ').append(adjective);
152+
if(preposition!=null) full.append(' ').append(preposition);
153+
if(postModifier!=null) full.append(' ').append(postModifier.original());
154+
return(full.toString());
155+
}
156+
157+
/** Stems the head. TRUE if this had any effect */
158+
public boolean stemHead() {
159+
String stemmed=PlingStemmer.stem(head);
160+
boolean result=!stemmed.equals(head);
161+
head=stemmed;
162+
return(result);
163+
}
164+
/** Constructs a noun group from a String */
165+
public NounGroup(String s) {
166+
this(Arrays.asList(s.split("[\\s_]+")));
167+
}
168+
169+
/** Constructs a noun group from a list of words */
170+
public NounGroup(List<String> words) {
171+
// Assemble the original
172+
original=words.toString().replace(", ", " ");
173+
original=original.substring(1,original.length()-1);
174+
175+
// Cut away preceding determiners
176+
if(words.size()>0 && determiners.contains(words.get(0).toLowerCase())) {
177+
determiner=words.get(0).toLowerCase();
178+
words=words.subList(1, words.size());
179+
}
180+
181+
// Locate prepositions (but not in first or last position)
182+
int prepPos;
183+
for(prepPos=1;prepPos<words.size()-1;prepPos++) {
184+
if(prepositions.contains(words.get(prepPos))) {
185+
preposition=words.get(prepPos);
186+
break;
187+
}
188+
}
189+
190+
// Locate "-ing"-adjectives before prepositions (but not at pos 0)
191+
int ingPos;
192+
for(ingPos=1;ingPos<prepPos;ingPos++) {
193+
if(words.get(ingPos).endsWith("ing")) {
194+
adjective=words.get(ingPos);
195+
break;
196+
}
197+
}
198+
199+
// Cut off postmodifier in "Blubs blubbing in blah"
200+
if(preposition!=null && adjective!=null && ingPos==prepPos-1) {
201+
postModifier=new NounGroup(words.subList(prepPos+1, words.size()));
202+
words=words.subList(0, ingPos);
203+
}
204+
// Cut off postmodifier in "Blubs blubbing blah"
205+
else if(adjective!=null) {
206+
postModifier=new NounGroup(words.subList(ingPos+1, words.size()));
207+
words=words.subList(0, ingPos);
208+
}
209+
// Cut off postmodifier in "Blubs in blah"
210+
else if(preposition!=null) {
211+
postModifier=new NounGroup(words.subList(prepPos+1, words.size()));
212+
if(prepPos>1 && isAdjective.apply(words.get(prepPos-1))) {
213+
adjective=words.get(prepPos-1);
214+
words=words.subList(0, prepPos-1);
215+
} else {
216+
words=words.subList(0, prepPos);
217+
}
218+
}
219+
220+
if(words.size()==0) return;
221+
222+
head=words.get(words.size()-1);
223+
if(words.size()>1) {
224+
preModifier=words.subList(0, words.size()-1).toString().replace(", ", "_");
225+
preModifier=preModifier.substring(1, preModifier.length()-1);
226+
}
227+
}
228+
229+
230+
/** Checks if the originals match */
231+
public boolean equals(Object o) {
232+
return(o instanceof NounGroup && ((NounGroup)o).original.equals(original));
233+
}
234+
235+
/** Returns the original */
236+
public String toString() {
237+
return(original);
238+
}
239+
240+
/** Returns all fields in a String */
241+
public String description() {
242+
return("NounGroup:\n"+
243+
" Original: "+original+"\n"+
244+
" Stemmed: "+stemmed()+"\n"+
245+
" Determiner: "+determiner+"\n"+
246+
" preModifiers: "+preModifier+"\n"+
247+
" Head: "+head+"\n"+
248+
" Adjective: "+adjective+"\n"+
249+
" Preposition: "+preposition+"\n"+
250+
" postModifier: \n"+(postModifier==null?"":postModifier.description()));
251+
}
252+
253+
/** Test method */
254+
public static void main(String[] args) throws Exception {
255+
D.p("Enter a noun group and press ENTER. Press CTRL+C to abort");
256+
// while(true) {
257+
// D.p(new NounGroup(D.r()).description());
258+
// }
259+
260+
D.p(new NounGroup("Star_Trek_characters").description());
261+
}
262+
263+
}

0 commit comments

Comments
 (0)