|
package org.yago.javatools.parsers;

import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.yago.javatools.administrative.D;
import org.yago.javatools.datatypes.FinalSet;
| 8 | + |
| 9 | +/** |
| 10 | +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). |
| 11 | +It is licensed under the Creative Commons Attribution License |
| 12 | +(see http://creativecommons.org/licenses/by/3.0) by |
| 13 | +the YAGO-NAGA team (see http://mpii.de/yago-naga). |
| 14 | + |
| 15 | +
|
| 16 | + |
| 17 | + |
| 18 | +
|
| 19 | +The class NounGroup splits a noun group (given by a String) into its |
| 20 | +modifiers and its head.<BR> |
| 21 | +Example: |
| 22 | +<PRE> |
| 23 | + System.out.println(new NounGroup("the United States of America").description()); |
| 24 | + -> |
| 25 | + NounGroup: |
| 26 | + Original: the_United_States_of_America |
| 27 | + Determiner: the |
| 28 | + Head: State |
| 29 | + Plural: true |
| 30 | + preModifiers: United |
| 31 | + Adjective: |
| 32 | + Preposition: of |
| 33 | + postModifier: |
| 34 | + NounGroup: |
| 35 | + Original: America |
| 36 | + Determiner: |
| 37 | + Head: America |
| 38 | + Plural: false |
| 39 | + preModifiers: |
| 40 | + Preposition: |
| 41 | + postModifier: |
| 42 | +</PRE> |
| 43 | +*/ |
| 44 | +public class NounGroup { |
| 45 | + |
| 46 | + /** Defines just one function from a String to a boolean */ |
| 47 | + public interface String2Boolean { |
| 48 | + /** Function from a String to a boolean */ |
| 49 | + boolean apply(String s); |
| 50 | + } |
| 51 | + |
| 52 | + /** Tells whether a word is an adjective (currently by a simple heuristics */ |
| 53 | + public static String2Boolean isAdjective=new String2Boolean() { |
| 54 | + public boolean apply(String s) { |
| 55 | + return(s.length()>0 && Character.isLowerCase(s.charAt(0)) && |
| 56 | + (s.endsWith("al") || s.endsWith("ed") || s.endsWith("ing"))); |
| 57 | + } |
| 58 | + }; |
| 59 | + |
| 60 | + /** Contains determiners*/ |
| 61 | + public static final Set<String> determiners=new FinalSet<String>( |
| 62 | + "the", |
| 63 | + "a", |
| 64 | + "an", |
| 65 | + "this", |
| 66 | + "these", |
| 67 | + "those" |
| 68 | + ); |
| 69 | + |
| 70 | + /** Holds prepositions (like "of" etc.) */ |
| 71 | + public static final FinalSet<String> prepositions=new FinalSet<String>( |
| 72 | + ",", |
| 73 | + "at", |
| 74 | + "about", |
| 75 | + "and", |
| 76 | + "by", |
| 77 | + "for", |
| 78 | + "from", |
| 79 | + "in", |
| 80 | + "of", |
| 81 | + "on", |
| 82 | + "to", |
| 83 | + "with", |
| 84 | + "who", |
| 85 | + "-", |
| 86 | + "\u2248", |
| 87 | + "under" |
| 88 | + ); |
| 89 | + |
| 90 | + /** Holds the original noun group */ |
| 91 | + protected String original; |
| 92 | + |
| 93 | + /** Holds the adjective */ |
| 94 | + protected String adjective; |
| 95 | + |
| 96 | + /** Holds the preposition */ |
| 97 | + protected String preposition; |
| 98 | + |
| 99 | + /** Holds the noun group after the preposition */ |
| 100 | + protected NounGroup postModifier; |
| 101 | + |
| 102 | + /** Holds the head of the noun group */ |
| 103 | + protected String head; |
| 104 | + |
| 105 | + /** Holds the modifiers before the head */ |
| 106 | + protected String preModifier; |
| 107 | + |
| 108 | + /** Holds the determiner (if any) */ |
| 109 | + protected String determiner; |
| 110 | + |
| 111 | + /** Returns the adjective. */ |
| 112 | + public String adjective() { |
| 113 | + return adjective; |
| 114 | + } |
| 115 | + |
| 116 | + /**Returns the determiner. */ |
| 117 | + public String determiner() { |
| 118 | + return determiner; |
| 119 | + } |
| 120 | + |
| 121 | + /** Returns the head (lowercased singular). */ |
| 122 | + public String head() { |
| 123 | + return head; |
| 124 | + } |
| 125 | + |
| 126 | + /**Returns the original. */ |
| 127 | + public String original() { |
| 128 | + return original; |
| 129 | + } |
| 130 | + |
| 131 | + /** Returns the postModifier. */ |
| 132 | + public NounGroup postModifier() { |
| 133 | + return postModifier; |
| 134 | + } |
| 135 | + |
| 136 | + /** Returns the preModifier. */ |
| 137 | + public String preModifier() { |
| 138 | + return preModifier; |
| 139 | + } |
| 140 | + |
| 141 | + /** Returns the preposition.*/ |
| 142 | + public String preposition() { |
| 143 | + return preposition; |
| 144 | + } |
| 145 | + |
| 146 | + /** Returns the full name with the head word stemmed */ |
| 147 | + public String stemmed() { |
| 148 | + StringBuilder full=new StringBuilder(); |
| 149 | + if(preModifier!=null) full.append(preModifier).append(' '); |
| 150 | + full.append(PlingStemmer.stem(head.toLowerCase())); |
| 151 | + if(adjective!=null) full.append(' ').append(adjective); |
| 152 | + if(preposition!=null) full.append(' ').append(preposition); |
| 153 | + if(postModifier!=null) full.append(' ').append(postModifier.original()); |
| 154 | + return(full.toString()); |
| 155 | + } |
| 156 | + |
| 157 | + /** Stems the head. TRUE if this had any effect */ |
| 158 | + public boolean stemHead() { |
| 159 | + String stemmed=PlingStemmer.stem(head); |
| 160 | + boolean result=!stemmed.equals(head); |
| 161 | + head=stemmed; |
| 162 | + return(result); |
| 163 | + } |
| 164 | + /** Constructs a noun group from a String */ |
| 165 | + public NounGroup(String s) { |
| 166 | + this(Arrays.asList(s.split("[\\s_]+"))); |
| 167 | + } |
| 168 | + |
| 169 | + /** Constructs a noun group from a list of words */ |
| 170 | + public NounGroup(List<String> words) { |
| 171 | + // Assemble the original |
| 172 | + original=words.toString().replace(", ", " "); |
| 173 | + original=original.substring(1,original.length()-1); |
| 174 | + |
| 175 | + // Cut away preceding determiners |
| 176 | + if(words.size()>0 && determiners.contains(words.get(0).toLowerCase())) { |
| 177 | + determiner=words.get(0).toLowerCase(); |
| 178 | + words=words.subList(1, words.size()); |
| 179 | + } |
| 180 | + |
| 181 | + // Locate prepositions (but not in first or last position) |
| 182 | + int prepPos; |
| 183 | + for(prepPos=1;prepPos<words.size()-1;prepPos++) { |
| 184 | + if(prepositions.contains(words.get(prepPos))) { |
| 185 | + preposition=words.get(prepPos); |
| 186 | + break; |
| 187 | + } |
| 188 | + } |
| 189 | + |
| 190 | + // Locate "-ing"-adjectives before prepositions (but not at pos 0) |
| 191 | + int ingPos; |
| 192 | + for(ingPos=1;ingPos<prepPos;ingPos++) { |
| 193 | + if(words.get(ingPos).endsWith("ing")) { |
| 194 | + adjective=words.get(ingPos); |
| 195 | + break; |
| 196 | + } |
| 197 | + } |
| 198 | + |
| 199 | + // Cut off postmodifier in "Blubs blubbing in blah" |
| 200 | + if(preposition!=null && adjective!=null && ingPos==prepPos-1) { |
| 201 | + postModifier=new NounGroup(words.subList(prepPos+1, words.size())); |
| 202 | + words=words.subList(0, ingPos); |
| 203 | + } |
| 204 | + // Cut off postmodifier in "Blubs blubbing blah" |
| 205 | + else if(adjective!=null) { |
| 206 | + postModifier=new NounGroup(words.subList(ingPos+1, words.size())); |
| 207 | + words=words.subList(0, ingPos); |
| 208 | + } |
| 209 | + // Cut off postmodifier in "Blubs in blah" |
| 210 | + else if(preposition!=null) { |
| 211 | + postModifier=new NounGroup(words.subList(prepPos+1, words.size())); |
| 212 | + if(prepPos>1 && isAdjective.apply(words.get(prepPos-1))) { |
| 213 | + adjective=words.get(prepPos-1); |
| 214 | + words=words.subList(0, prepPos-1); |
| 215 | + } else { |
| 216 | + words=words.subList(0, prepPos); |
| 217 | + } |
| 218 | + } |
| 219 | + |
| 220 | + if(words.size()==0) return; |
| 221 | + |
| 222 | + head=words.get(words.size()-1); |
| 223 | + if(words.size()>1) { |
| 224 | + preModifier=words.subList(0, words.size()-1).toString().replace(", ", "_"); |
| 225 | + preModifier=preModifier.substring(1, preModifier.length()-1); |
| 226 | + } |
| 227 | + } |
| 228 | + |
| 229 | + |
| 230 | + /** Checks if the originals match */ |
| 231 | + public boolean equals(Object o) { |
| 232 | + return(o instanceof NounGroup && ((NounGroup)o).original.equals(original)); |
| 233 | + } |
| 234 | + |
| 235 | + /** Returns the original */ |
| 236 | + public String toString() { |
| 237 | + return(original); |
| 238 | + } |
| 239 | + |
| 240 | + /** Returns all fields in a String */ |
| 241 | + public String description() { |
| 242 | + return("NounGroup:\n"+ |
| 243 | + " Original: "+original+"\n"+ |
| 244 | + " Stemmed: "+stemmed()+"\n"+ |
| 245 | + " Determiner: "+determiner+"\n"+ |
| 246 | + " preModifiers: "+preModifier+"\n"+ |
| 247 | + " Head: "+head+"\n"+ |
| 248 | + " Adjective: "+adjective+"\n"+ |
| 249 | + " Preposition: "+preposition+"\n"+ |
| 250 | + " postModifier: \n"+(postModifier==null?"":postModifier.description())); |
| 251 | + } |
| 252 | + |
| 253 | + /** Test method */ |
| 254 | + public static void main(String[] args) throws Exception { |
| 255 | + D.p("Enter a noun group and press ENTER. Press CTRL+C to abort"); |
| 256 | +// while(true) { |
| 257 | +// D.p(new NounGroup(D.r()).description()); |
| 258 | +// } |
| 259 | + |
| 260 | + D.p(new NounGroup("Star_Trek_characters").description()); |
| 261 | + } |
| 262 | + |
| 263 | +} |
0 commit comments