@@ -462,6 +462,7 @@ ESEDING = "es"|"ed"|"ing"
/* Character-class macros.
   G    - any single character that is not one of the listed whitespace /
          line-separator characters and is not an underscore.
   GM   - the same as G, but additionally excludes the hyphen "-".
   SKIP - exactly the whitespace / line-separator characters that G and GM exclude. */
462462G = [^ \t\r\n \u2028\u2029\u000B\u000C\u0085 _]
463463GM = [^ \t\r\n \u2028\u2029\u000B\u000C\u0085 _-]
464464SKIP = [ \t\r\n \u2028\u2029\u000B\u000C\u0085 ]
/* DIGIT - a single ASCII decimal digit. No blanks inside the brackets: in a JFlex
   character class a blank stands for itself and would become part of the class. */
465+ DIGIT = [0-9]
465466
466467/* adjectives ending in -e, such as tame, which become tamer, tamest */
467468E_ADJS = "able"|"absolute"|"abstruse"|"acute"|"ample"|"austere"|"bare"|"base"|"blithe"|"blonde"|"blue"|"brave"|"brittle"|"brusque"|"capable"|"chaste"|"choice"|"close"|"coarse"|"complete"|"concise"|"crude"|"cute"|"demure"|"dense"|"dire"|"divine"|"doggone"|"eerie"|"extreme"|"false"|"feeble"|"fickle"|"fierce"|"fine"|"free"|"game"|"gauche"|"gentle"|"gladsome"|"grave"|"grewsome"|"gruesome"|"hale"|"handsome"|"hoarse"|"huge"|"humane"|"humble"|"idle"|"immense"|"inane"|"insane"|"intense"|"irate"|"kittle"|"lame"|"large"|"late"|"lithe"|"little"|"loose"|"mature"|"mere"|"mickle"|"minute"|"mute"|"naive"|"naïve"|"negative"|"nice"|"nimble"|"noble"|"nude"|"obscene"|"obscure"|"obtuse"|"opaque"|"pale"|"polite"|"positive"|"possible"|"precise"|"private"|"pure"|"purple"|"rare"|"rathe"|"remote"|"resolute"|"rife"|"ripe"|"rude"|"safe"|"sage"|"sane"|"savage"|"scarce"|"secure"|"sensible"|"serene"|"severe"|"simple"|"sincere"|"sore"|"spare"|"sparse"|"spruce"|"square"|"stable"|"stale"|"strange"|"suave"|"sublime"|"subtile"|"subtle"|"supple"|"supreme"|"sure"|"svelte"|"tame"|"tense"|"terse"|"trite"|"true"|"unique"|"unripe"|"unsafe"|"unstable"|"untrue"|"unwise"|"urbane"|"vague"|"vile"|"white"|"wholesome"|"wide"|"winsome"|"wise"|"yare"
@@ -2027,7 +2028,9 @@ S_ENDING_DEMONYMS = "Afghan"|"Afghani"|"African"|"Albanian"|"Alexandrine"|"Alger
20272028<verb,noun,any> {GM} * "-" { // The first word isn't stemmed separately, but the second half can be
20282029 String stem = common_noun_stem();
20292030 String n = next();
2030- if (n == null ) {
2031+ if (n == null || n. startsWith(" _" )) {
2032+ // TODO: would be nice to use the rest of the lemma rules
2033+ // when the token just ends with "-"
20312034 return stem;
20322035 } else {
20332036 return stem. concat(n);
@@ -2107,6 +2110,9 @@ S_ENDING_DEMONYMS = "Afghan"|"Afghani"|"African"|"Albanian"|"Alexandrine"|"Alger
/* Demonyms from the S_ENDING_DEMONYMS macro, matched in the <scan> state only when the
   trailing context is one of the tags _NN, _NNP, _NNS, _NNPS or _JJ.
   - Bare demonym: the result of common_noun_stem() is passed through capitalise().
   - Demonym followed by a literal "s": the result of stem(...) is passed through capitalise().
   NOTE(review): the meaning of stem()'s three arguments is not visible in this hunk --
   confirm against its definition before changing them. */
21072110<scan> {S_ENDING_DEMONYMS} / _( NN( P?)( S?)| JJ) { return (capitalise(common_noun_stem())); }
21082111<scan> {S_ENDING_DEMONYMS} s/ _( NN( P?)( S?)| JJ) { return (capitalise(stem(1 , " " , " s" ))); }
21092112
2113+ /* Remove commas from numbers tagged _CD, e.g. 5,000 -> 5000.
   The token must contain at least one digit, so a token made only of commas is left to
   the other rules instead of being reduced to an empty lemma. An optional fractional
   part ("." followed by digits) is kept unchanged.
   String.replace is used because "," is a literal, not a regular expression. */
2114+ <scan> ({DIGIT}|",")*{DIGIT}({DIGIT}|",")*([.]{DIGIT}+)?/_CD { return yytext().replace(",", ""); }
2115+
/* Irregular comparative / superlative forms, matched in the <scan> state by tag:
   "worse" before _JJR and "worst" before _JJS go through stem(...) with a "bad" argument;
   "worse" before _RBR goes through stem(...) with a "badly" argument.
   NOTE(review): presumably stem(5, ...) replaces the whole five-letter token with the
   given base form -- confirm against the definition of stem(). */
21102116<scan> "worse" / _JJR { return (stem(5 , " bad" , " " )); }
21112117<scan> "worst" / _JJS { return (stem(5 , " bad" , " " )); }
21122118<scan> "worse" / _RBR { return (stem(5 , " badly" , " " )); }
0 commit comments