]> git.stg.codes - stg.git/blob - doc/xslt/webhelp/template/content/search/stemmers/en_stemmer.js
Set output encoding to utf-8.
[stg.git] / doc / xslt / webhelp / template / content / search / stemmers / en_stemmer.js
1 // Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
2 // paper, in
3 //
4 //  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
5 //  no. 3, pp 130-137,
6 //
7 // see also http://www.tartarus.org/~martin/PorterStemmer
8
9 // Release 1 by 'andargor', Jul 2004
10 // Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
11
12
/**
 * Porter stemmer for English words.
 *
 * The suffix tables and every regular expression are built once, when this
 * closure is created, and shared by all calls. None of the expressions uses
 * the `g` or `y` flag, so `test`/`exec` keep no state between calls.
 *
 * @param {string} w - Word to stem. The vowel classes only list lowercase
 *                     letters, so the input is expected to be lowercase.
 * @returns {string} The stemmed word. Words shorter than three characters
 *                   are returned unchanged.
 */
var stemmer = (function(){
        var step2list = {
                        "ational" : "ate",
                        "tional" : "tion",
                        "enci" : "ence",
                        "anci" : "ance",
                        "izer" : "ize",
                        "bli" : "ble",
                        "alli" : "al",
                        "entli" : "ent",
                        "eli" : "e",
                        "ousli" : "ous",
                        "ization" : "ize",
                        "ation" : "ate",
                        "ator" : "ate",
                        "alism" : "al",
                        "iveness" : "ive",
                        "fulness" : "ful",
                        "ousness" : "ous",
                        "aliti" : "al",
                        "iviti" : "ive",
                        "biliti" : "ble",
                        "logi" : "log"
                },

                step3list = {
                        "icate" : "ic",
                        "ative" : "",
                        "alize" : "al",
                        "iciti" : "ic",
                        "ical" : "ic",
                        "ful" : "",
                        "ness" : ""
                },

                c = "[^aeiou]",          // consonant
                v = "[aeiouy]",          // vowel
                C = c + "[^aeiouy]*",    // consonant sequence
                V = v + "[aeiou]*",      // vowel sequence

                mgr0 = "^(" + C + ")?" + V + C,               // [C]VC... is m>0
                meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$",  // [C]VC[V] is m=1
                mgr1 = "^(" + C + ")?" + V + C + V + C,       // [C]VCVC... is m>1
                s_v = "^(" + C + ")?" + v,                   // vowel in stem

                // Measure / shape tests, compiled once instead of on every call.
                reMgr0 = new RegExp(mgr0),
                reMeq1 = new RegExp(meq1),
                reMgr1 = new RegExp(mgr1),
                reVowelInStem = new RegExp(s_v),
                reCvc = new RegExp("^" + C + v + "[^aeiouwxy]$"),   // consonant-vowel-consonant, last not w/x/y
                reDoubleConsonant = /([^aeiouylsz])\1$/,            // doubled final consonant other than l, s, z
                reLastChar = /.$/,

                // Suffix matchers, one per step of the algorithm.
                reStep1aEs = /^(.+?)(ss|i)es$/,
                reStep1aS = /^(.+?)([^s])s$/,
                reStep1bEed = /^(.+?)eed$/,
                reStep1bEdIng = /^(.+?)(ed|ing)$/,
                reStep1bRestoreE = /(at|bl|iz)$/,
                reStep1c = /^(.+?)y$/,
                reStep2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/,
                reStep3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/,
                reStep4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/,
                reStep4Ion = /^(.+?)(s|t)(ion)$/,
                reStep5E = /^(.+?)e$/,
                reStep5Ll = /ll$/;

        return function (w) {
                var     stem,
                        firstch,
                        fp;

                if (w.length < 3) { return w; }

                // An initial "y" is upper-cased so the vowel class (which lists
                // only lowercase "y") does not match it; it is restored at the end.
                firstch = w.substr(0,1);
                if (firstch === "y") {
                        w = "Y" + w.substr(1);
                }

                // Step 1a
                if (reStep1aEs.test(w)) { w = w.replace(reStep1aEs,"$1$2"); }
                else if (reStep1aS.test(w)) { w = w.replace(reStep1aS,"$1$2"); }

                // Step 1b
                fp = reStep1bEed.exec(w);
                if (fp) {
                        // "eed" -> "ee" when the stem has m>0
                        if (reMgr0.test(fp[1])) {
                                w = w.replace(reLastChar,"");
                        }
                } else {
                        fp = reStep1bEdIng.exec(w);
                        if (fp) {
                                stem = fp[1];
                                if (reVowelInStem.test(stem)) {
                                        w = stem;
                                        if (reStep1bRestoreE.test(w)) { w = w + "e"; }
                                        else if (reDoubleConsonant.test(w)) { w = w.replace(reLastChar,""); }
                                        else if (reCvc.test(w)) { w = w + "e"; }
                                }
                        }
                }

                // Step 1c
                fp = reStep1c.exec(w);
                if (fp) {
                        stem = fp[1];
                        if (reVowelInStem.test(stem)) { w = stem + "i"; }
                }

                // Step 2
                fp = reStep2.exec(w);
                if (fp) {
                        stem = fp[1];
                        if (reMgr0.test(stem)) {
                                w = stem + step2list[fp[2]];
                        }
                }

                // Step 3
                fp = reStep3.exec(w);
                if (fp) {
                        stem = fp[1];
                        if (reMgr0.test(stem)) {
                                w = stem + step3list[fp[2]];
                        }
                }

                // Step 4
                fp = reStep4.exec(w);
                if (fp) {
                        stem = fp[1];
                        if (reMgr1.test(stem)) {
                                w = stem;
                        }
                } else {
                        // "ion" is only removed after "s" or "t", which stays in the stem
                        fp = reStep4Ion.exec(w);
                        if (fp) {
                                stem = fp[1] + fp[2];
                                if (reMgr1.test(stem)) {
                                        w = stem;
                                }
                        }
                }

                // Step 5
                fp = reStep5E.exec(w);
                if (fp) {
                        stem = fp[1];
                        if (reMgr1.test(stem) || (reMeq1.test(stem) && !reCvc.test(stem))) {
                                w = stem;
                        }
                }

                if (reStep5Ll.test(w) && reMgr1.test(w)) {
                        w = w.replace(reLastChar,"");
                }

                // and turn initial Y back to y
                if (firstch === "y") {
                        w = "y" + w.substr(1);
                }

                return w;
        };
})();