2 * Author: Kasun Gajasinghe
3 * E-Mail: kasunbg AT gmail DOT com
6 * usage: stemmer(word);
7 * ex: var stem = stemmer(foobar);
8 * Implementation of the stemming algorithm from http://snowball.tartarus.org/algorithms/french/stemmer.html
12 * Copyright (c) 2010, Kasun Gajasinghe. All rights reserved.
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
17 * 1. Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
25 * THIS SOFTWARE IS PROVIDED BY KASUN GAJASINGHE ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
26 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
27 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KASUN GAJASINGHE BE LIABLE FOR ANY DIRECT,
28 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
30 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
32 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 var stemmer = function(word){
37 // Letters in French include the following accented forms,
38 // â à ç ë é ê è ï î ô û ù
39 // The following letters are vowels:
40 // a e i o u y â à ë é ê è ï î ô û ù
42 word = word.toLowerCase();
44 word = word.replace(/qu/g, 'qU'); //have to perform first, as after the operation, capital U is not treated as a vowel
45 word = word.replace(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/g, '$1U$2');
46 word = word.replace(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/g, '$1I$2');
47 word = word.replace(/([aeiouyâàëéêèïîôûù])y/g, '$1Y');
48 word = word.replace(/y([aeiouyâàëéêèïîôûù])/g, 'Y$1');
52 if(word.search(/^(par|col|tap)/) != -1 || word.search(/^[aeiouyâàëéêèïîôûù]{2}/) != -1){
53 rv = word.substring(3);
56 rvIndex = word.substring(1).search(/[aeiouyâàëéêèïîôûù]/);
58 rvIndex +=2; //+2 is to supplement the substring(1) used to find rvIndex
59 rv = word.substring(rvIndex);
61 rvIndex = word.length;
65 // R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
66 // R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
67 var r1Index = word.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
71 r1 = word.substring(r1Index);
73 r1Index = word.length;
79 r2Index = r1.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
82 r2 = r1.substring(r2Index);
86 r2Index = word.length;
89 if (r1Index != -1 && r1Index < 3) {
91 r1 = word.substring(r1Index);
95 Step 1: Standard suffix removal
97 var a1Index = word.search(/(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/);
98 var a2Index = word.search(/(atrice|ateur|ation|atrices|ateurs|ations)$/);
99 var a3Index = word.search(/(logie|logies)$/);
100 var a4Index = word.search(/(usion|ution|usions|utions)$/);
101 var a5Index = word.search(/(ence|ences)$/);
102 var a6Index = word.search(/(ement|ements)$/);
103 var a7Index = word.search(/(ité|ités)$/);
104 var a8Index = word.search(/(if|ive|ifs|ives)$/);
105 var a9Index = word.search(/(eaux)$/);
106 var a10Index = word.search(/(aux)$/);
107 var a11Index = word.search(/(euse|euses)$/);
108 var a12Index = word.search(/[^aeiouyâàëéêèïîôûù](issement|issements)$/);
109 var a13Index = word.search(/(amment)$/);
110 var a14Index = word.search(/(emment)$/);
111 var a15Index = word.search(/[aeiouyâàëéêèïîôûù](ment|ments)$/);
113 if(a1Index != -1 && a1Index >= r2Index){
114 word = word.substring(0,a1Index);
115 } else if(a2Index != -1 && a2Index >= r2Index){
116 word = word.substring(0,a2Index);
117 var a2Index2 = word.search(/(ic)$/);
118 if(a2Index2 != -1 && a2Index2 >= r2Index){
119 word = word.substring(0, a2Index2); //if preceded by ic, delete if in R2,
120 } else { //else replace by iqU
121 word = word.replace(/(ic)$/,'iqU');
123 } else if(a3Index != -1 && a3Index >= r2Index){
124 word = word.replace(/(logie|logies)$/,'log'); //replace with log if in R2
125 } else if(a4Index != -1 && a4Index >= r2Index){
126 word = word.replace(/(usion|ution|usions|utions)$/,'u'); //replace with u if in R2
127 } else if(a5Index != -1 && a5Index >= r2Index){
128 word = word.replace(/(ence|ences)$/,'ent'); //replace with ent if in R2
129 } else if(a6Index != -1 && a6Index >= rvIndex){
130 word = word.substring(0,a6Index);
131 if(word.search(/(iv)$/) >= r2Index){
132 word = word.replace(/(iv)$/, '');
133 if(word.search(/(at)$/) >= r2Index){
134 word = word.replace(/(at)$/, '');
136 } else if(word.search(/(eus)$/) != -1){
137 var a6Index2 = word.search(/(eus)$/);
138 if(a6Index2 >=r2Index){
139 word = word.substring(0, a6Index2);
140 } else if(a6Index2 >= r1Index){
141 word = word.substring(0,a6Index2)+"eux";
143 } else if(word.search(/(abl|iqU)$/) >= r2Index){
144 word = word.replace(/(abl|iqU)$/,''); //if preceded by abl or iqU, delete if in R2,
145 } else if(word.search(/(ièr|Ièr)$/) >= rvIndex){
146 word = word.replace(/(ièr|Ièr)$/,'i'); //if preceded by abl or iqU, delete if in R2,
148 } else if(a7Index != -1 && a7Index >= r2Index){
149 word = word.substring(0,a7Index); //delete if in R2
150 if(word.search(/(abil)$/) != -1){ //if preceded by abil, delete if in R2, else replace by abl, otherwise,
151 var a7Index2 = word.search(/(abil)$/);
152 if(a7Index2 >=r2Index){
153 word = word.substring(0, a7Index2);
155 word = word.substring(0,a7Index2)+"abl";
157 } else if(word.search(/(ic)$/) != -1){
158 var a7Index3 = word.search(/(ic)$/);
159 if(a7Index3 != -1 && a7Index3 >= r2Index){
160 word = word.substring(0, a7Index3); //if preceded by ic, delete if in R2,
161 } else { //else replace by iqU
162 word = word.replace(/(ic)$/,'iqU');
164 } else if(word.search(/(iv)$/) != r2Index){
165 word = word.replace(/(iv)$/,'');
167 } else if(a8Index != -1 && a8Index >= r2Index){
168 word = word.substring(0,a8Index);
169 if(word.search(/(at)$/) >= r2Index){
170 word = word.replace(/(at)$/, '');
171 if(word.search(/(ic)$/) >= r2Index){
172 word = word.replace(/(ic)$/, '');
173 } else { word = word.replace(/(ic)$/, 'iqU'); }
175 } else if(a9Index != -1){ word = word.replace(/(eaux)/,'eau')
176 } else if(a10Index >= r1Index){ word = word.replace(/(aux)/,'al')
177 } else if(a11Index != -1 ){
178 var a11Index2 = word.search(/(euse|euses)$/);
179 if(a11Index2 >=r2Index){
180 word = word.substring(0, a11Index2);
181 } else if(a11Index2 >= r1Index){
182 word = word.substring(0, a11Index2)+"eux";
184 } else if(a12Index!=-1 && a12Index>=r1Index){
185 word = word.substring(0,a12Index+1); //+1- amendment to non-vowel
186 } else if(a13Index!=-1 && a13Index>=rvIndex){
187 word = word.replace(/(amment)$/,'ant');
188 } else if(a14Index!=-1 && a14Index>=rvIndex){
189 word = word.replace(/(emment)$/,'ent');
190 } else if(a15Index!=-1 && a15Index>=rvIndex){
191 word = word.substring(0,a15Index+1);
194 /* Step 2a: Verb suffixes beginning i*/
195 var wordStep1 = word;
196 var step2aDone = false;
197 if(oriWord == word.toLowerCase() || oriWord.search(/(amment|emment|ment|ments)$/) != -1){
199 var b1Regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i;
200 if(word.search(b1Regex) >= rvIndex){
201 word = word.replace(b1Regex,'$1');
205 /* Step 2b: Other verb suffixes*/
206 if (step2aDone && wordStep1 == word) {
207 if (word.search(/(ions)$/) >= r2Index) {
208 word = word.replace(/(ions)$/, '');
210 var b2Regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i;
211 if (word.search(b2Regex) >= rvIndex) {
212 word = word.replace(b2Regex, '');
214 var b3Regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i;
215 if (word.search(b3Regex) >= rvIndex) {
216 word = word.replace(b3Regex, '');
218 var b3Regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i;
219 if (word.search(b3Regex2) >= rvIndex) {
220 word = word.replace(b3Regex2, '');
227 if(oriWord != word.toLowerCase()){
230 if(word.search(/Y$/) != -1) {
231 word = word.replace(/Y$/, 'i');
232 } else if(word.search(/ç$/) != -1){
233 word = word.replace(/ç$/, 'c');
237 //If the word ends s, not preceded by a, i, o, u, è or s, delete it.
238 if (word.search(/([^aiouès])s$/) >= rvIndex) {
239 word = word.replace(/([^aiouès])s$/, '$1');
241 var e1Index = word.search(/ion$/);
242 if (e1Index >= r2Index && word.search(/[st]ion$/) >= rvIndex) {
243 word = word.substring(0, e1Index);
245 var e2Index = word.search(/(ier|ière|Ier|Ière)$/);
246 if (e2Index != -1 && e2Index >= rvIndex) {
247 word = word.substring(0, e2Index) + "i";
249 if (word.search(/e$/) >= rvIndex) {
250 word = word.replace(/e$/, ''); //delete last e
251 } else if (word.search(/guë$/) >= rvIndex) {
252 word = word.replace(/guë$/, 'gu');
258 /* Step 5: Undouble */
259 //word = word.replace(/(en|on|et|el|eil)(n|t|l)$/,'$1');
260 word = word.replace(/(en|on)(n)$/,'$1');
261 word = word.replace(/(ett)$/,'et');
262 word = word.replace(/(el|eil)(l)$/,'$1');
264 /* Step 6: Un-accent */
265 word = word.replace(/[éè]([^aeiouyâàëéêèïîôûù]+)$/,'e$1');
266 word = word.toLowerCase();
270 var eqOut = new Array();
271 var noteqOut = new Array();
274 To test the stemming, create two arrays named "voc" and "COut" which are for vocabualary and the stemmed output.
275 Then add the vocabulary strings and output strings. This method will generate the stemmed output for "voc" and will
276 compare the output with COut.
277 (I used porter's voc and out files and did a regex to convert them to js objects. regex: /");\nvoc.push("/g . This
278 will add strings to voc array such that output would look like: voc.push("foobar"); ) drop me an email for any help.
281 var start = new Date().getTime(); //execution time
284 noteqOut = new Array();
285 for(var k=0;k<voc.length;k++){
286 if(COut[k]==stemmer(voc[k])){
288 eqOut.push("v: "+voc[k]+" c: "+COut[k]);
290 noteqOut.push(voc[k]+", c: "+COut[k]+" s:"+stemmer(voc[k]));
293 var end = new Date().getTime(); //execution time
294 var time = end-start;
295 alert("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
296 //console.log("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");