Script Library: porterstemming.r

Script Library: 1247 scripts
Archive version of: porterstemming.r ... version: 1 ... dalebrearcli 29-Mar-2009

Amendment note: new script || Publicly available? Yes
REBOL [
    file: %porterstemming.r
    date: 18-Nov-2007
    title: "Porter Stemming Algorithm"
    version: 1.0.1
    organization: "Digital Bear Consulting"
    url: http://www.digital-bear.com
    author: "Dale K. Brearcliffe"
    email: %daleb@digital-bear.com
    copyright:  "Copyright (c) 2007, 2009 Dale K. Brearcliffe"
    license: {This program is free software: you can redistribute it and/or
              modify it under the terms of the GNU Lesser General Public License
              as published by the Free Software Foundation, either version 3 of
              the License, or (at your option) any later version.
              This program is distributed in the hope that it will be useful,
              but WITHOUT ANY WARRANTY; without even the implied warranty of
              MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
              GNU Lesser General Public License for more details.
              You should have received a copy of the GNU Lesser General Public
              License along with this program.  If not, see 
              <http://www.gnu.org/licenses/>.}
    purpose: {Applies the Porter Stemming algorithm as presented in:
              Porter, 1980, An algorithm for suffix stripping, Program,
              Vol. 14, no. 3, pp 130-137.}
    remarks: {This program is based on the modified version of Porter's stemming
              algorithm posted at: http://tartarus.org/~martin/PorterStemmer/
              The program was tested with the sample test vocabulary found at
              the above web site. In testing it created the same output results
              as the sample output.
              Note that this program will force all text to lower case and
              return results as lower case.}
    usage: {do %porterstemming.r
            stemmedWord: porterStem word
           }
    library: [
        level: 'intermediate
        platform: 'all
        type: [function tool]
        domain: [text text-processing]
        tested-under: [view 2.7.6.3.1 XP]
        support: none
        license: LGPL
        see-also: http://tartarus.org/~martin/PorterStemmer/
    ]
]

porterStem: function [
 "Stems Words"
 word
][
][

word: lowercase word

;Check for minimum length
if (length? word) < 3 [return word]

;Set variables
vowels: "aeiou"
vowel: charset join vowels {y}
consonant: charset ["bcdfghjklmnpqrstvwxYz"]
consonant1: charset ["bcdfghjklmnpqrstvz"]
cvc: [1 consonant 1 vowel 1 consonant]
V: [some vowel]
C: [some consonant]
C1: [some consonant1]
VC: [V C]
ruleM: [(m: 0) any C VC (m: m + 1) VC (m: m + 1) VC (m: m + 1) VC (m: m + 1) any V]
listStep1a: [{sses} {ss} {ies} {i} {ss} {ss} {s} {}]
listStep1b: [{at} {ate} {bl} {ble} {iz} {ize} {bb} {b} {cc} {c} {dd} {d} {ff} {f} {gg} {g} {hh} {h} {jj} {j} {kk} {k} {ll} {ll} {mm} {m} {nn} {n} {pp} {p} {qq} {q} {rr} {r} {ss} {ss} {tt} {t} {vv} {v} {ww} {w} {xx} {x} {zz} {zz}]
listStep2: [{ational} {ate} {tional} {tion} {enci} {ence} {anci} {ance} {izer} {ize} {logi} {log} {bli} {ble} {alli} {al} {entli} {ent} {eli} {e} {ousli} {ous} {ization} {ize} {ation} {ate} {ator} {ate} {alism} {al} {iveness} {ive} {fulness} {ful} {ousness} {ous} {aliti} {al} {iviti} {ive} {biliti} {ble}]
listStep3: [{icate} {ic} {ative} {} {alize} {al} {iciti} {ic} {ical} {ic} {ful} {} {ness} {}]
listStep4: [{al} {} {ance} {} {ence} {} {er} {} {ic} {} {able} {} {ible} {} {ant} {} {ement} {} {ment} {} {ent} {} {ion} {} {ou} {} {ism} {} {ate} {} {iti} {} {ous} {} {ive} {} {ize} {}]

;partOfWord - Given a word and a suffix of interest, the function splits the word,
;returning a block containing the stem, the suffix and a boolean that is set to
;true if the suffix was found in the word.

partOfWord: function [
 "Creates parts of word breakdown. Returns a block with stem, suffix & pointer."
 arg1 [string!]
 arg2 [string!]
][
 l1
 l2
 matched
 returnvalue
 stem
 suffix
][
matched: false
l1: length? arg1
l2: length? arg2
stem: copy {}
suffix: copy {}
if l2 < l1 [
 suffix: rightString arg1 l2
 stem: leftString arg1 (l1 - l2)
 if suffix == arg2 [matched: true]
]
returnValue: copy []
append returnValue stem
append returnValue suffix
append returnValue matched
return returnValue
]

hasVowel?: function [
 "Returns true if the passed string contains a vowel."
 arg1 [string!]
][
 returnValue
 t
][
 t: intersect arg1 (join vowels {y})
 returnValue: false
 if t <> "" [returnValue: true]
 return returnValue
]

leftString: function [
   "Returns the left most arg2 characters of string arg1."
   arg1 [string!]
   arg2 [number!]
   ][
   l
   ][
   l: length? arg1
   if (l <= arg2) [return arg1]
   if (arg2 = 0) [return {}]
   if (arg2 < 0) [arg2: 0 - arg2]
   return copy/part arg1 arg2
]

rightString: function [
   "Returns the right most arg2 characters of string arg1."
   arg1 [string!]
   arg2 [number!]
   ][
   l
   lt
   ][
   l: length? arg1
   if (l <= arg2) [return arg1]
   if arg2 = 0 [return {}]
   either arg2 > 0 [
     lt: 0 - arg2
   ][
     lt: arg2
   ]
   return skip tail arg1 lt
]
 
;Fix the problem with the status of 'y' by changing the lower case 'y' to 
;uppercase in those cases where 'y' should be treated as a consonant

foreach letter vowels [
 replace word join letter {y} join letter {Y}
]
if (leftString word 1) == {y} [replace word {y} {Y}]

;Step 1a removes plurality
;Rule: SSES -> SS                  caresses  ->  caress
;Rule: IES  -> I                   ponies    ->  poni
;                                  ties      ->  ti
;Rule: SS   -> SS                  caress    ->  caress
;Rule: S    ->                     cats      ->  cat

forskip listStep1a 2 [
 partOfWordResults: partOfWord word first listStep1a
 stem: first partOfWordResults
 ptr: third partOfWordResults
 if ptr [
  word: join stem second listStep1a
  break
 ]
]
listStep1a: head listStep1a

;Step 1b removes past participles
;Rule: (m>0) EED -> EE              feed      ->  feed
;                                   agreed    ->  agree

step1bDone: false
step1b1: false
partOfWordResults: partOfWord word {eed}
stem: first partOfWordResults
ptr: third partOfWordResults
if ptr [
 step1bDone: true
 parse stem ruleM
 if m > 0 [
  word: join stem {ee}
 ]
]

;Rule: (*v*) ED  ->                 plastered ->  plaster
;                                   bled      ->  bled

if not step1bDone [
 partOfWordResults: partOfWord word {ed}
 stem: first partOfWordResults
 ptr: third partOfWordResults
 if ptr [
 	step1bDone: true
  if hasVowel? stem [
   word: stem
 	 step1b1: true
  ]
 ]
]

;Rule: (*v*) ING ->                  motoring  ->  motor
;                                    sing      ->  sing

if not step1bDone [
 partOfWordResults: partOfWord word {ing}
 stem: first partOfWordResults
 ptr: third partOfWordResults
 if ptr [
  if hasVowel? stem [
   word: stem
 	 step1b1: true
  ]
 ]
]

;If the second or third of the rules in Step 1b is successful, the following
;is done:
;
;    AT -> ATE                       conflat(ed)  ->  conflate
;    BL -> BLE                       troubl(ed)   ->  trouble
;    IZ -> IZE                       siz(ed)      ->  size
;    (*d and not (*L or *S or *Z))
;       -> single letter
;                                    hopp(ing)    ->  hop
;                                    tann(ed)     ->  tan
;                                    fall(ing)    ->  fall
;                                    hiss(ing)    ->  hiss
;                                    fizz(ed)     ->  fizz

if step1b1 [
 forskip listStep1b 2 [
  partOfWordResults: partOfWord word first listStep1b
  stem: first partOfWordResults
  ptr: third partOfWordResults
  if ptr [
   word: join stem second listStep1b
   step1b1: false
   break
  ]
 ] 
 listStep1b: head listStep1b
]

;Rule: (m=1 and *o) -> E  fail(ing)    ->  fail
;                         fil(ing)     ->  file

if step1b1 [
 parse word ruleM
 if m == 1 [
  letter: rightString word 1
	if parse letter C1 [
	 if parse (rightString word 3) cvc [
	  word: join word {e}]
	 ]
 ]
]

;Step 1c
;Rule: (*v*) Y -> I           happy        ->  happi
;                             sky          ->  sky

partOfWordResults: partOfWord word {y}
stem: first partOfWordResults
ptr: third partOfWordResults
if ptr [
 if hasVowel? stem [word: join stem {i}]
]
partOfWordResults: partOfWord word {Y}
stem: first partOfWordResults
ptr: third partOfWordResults
if ptr [
 if hasVowel? stem [word: join stem {i}]
]

;Step 2
;Rule: (m>0) ATIONAL ->  ATE           relational     ->  relate
;Rule: (m>0) TIONAL  ->  TION          conditional    ->  condition
;                                      rational       ->  rational
;Rule: (m>0) ENCI    ->  ENCE          valenci        ->  valence
;Rule: (m>0) ANCI    ->  ANCE          hesitanci      ->  hesitance
;Rule: (m>0) IZER    ->  IZE           digitizer      ->  digitize
;Rule: (m>0) LOGI    ->  LOG - New Rule added 
;Rule: (m>0) ABLI    ->  ABLE          conformabli    ->  conformable
;Rule: (m>0) BLI     ->  BLE - Replaces rule: (m>0) ABLI -> ABLE 
;Rule: (m>0) ALLI    ->  AL            radicalli      ->  radical
;Rule: (m>0) ENTLI   ->  ENT           differentli    ->  different
;Rule: (m>0) ELI     ->  E             vileli        - >  vile
;Rule: (m>0) OUSLI   ->  OUS           analogousli    ->  analogous
;Rule: (m>0) IZATION ->  IZE           vietnamization ->  vietnamize
;Rule: (m>0) ATION   ->  ATE           predication    ->  predicate
;Rule: (m>0) ATOR    ->  ATE           operator       ->  operate
;Rule: (m>0) ALISM   ->  AL            feudalism      ->  feudal
;Rule: (m>0) IVENESS ->  IVE           decisiveness   ->  decisive
;Rule: (m>0) FULNESS ->  FUL           hopefulness    ->  hopeful
;Rule: (m>0) OUSNESS ->  OUS           callousness    ->  callous
;Rule: (m>0) ALITI   ->  AL            formaliti      ->  formal
;Rule: (m>0) IVITI   ->  IVE           sensitiviti    ->  sensitive
;Rule: (m>0) BILITI  ->  BLE           sensibiliti    ->  sensible

forskip listStep2 2 [
 partOfWordResults: partOfWord word first listStep2
 stem: first partOfWordResults
 ptr: third partOfWordResults
 if ptr [
  parse stem ruleM
	if m > 0 [
   word: join stem second listStep2
	]
  break
 ]
] 
listStep2: head listStep2

;Step 3
;Rule: (m>0) ICATE ->  IC              triplicate     ->  triplic
;Rule: (m>0) ATIVE ->                  formative      ->  form
;Rule: (m>0) ALIZE ->  AL              formalize      ->  formal
;Rule: (m>0) ICITI ->  IC              electriciti    ->  electric
;Rule: (m>0) ICAL  ->  IC              electrical     ->  electric
;Rule: (m>0) FUL   ->                  hopeful        ->  hope
;Rule: (m>0) NESS  ->                  goodness       ->  good

forskip listStep3 2 [
 partOfWordResults: partOfWord word first listStep3
 stem: first partOfWordResults
 ptr: third partOfWordResults
 if ptr [
  parse stem ruleM
	if m > 0 [
   word: join stem second listStep3
	]
  break
 ]
] 
listStep3: head listStep3

;Step 4
;Rule: (m>1) AL    ->                  revival        ->  reviv
;Rule: (m>1) ANCE  ->                  allowance      ->  allow
;Rule: (m>1) ENCE  ->                  inference      ->  infer
;Rule: (m>1) ER    ->                  airliner       ->  airlin
;Rule: (m>1) IC    ->                  gyroscopic     ->  gyroscop
;Rule: (m>1) ABLE  ->                  adjustable     ->  adjust
;Rule: (m>1) IBLE  ->                  defensible     ->  defens
;Rule: (m>1) ANT   ->                  irritant       ->  irrit
;Rule: (m>1) EMENT ->                  replacement    ->  replac
;Rule: (m>1) MENT  ->                  adjustment     ->  adjust
;Rule: (m>1) ENT   ->                  dependent      ->  depend
;Rule: (m>1 and (*S or *T)) ION ->     adoption       ->  adopt
;Rule: (m>1) OU    ->                  homologou      ->  homolog
;Rule: (m>1) ISM   ->                  communism      ->  commun
;Rule: (m>1) ATE   ->                  activate       ->  activ
;Rule: (m>1) ITI   ->                  angulariti     ->  angular
;Rule: (m>1) OUS   ->                  homologous     ->  homolog
;Rule: (m>1) IVE   ->                  effective      ->  effect
;Rule: (m>1) IZE   ->                  bowdlerize     ->  bowdler

forskip listStep4 2 [
 partOfWordResults: partOfWord word first listStep4
 stem: first partOfWordResults
 ptr: third partOfWordResults
 if ptr [
  parse stem ruleM
	if m > 1 [
   either (first listStep4) == {ion} [
    if ((rightString stem 1) == {s}) or ((rightString stem 1) == {t}) [
		 word: join stem second listStep4
		]
	 ][
    word: join stem second listStep4
	 ]	
	]
	break
 ]
] 
listStep4: head listStep4

;Step 5a
;Rule: (m>1) E     ->                  probate        ->  probat
;                                      rate           ->  rate
;Rule: (m=1 and not *o) E ->           cease          ->  ceas

partOfWordResults: partOfWord word {e}
stem: first partOfWordResults
ptr: third partOfWordResults
if ptr [
 parse stem ruleM
 either m > 1 [
  word: stem
 ][
  if m == 1 [
   letter: rightString stem 1
	 if ((not parse letter C1) or (not parse (rightString stem 3) cvc)) [
	   word: stem
	 ]  
  ]
 ]
] 

;Step 5b
;Rule: (m > 1 and *d and *L) -> single letter
;                                      controll       ->  control
;                                      roll           ->  roll

partOfWordResults: partOfWord word {l}
stem: first partOfWordResults
ptr: third partOfWordResults
if ptr [
 parse stem ruleM
 if m > 1 [
  if (rightString stem 1) == {l} [word: stem]
 ]
]

return lowercase word
]