Archive version of: simetrics.r ... version: 8 ... fvz 12-Feb-2006

Amendment note: bug fix: jaro-winkler || Publicly available? Yes

REBOL [
	Title: "Similarity Metrics"
	File: %simetrics.r
	Date: 12-Feb-2006 
	Purpose: "Toolkit of string distance metrics."
	Version: 0.2.1
	Library: [
		Level: 'intermediate
		Platform: 'all
		Type: [function module tool]
		Domain: [ai math parse scientific text text-processing]
		tested-under: [
			core 2.5.6.3.1 on [WXP] "fvz"
		]
		Support: none
		License: 'bsd
		See-also: none
	]
	Author: "Francois Vanzeveren (fvz)"
	History: [
		0.2.1 [12-Feb-2006 "BUG FIX: in simetrics/ctx-jaro/get-prefix-length. Thanks to Sunanda from Rebol.org" "fvz"]
		0.2.0 [11-Feb-2006 {
			- NEW METRIC: Jaccard Similarity
			- /case refinement: matching is case sensitive
			- /strict refinement: matching is non-english characters sensitive
		} "fvz"]
		0.1.1 [11-Feb-2006 {
			- The script is now compatible with older versions of rebol (i.e. < 2.6.x)
			- BUG FIX Levenshtein: the substitution cost was not properly computed from the deletion and insertion costs
			- Levenshtein: result normalized
			- NEW METRIC: Jaro and Jaro-Winkler metrics
			- New interface: 'get-similarity is the unique entry point to all metrics
		} "fvz"]
		0.1.0 [10-Feb-2006 "Levenshtein distance implemented" "fvz"]
		0.0.1 [9-Feb-2006 "Created this file" "fvz"]
	]
]

simetrics: context [
	set 'get-similarity function [
		{Measures the similarity of two strings.}
		s [string!] "Source string"
		t [string!] "Target string"
		/jaccard
			{Token-based metric. The Jaccard similarity between the 
			word sets S and T is |S intersect T| / |S union T|.}
		/jaro
			{Measures the similarity of two strings based on the 
			number and the order of common characters between them.}
		/jaro-winkler
			{Variant of the Jaro metric adjusting the weighting for common prefixes.}
		/levenshtein
			{Edit-distance based similarity. The distance is the number of 
			edit operations (deletions, insertions, substitutions) required 
			to transform the source string into the target string; the 
			result is normalized to the 0-1 range.}
		/levenstein {See /levenshtein.}
		/tfidf {TF-IDF based metric. Declared but not yet implemented in this version.}
		
		/del-cost dc [number!] "Deletion cost. ONLY with the /levenshtein refinement."
		/ins-cost ic [number!] "Insertion cost. ONLY with the /levenshtein refinement."
		/sub-cost sc [number!] "Substitution cost. ONLY with the /levenshtein refinement."
		/case "Matching is case-sensitive."
		/punctuation "Matching is punctuation-sensitive. ONLY for token-based distance metrics."
		/strict "Matching is sensitive to non-English (accented) characters; no accent folding is applied."
	] [] [
		trim s: copy any [s ""]
		trim t: copy any [t ""]
		if not case [
			lowercase s
			lowercase t
		]
		if not strict [
			parse/all s [
				any [
					mark: alpha-ext (change mark select/case alpha-map first mark)
					| skip
				]
			]
			parse/all t [
				any [
					mark: alpha-ext (change mark select/case alpha-map first mark)
					| skip
				]
			]
		]
		if jaccard [
			use [s-bag t-bag] [
				s-bag: either punctuation [ctx-token/tokenize/punctuation s] [ctx-token/tokenize s]
				t-bag: either punctuation [ctx-token/tokenize/punctuation t] [ctx-token/tokenize t]
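				; Illustrative example: for token bags [the quick fox] and [the lazy fox]
				; the intersection holds 2 tokens and the union holds 4,
				; so the Jaccard similarity below evaluates to 2 / 4 = 0.5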
				return divide
						length? intersect s-bag t-bag
						length? union s-bag t-bag
			]
		]
		if jaro [
			use [half-len s-common t-common transpositions] [
				; half the length of the shorter string - the window within which matching characters count as common
				half-len: to-integer divide min length? s length? t 2
				; get common characters
				s-common: ctx-jaro/get-common-characters s t half-len
				t-common: ctx-jaro/get-common-characters t s half-len
				
				; Check for empty and/or different size common strings
				if any [
					not-equal? length? s-common length? t-common
					empty? s-common
					empty? t-common
				] [return 0]
				
				; Get the number of transpositions
				; A transposition for s-common, t-common is a position i 
				; such that s-common[i] <> t-common[i]
				transpositions: 0
				for i 1 length? s-common 1 [
					if not-equal? s-common/:i t-common/:i [transpositions: add transpositions 1]
				]
				transpositions: divide transpositions 2
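				; The Jaro score combines the proportions of common characters
				; and transpositions:
				;     ( |s'|/|s| + |t'|/|t| + (|s'| - transpositions) / |s'| ) / 3
				; where s' and t' are the common-character strings built above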
				return divide 
							add add 
									divide length? s-common length? s
									divide length? t-common length? t
								divide 
									subtract length? s-common transpositions
									length? s-common
							3
			]
		]
		if jaro-winkler [
			use [dist prefix-length] [
				dist: get-similarity/jaro s t
				; This extension modifies the weights of poorly matching pairs s, t which share a common prefix
				prefix-length: ctx-jaro/get-prefix-length s t
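				; Winkler adjustment: jw = jaro + prefix-length * 0.1 * (1 - jaro),
				; which boosts pairs such as "martha" / "marhta" that share a prefix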
				return add  dist
							multiply multiply prefix-length ctx-jaro/PREFIXADUSTMENTSCALE 
									 subtract 1 dist
			]
		]
		if any [levenshtein levenstein] [
			use [dist max-len] [
				; 0.1.1
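				; Default costs when only some are given: substitution falls back to
				; twice the larger of the deletion and insertion costs (or 2),
				; deletion falls back to the insertion cost (or half the substitution
				; cost), and insertion falls back to the deletion cost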
				either any [ins-cost del-cost sub-cost] [
					sc: any [sc sm-multiply sm-max dc ic 2 2]
					dc: any [dc ic sm-divide sc 2]
					ic: any [ic dc]
				] [
					sc: dc: ic: 1
				]
				dist: ctx-levenshtein/get-distance
							back tail s 
							back tail t 
							array reduce [length? t length? s]
							dc ic sc
				; get the maximum possible Levenshtein distance for these strings
				max-len: max length? s length? t
				if zero? max-len [return 1] ; both strings are empty, hence identical
				; normalize: 1 - (actual / maximum) maps the distance to a 0-1 similarity
				subtract 1 divide dist max-len
			]
		]
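		; With unit costs, "kitten" -> "sitting" needs 3 edits and the longer string
		; has 7 characters, so the normalized score above is 1 - 3/7 (about 0.571)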
	]
	sm-divide: func [
		"Returns the first value divided by the second."
		value1 [number! pair! char! money! time! tuple! none!]
		value2 [number! pair! char! money! time! tuple! none!]
	] [
		if not all [value1 value2] [return none]
		divide value1 value2 
	]
	sm-multiply: func [
		"Returns the first value multiplied by the second."
		value1 [number! pair! char! money! time! tuple! none!]
		value2 [number! pair! char! money! time! tuple! none!]
	] [
		if not all [value1 value2] [return none]
		multiply value1 value2 
	]
	sm-max: func [
		"Returns the greater of the two values."
		value1 [number! pair! char! money! date! time! tuple! series! none!]
		value2 [number! pair! char! money! date! time! tuple! series! none!]
	] [
		if all [none? value1 none? value2] [return none]
		value1: any [value1 value2]
		value2: any [value2 value1]
		max value1 value2
	]
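	; Latin-1 accented characters mapped to their unaccented ASCII equivalents;
	; applied to both strings unless the /strict refinement is used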
	alpha-map: make block! reduce [	
		make char! 192 #"A"
		make char! 193 #"A"
		make char! 194 #"A"
		make char! 195 #"A"
		make char! 196 #"A"
		make char! 197 #"A"
		make char! 199 #"C"
		make char! 200 #"E"
		make char! 201 #"E"
		make char! 202 #"E"
		make char! 203 #"E"
		make char! 204 #"I"
		make char! 205 #"I"
		make char! 206 #"I"
		make char! 207 #"I"
		make char! 209 #"N"
		make char! 210 #"O"
		make char! 211 #"O"
		make char! 212 #"O"
		make char! 213 #"O"
		make char! 214 #"O"
		make char! 216 #"O"
		make char! 217 #"U"
		make char! 218 #"U"
		make char! 219 #"U"
		make char! 220 #"U"
		make char! 221 #"Y"
		make char! 224 #"a"
		make char! 225 #"a"
		make char! 226 #"a"
		make char! 227 #"a"
		make char! 228 #"a"
		make char! 229 #"a"
		make char! 231 #"c"
		make char! 232 #"e"
		make char! 233 #"e"
		make char! 234 #"e"
		make char! 235 #"e"
		make char! 236 #"i"
		make char! 237 #"i"
		make char! 238 #"i"
		make char! 239 #"i"
		make char! 241 #"n"
		make char! 242 #"o"
		make char! 243 #"o"
		make char! 244 #"o"
		make char! 245 #"o"
		make char! 246 #"o"
		make char! 248 #"o"
		make char! 249 #"u"
		make char! 250 #"u"
		make char! 251 #"u"
		make char! 252 #"u"
		make char! 253 #"y"
		make char! 255 #"y"
	]
	alpha-ext: make block! []
	forskip alpha-map 2 [
		append alpha-ext first alpha-map
	]
	alpha-map: head alpha-map ; for compatibility with Rebol/Core < 2.6.x
	
	; Charsets
	digit: charset [#"0" - #"9"]
	alpha-ext: charset alpha-ext
	alpha: union charset [#"A" - #"Z" #"a" - #"z"] alpha-ext
	alphanum: union alpha digit
	space: charset reduce [#" " newline crlf tab]
	non-space: complement space
	
	ctx-jaro: context [
		;maximum prefix length to use.
		MINPREFIXTESTLENGTH: 6
		
		;prefix adjustment scale.
		PREFIXADUSTMENTSCALE: 0.1
		
		get-common-characters: func [
			{Returns the characters of string1 that are also found in string2 
			within a given distance of their position in string1.}
			string1 [string!]
			string2 [string!]
			distance-sep [integer!]
			/local return-commons pos str
		] [
			; create a return string of characters
			return-commons: copy ""
			; create a copy of string2 for processing
			string2: copy string2
			; iterate over string1
			forall string1 [
				if found? str: find/part 
							at string2 add 1 pos: subtract index? string1 distance-sep
							first string1
							subtract 
								add multiply distance-sep 2 min pos 0
								1 
				[
					; append character found
					append return-commons first string1
					; alter copied string2 for processing
					change/part str to-char 0 1
				]
				string2: head string2
			]
			return-commons
		]
		get-prefix-length: func [
			"Returns the prefix length found of common characters at the begining of the strings."
			string1 [string!]
			string2 [string!]
			/local n
		] [
			n: first minimum-of reduce [MINPREFIXTESTLENGTH length? string1 length? string2]
			for i 1 n 1 [
				; check the prefix is the same so far
				if not-equal? string1/:i string2/:i [
					; not the same so return as far as got
					return subtract i 1
				]
			]
			; 0.2.1
			return n ; first n characters are the same
		]
	]
	ctx-levenshtein: context [
		get-distance: function [
			s [string!] "Source string"
			t [string!] "Target string"
			m [block!]
			dc [number!] "Deletion cost"
			ic [number!] "Insertion cost"
			sc [number!] "Substitution cost"
		] [
			letter-copy letter-substitute 
			letter-insert letter-delete
			i j 
		] [
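			; Recursive edit distance with memoization: m/:j/:i caches the
			; distance between the prefixes t[1..j] and s[1..i] so each cell
			; is computed only once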
			if empty? head s [return length? head t]
			if empty? head t [return length? head s]
; 0.1.1
;			if m/(index? t)/(index? s) [return m/(index? t)/(index? s)]
			j: index? t
			i: index? s
			if m/:j/:i [return m/:j/:i]	
			letter-copy: letter-substitute: letter-insert: letter-delete: 1E+99
			; Copy t[j] to s[i]
			if equal? first s first t [
				letter-copy: do select reduce [
					all [head? s head? t] [0]
					true [get-distance back s back t m dc ic sc]
				] true
			]
			; Substitute t[j] for s[i]
			letter-substitute: add sc do select reduce [
					all [head? s head? t] [0]
					head? s [subtract index? t 1]
					head? t [subtract index? s 1]
					true [get-distance back s back t m dc ic sc]
				] true
			; Insert the letter t[j]
			letter-insert: add ic do select reduce [
				head? t [index? s]
				true [get-distance s back t m dc ic sc]
			] true
			; Delete the letter s[i]
			letter-delete: add dc do select reduce [
				head? s [index? t]
				true [get-distance back s t m dc ic sc]
			] true
; 0.1.1
;			m/(index? t)/(index? s): first minimum-of reduce [letter-copy letter-substitute letter-insert letter-delete]
			poke m/:j i first minimum-of reduce [letter-copy letter-substitute letter-insert letter-delete]
			m/:j/:i
		]
	]
	
	ctx-token: context [
		tokenize: func [
			str [string!]
			/punctuation
			/local tokens t-alpha t-digit char mark
		] [
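			; Splits str into alphabetic runs and digit runs: whitespace separates
			; tokens, and any other character closes the current token (and is itself
			; kept as a token when /punctuation is used),
			; e.g. "abc12 de." -> ["abc" "12" "de"] or, with /punctuation, ["abc" "12" "de" "."]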
			tokens: make block! []
			t-alpha: copy ""
			t-digit: copy ""
			parse/all str [
				any [
					copy char space (
						if not empty? t-alpha [
							append tokens t-alpha
							t-alpha: copy ""
						]
						if not empty? t-digit [
							append tokens t-digit
							t-digit: copy ""
						]
					) | 
					copy char alpha (
						if not empty? t-digit [
							append tokens t-digit
							t-digit: copy ""
						]
						append t-alpha char
					) | 
					copy char digit (
						if not empty? t-alpha [
							append tokens t-alpha
							t-alpha: copy ""
						]
						append t-digit char
					) | 
					mark: (
						if not empty? t-alpha [
							append tokens t-alpha
							t-alpha: copy ""
						]
						if not empty? t-digit [
							append tokens t-digit
							t-digit: copy ""
						]
						char: copy/part mark 1
						if all [punctuation not empty? char] [append tokens char]
					) skip
				]
			]
			return tokens
		]
	]
]
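
; Usage sketch (illustrative; the example calls and the scores quoted in the
; comments are expectations based on the code above, not captured output).
; After loading the script, 'get-similarity is set in the global context:
;
;     do %simetrics.r
;
;     ; Levenshtein with default unit costs: "kitten" -> "sitting" takes 3 edits
;     ; over a maximum length of 7, so the result is 1 - 3/7 (about 0.571)
;     get-similarity/levenshtein "kitten" "sitting"
;
;     ; Jaro-Winkler rewards the shared "mar" prefix over plain Jaro
;     get-similarity/jaro-winkler "martha" "marhta"
;
;     ; Token-based Jaccard: 3 shared tokens out of 5 distinct ones gives 0.6
;     get-similarity/jaccard "the quick brown fox" "the quick brown dog"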