New org capture template
[emacs.git] / .emacs.d / elisp / icicle / fuzzy-match.el
1 ;;; fuzzy-match.el --- fuzzy matching
2 ;;
3 ;; Filename: fuzzy-match.el
4 ;; Description: fuzzy matching
5 ;; Author: Simon Marshall <s i m o n AT g n u . o r g>
6 ;; Maintainer: Drew Adams <d r e w . a d a m s AT o r a c l e . c o m>
7 ;; Copyright (C) 2007-2012, Drew Adams, all rights reserved.
8 ;; Copyright (C) 1993, 1994 Simon Marshall, all rights reserved.
9 ;; Created: 1993, by Simon Marshall
10 ;; Version: 1.04
11 ;; Last-Updated: Sun Jan 1 14:05:19 2012 (-0800)
12 ;; By: dradams
13 ;; Update #: 176
14 ;; URL: http://www.emacswiki.org/cgi-bin/wiki/fuzzy-match.el
15 ;; Keywords: matching completion string
16 ;; Compatibility: GNU Emacs: 20.x, 21.x, 22.x, 23.x
17 ;;
18 ;; Features that might be required by this library:
19 ;;
20 ;; None
21 ;;
22 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
23 ;;
24 ;;; Commentary:
25 ;;
26 ;; Purpose:
27 ;;
28 ;; Fuzzy-match is a package of functions to provide non-exact comparison
29 ;; between strings. Since I am no expert on such things, and certain criteria
30 ;; for non-exact comparison had to be dropped for reasons of efficiency (e.g.,
31 ;; transposition), and the non-exact nature of non-exact comparison, this
32 ;; package may or may not provide what you want.
33 ;;
34 ;; Caveat:
35 ;;
36 ;; This is fuzzy software. Use it at your own risk.
37 ;;
38 ;; The fuzzy-matcher deals with comparing strings. For a programmer wishing to
39 ;; use the fuzzy-match library, the front-end functions are likely to be
40 ;; `FM-matchiness' (and corresponding `FM-closeness'), `FM-all-fuzzy-matches'
41 ;; (and corresponding `FM-all-close-matches'), and `FM-fuzzy-sort'. These can
42 ;; be thought to mirror `string-match', `all-completions' and `sort'.
43 ;;
44 ;; The function `FM-matchiness' returns an integer which is the number of
45 ;; matching characters from STRING1 in STRING2. What denotes "the number of
46 ;; matching characters" is arbitrary.
47 ;;
48 ;; The fuzziness between two strings, STRING1 and STRING2, is calculated by
49 ;; finding the position in STRING2 of a prefix of STRING1. The first character
50 ;; of STRING1 is found in STRING2. If we find it, we continue matching
51 ;; successive characters from STRING1 at successive STRING2 positions. When we
52 ;; have found the longest prefix of STRING1 in STRING2, we decide whether it is
53 ;; a match. It is considered a match if the length of the prefix is greater or
54 ;; equal to the offset of the beginning of the prefix of STRING1 in STRING2.
55 ;; This means that "food" will match "barfoo" because "foo" (the prefix)
56 ;; matches "foo" in "barfoo" with an offset and length of 3. However, "food"
57 ;; will not be considered to match "barfu", since the length is 1 while the
58 ;; offset is 3. The fuzz value of the match is the length of the prefix. If
59 ;; we find a match, we take the prefix off STRING1 and the string upto the end
60 ;; of the match in STRING2. If we do not find a match, we take off the first
61 ;; character in STRING1. Then we try and find the next prefix.
62 ;;
63 ;; So, to walk through an example:
64 ;;
65 ;; (FM-matchiness "pigface" "pigsfly"):
66 ;;
67 ;; STRING1 STRING2 MATCH LENGTH OFFSET FUZZ
68 ;; pigface pigsfly 3 0 3
69 ;; face sfly 1 1 1
70 ;; ace ly 0 0 0
71 ;; ce ly 0 0 0
72 ;; c ly 0 0 0
73 ;;
74 ;; => 4
75 ;;
76 ;; (FM-matchiness "begining-of-l" "beginning-of-l"):
77 ;;
78 ;; STRING1 STRING2 MATCH LENGTH OFFSET FUZZ
79 ;; begining-of-l beginning-of-l 5 0 5
80 ;; ing-of-l ning-of-l 8 1 8
81 ;;
82 ;; => 13
83 ;;
84 ;; Another function of interest is `FM-all-fuzzy-matches'. This returns a list
85 ;; of those strings that have the highest fuzzy match with a given string.
86 ;; Those strings are sorted so that there is a preference for strings with the
87 ;; same number of characters, and sharing the longest prefix with the given
88 ;; string:
89 ;;
90 ;; (FM-all-fuzzy-matches "abc" '("xxabcxx" "xabcxxx" "xabx"))
91 ;; => ("xabcxxx" "xxabcxx")
92 ;;
93 ;; (FM-all-fuzzy-matches "point-mx" (all-completions "point" obarray))
94 ;; => ("point-max" "point-max-marker")
95 ;;
96 ;; (FM-all-fuzzy-matches "begining-of-l" (all-completions "begin" obarray))
97 ;; => ("beginning-of-line")
98 ;;
99 ;; Corresponding to `FM-matchiness' and `FM-all-fuzzy-matches' are
100 ;; `FM-closeness' and `FM-all-close-matches'. They differ from the former
101 ;; functions in that they take into account the difference in length between
102 ;; the target string and candidate string:
103 ;;
104 ;; (FM-closeness "begining-of-l" "beginning-of-l")
105 ;; => 12
106 ;;
107 ;; Note from above that the matchiness is 13 and the difference in length of
108 ;; the two strings is 1.
109 ;;
110 ;; (FM-all-close-matches "point-mx" (all-completions "point" obarray))
111 ;; => ("point-max")
112 ;;
113 ;; Note from above that although the matchiness is equal between the target
114 ;; "point-mx" and the candidates "point-max" and "point-max-marker", the former
115 ;; candidate has less of a difference in length from the target.
116 ;;
117 ;; Other functions that may be of use to package writers using this package are
118 ;; `FM-map-fuzzy-matches' (and corresponding `FM-map-close-matches') and
119 ;; `FM-max-matchiness' (and corresponding `FM-max-closeness'). The mapping
120 ;; functions map matchiness or closeness to a list, while the max functions
121 ;; return the maximum matchiness or closeness from a list.
122 ;;
123 ;; Also provided are some interface functions for user packages. These
124 ;; functions are `FM-offer-corrections' and `FM-list-candidates'. To
125 ;; demonstrate the usefulness of this package, `lisp-spell-symbol' (analogous
126 ;; to `lisp-complete-symbol') is provided. Without an arg, the command uses
127 ;; `FM-all-close-matches' to find spelling corrections:
128 ;;
129 ;; (goto-char (point-mx M-x lisp-spell-symbol RET
130 ;; -| Replaced point-mx with point-max
131 ;; (goto-char (point-max
132 ;;
133 ;; With a double prefix arg, the command uses `FM-all-fuzzy-matches' to find
134 ;; spelling corrections:
135 ;;
136 ;; (goto-char (point-mx C-u C-u M-x lisp-spell-symbol RET
137 ;; -| Possible candidates are:
138 ;; -| point-max point-max-marker
139 ;;
140 ;; Any number of prefix args means that the user is prompted when replacing
141 ;; with the single correction candidate.
142 ;;
143 ;; Installation:
144 ;;
145 ;; Put this file where your Emacs can find it and byte compile it.
146 ;;
147 ;; To use, put in your package that uses these functions:
148 ;;
149 ;; (require 'fuzzy-match)
150 ;;
151 ;; To use the interactive package, put the following in your ~/.emacs file:
152 ;;
153 ;; (autoload 'lisp-spell-symbol "fuzzy-match"
154 ;; "Perform spell checking on Lisp symbol preceding point." t)
155 ;; (define-key esc-map "#" 'lisp-spell-symbol)
156 ;;
157 ;; This will define the key M-# (ESC #) to call `lisp-spell-symbol'.
158 ;; For Emacs-19 users you can also add an entry to the "ispell" menu-bar:
159 ;;
160 ;; (define-key-after ispell-menu-map [ispell-symbol]
161 ;; '("Check Symbol" . lisp-spell-symbol) 'ispell-word))
162 ;;
163 ;;
164 ;; If you like `fuzzy-match.el', you might also be interested in
165 ;; Icicles, which lets you use the same fuzzy matching for minibuffer
166 ;; input completion: http://www.emacswiki.org/cgi-bin/wiki/Icicles.
167
168 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
169 ;;
170 ;;; Change Log:
171 ;;
172 ;; 2011/01/04 dadams
173 ;; Added autoload cookies for commands.
174 ;; 2007/10/01 dadams
175 ;; FM-lessiness:
176 ;; Return t if no occurrence of a STRING prefix in STRING1 or STRING2.
177 ;; FM-all-fuzzy-matches: Return nil if best fuzzy match has matchiness 0.
178 ;; FM-offer-corrections:
179 ;; Added a complete interactive spec using FM-symbol-name-before-point.
180 ;; lisp-spell-symbol: Use FM-symbol-name-before-point.
181 ;; Added: FM-symbol-name-before-point. If no symbol, return "", not nil.
182 ;; Updated file header.
183 ;; - 1.00--1.01: smarshall
184 ;; Inlined FM-strstr-intern into FM-matchiness-intern for speed.
185 ;; Added FM*close* to mirror FM*fuzzy* functions.
186 ;; Added FM-offer-corrections, FM-list-candidates, lisp-spell-symbol.
187 ;; - 1.01--1.02: smarshall
188 ;; Made FM-offer-corrections deal with identical single correction.
189 ;; Made lisp-spell-symbol use FM-all-fuzzy-matches if user wants.
190 ;; Updated ispell-menu-map comments for Emacs-19.25.
191 ;; Removed mouse-choose-completion hack from FM-list-candidates.
192 ;; - 1.02--1.03: smarshall
193 ;; Corrected Copyleft.
194 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
195 ;;
196 ;; This program is free software; you can redistribute it and/or
197 ;; modify it under the terms of the GNU General Public License as
198 ;; published by the Free Software Foundation; either version 2, or
199 ;; (at your option) any later version.
200 ;;
201 ;; This program is distributed in the hope that it will be useful,
202 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
203 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
204 ;; General Public License for more details.
205 ;;
206 ;; You should have received a copy of the GNU General Public License
207 ;; along with this program; see the file COPYING. If not, write to
208 ;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth
209 ;; Floor, Boston, MA 02110-1301, USA.
210 ;;
211 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
212 ;;
213 ;;; Code:
214
215 ;;; Bizarre, but FM-strstr-intern and FM-matchiness-intern are quickest when
216 ;;; dealing with lists. If coded to deal with strings using aref and
217 ;;; string-match, it takes longer. (Though this might not be true if we had a
218 ;;; non-regexp version of string-match---of course it would be even better if
219 ;;; we could interface to ispell.) I'd be happy to be proved wrong.
220
221 (defsubst FM-string-to-char-list (string)
222 "Return the character list of STRING.
223 If STRING is already a list, this function just returns STRING."
224 (if (listp string) string (mapcar 'identity string)))
225
226 (defsubst FM-strings-to-char-lists (strings)
227 "Return the character lists of STRINGS.
228 See `FM-string-to-char-list'."
229 (mapcar 'FM-string-to-char-list strings))
230
231 (defsubst FM-char-list-to-string (charlist)
232 "Return the string of CHARLIST.
233 If CHARLIST is not a list, this function just returns CHARLIST."
234 (if (listp charlist) (mapconcat 'char-to-string charlist "") charlist))
235
236 (defsubst FM-char-lists-to-strings (charlists)
237 "Return the strings of CHARLISTS.
238 See `FM-char-list-to-string'."
239 (mapcar 'FM-char-list-to-string charlists))
240
241 (defsubst FM-strstr-intern (string1 string2)
242 "Find first occurrence of a prefix of STRING1 in STRING2.
243 Returns a cons pair of the length of the substring and the offset into STRING2,
244 or nil if no match is found.
245 STRING1 and STRING2 are character lists."
246 (let ((char1 (car string1))
247 (offset 0) len)
248 (while (and string2 (/= char1 (car string2)))
249 (setq offset (1+ offset) string2 (cdr string2)))
250 (if (null string2)
251 nil
252 (setq string1 (cdr string1) string2 (cdr string2) len 1)
253 (while (and string1 string2 (= (car string1) (car string2)))
254 (setq len (1+ len) string1 (cdr string1) string2 (cdr string2)))
255 (cons len offset))))
256
257 (defsubst FM-matchiness-intern (string1 string2)
258 "Return the fuzziness between STRING1 and STRING2.
259 STRING1 and STRING2 are character lists."
260 (let ((fuzz 0) len offset s1 s2 c1)
261 (while (and string1 string2)
262 ;; This is (FM-strstr-intern string1 string2) incoded for speed.
263 (setq c1 (car string1) s2 string2 offset 0)
264 (while (and s2 (/= c1 (car s2))) ; Where is c1 in s2?
265 (setq offset (1+ offset) s2 (cdr s2)))
266 (if (null s2)
267 (setq string1 (cdr string1))
268 (setq s1 (cdr string1) len 1) ; How long is it in s2?
269 (while (and s1 (setq s2 (cdr s2)) (= (car s1) (car s2)))
270 (setq len (1+ len) s1 (cdr s1)))
271 (if (< len offset) ; Is it regarded as a match?
272 (setq string1 (cdr string1))
273 (setq fuzz (+ fuzz len) string1 s1 string2 s2))))
274 fuzz))
275
276 (defun FM-lessiness (string string1 string2)
277 "Return non-nil if STRING1 is \"less\" than STRING2, based on STRING.
278 Comparison is based on the simularity:
279 - Between STRING and STRING1 and STRING2 (`FM-matchiness-intern').
280 - Between STRING and prefix length in STRING1 and STRING2 (`FM-strstr-intern').
281 - Between the length of STRING and STRING1 and STRING2.
282 - The offset of the first occurrence of a prefix in STRING1 and STRING2.
283 STRING, STRING1 and STRING2 can be character lists."
284 (let* ((string (FM-string-to-char-list string))
285 (string1 (FM-string-to-char-list string1))
286 (string2 (FM-string-to-char-list string2))
287 (fuzz1 (FM-matchiness-intern string string1))
288 (fuzz2 (FM-matchiness-intern string string2)))
289 (if (/= fuzz1 fuzz2)
290 (> fuzz1 fuzz2)
291 (let ((strstr1 (FM-strstr-intern string string1))
292 (strstr2 (FM-strstr-intern string string2)))
293 (cond ((or (null strstr1) (null strstr2)))
294 ((/= (cdr strstr1) (cdr strstr2))
295 (< (cdr strstr1) (cdr strstr2)))
296 ((/= (length string1) (length string2))
297 (< (length string1) (length string2)))
298 (t (> (car strstr1) (car strstr2))))))))
299
300 ;;; Useful functions...
301
302 (defun FM-matchiness (string1 string2)
303 "Return the fuzziness between STRING1 and STRING2.
304 This provides a gauge of the number of characters of STRING1 in STRING2.
305 STRING1 and STRING2 can be character lists."
306 (FM-matchiness-intern (FM-string-to-char-list string1)
307 (FM-string-to-char-list string2)))
308
309 (defun FM-closeness (string1 string2)
310 "Return the closeness between STRING1 and STRING2.
311 This provides a gauge of the similarity of STRING1 and STRING2.
312 STRING1 and STRING2 can be character lists."
313 (- (FM-matchiness-intern (FM-string-to-char-list string1)
314 (FM-string-to-char-list string2))
315 (abs (- (length string1) (length string2)))))
316
317 (defun FM-all-fuzzy-matches (string strings)
318 "Return most fuzzy matches to STRING in STRINGS.
319 Each element of STRINGS is tested to see if it fuzzily matches STRING.
320 The value is a list of all the strings from STRINGS that most fuzzily match.
321 The strings are fuzzily matched using `FM-matchiness'.
322 The list of fuzzy matches is sorted using `FM-fuzzy-sort'.
323 STRING and elements of STRINGS can be character lists."
324 (let* ((string (FM-string-to-char-list string))
325 (strings (FM-strings-to-char-lists strings))
326 (bestfuzz (FM-matchiness-intern string (car strings)))
327 (matches (list (car strings)))
328 (strings (cdr strings))
329 thisfuzz)
330 (while strings
331 (setq thisfuzz (FM-matchiness-intern string (car strings)))
332 (cond ((= bestfuzz thisfuzz)
333 (setq matches (cons (car strings) matches)))
334 ((< bestfuzz thisfuzz)
335 (setq bestfuzz thisfuzz
336 matches (list (car strings)))))
337 (setq strings (cdr strings)))
338 (and (not (zerop bestfuzz)) (FM-fuzzy-sort string matches))))
339
340 (defun FM-all-close-matches (string strings)
341 "Return most close matches to STRING in STRINGS.
342 Each element of STRINGS is tested to see if it closely matches STRING.
343 The value is a list of all the strings from STRINGS that most closely match.
344 The strings are fuzzily matched using `FM-closeness'.
345 The list of close matches is sorted using `FM-fuzzy-sort'.
346 STRING and elements of STRINGS can be character lists."
347 (let* ((bestfuzz (FM-closeness string (car strings)))
348 (matches (list (car strings)))
349 (strings (cdr strings))
350 thisfuzz)
351 (while strings
352 (setq thisfuzz (FM-closeness string (car strings)))
353 (cond ((= bestfuzz thisfuzz)
354 (setq matches (cons (car strings) matches)))
355 ((< bestfuzz thisfuzz)
356 (setq bestfuzz thisfuzz
357 matches (list (car strings)))))
358 (setq strings (cdr strings)))
359 (FM-fuzzy-sort string matches)))
360
361 (defun FM-map-fuzzy-matches (string strings)
362 "Return list of fuzzy matches to STRING in STRINGS.
363 Each element of the returned list is a cons pair of the form (string . fuzz)
364 where fuzz is the fuzzy match of string to STRING. See `FM-matchiness'.
365 STRING and elements of STRINGS can be character lists."
366 (let ((string (FM-string-to-char-list string)))
367 (mapcar (function (lambda (str) (cons str (FM-matchiness string str))))
368 strings)))
369
370 (defun FM-map-close-matches (string strings)
371 "Return list of close matches to STRING in STRINGS.
372 Each element of the returned list is a cons pair of the form (string . close)
373 where close is the close match of string to STRING. See `FM-closeness'.
374 STRING and elements of STRINGS can be character lists."
375 (let ((string (FM-string-to-char-list string)))
376 (mapcar (function (lambda (str) (cons str (FM-closeness string str))))
377 strings)))
378
379 (defun FM-max-matchiness (string strings)
380 "Return the maximum fuzzy matchiness of STRING in STRINGS.
381 STRING and elements of STRINGS can be character lists."
382 (let ((string (FM-string-to-char-list string)))
383 (apply 'max (mapcar (function (lambda (str) (FM-matchiness string str)))
384 strings))))
385
386 (defun FM-max-closeness (string strings)
387 "Return the maximum closeness of STRING in STRINGS.
388 STRING and elements of STRINGS can be character lists."
389 (let ((string (FM-string-to-char-list string)))
390 (apply 'max (mapcar (function (lambda (str) (FM-closeness string str)))
391 strings))))
392
393 (defun FM-fuzzy-sort (string strings)
394 "Return STRINGS fuzzily sorted based on STRING.
395 Sorting is done using `FM-lessiness' as predicate.
396 STRING and elements of STRINGS can be character lists."
397 (let ((string (FM-string-to-char-list string))
398 (strings (FM-strings-to-char-lists strings)))
399 (FM-char-lists-to-strings
400 (sort strings (function (lambda (string1 string2)
401 (FM-lessiness string string1 string2)))))))
402
403 ;;;###autoload
404 (defun FM-offer-corrections (item candidates &optional prompt-p)
405 "Offer corrections for ITEM from CANDIDATES. Maybe replace ITEM.
406 If PROMPT-P is non-nil and there is only one candidate, ask the user before
407 replacing ITEM. Replacement is performed by `replace-match'.
408 If more than one correction exists, call `FM-list-candidates' to display them.
409 Returns: nil if no correction was inserted.
410 `sole' if corrected with the only correction match.
411 `correct' if the only correction match is identical to ITEM.
412 `listed' if a completion listing was shown."
413 (interactive
414 (let* ((symb (FM-symbol-name-before-point))
415 (cands (and (not (string= "" symb))
416 (FM-all-fuzzy-matches
417 symb (all-completions (substring symb 0 1) obarray)))))
418 (list symb cands current-prefix-arg)))
419 (cond ((null candidates)
420 (if (string= "" item)
421 (message "No symbol before point to complete")
422 (message "No candidates for `%s'" item))
423 nil)
424 ((> (length candidates) 1) ; There's no unique correction.
425 (FM-list-candidates candidates)
426 'listed)
427 (t
428 (let ((candidate (car candidates)))
429 (cond ((string= item candidate)
430 (message "Replacement is the same as `%s'" item)
431 'correct)
432 ((or (null prompt-p)
433 (y-or-n-p (format "Replace `%s' with `%s' " item candidate)))
434 (replace-match candidate t t)
435 (message "Replaced %s with %s" item candidate)
436 'sole)
437 (t
438 nil))))))
439
440 (defun FM-symbol-name-before-point ()
441 "Return the symbol name before point or an empty string if no symbol."
442 ;; Do it this way to avoid reading a symbol name,
443 ;; which would create the symbol in obarray.
444 (save-excursion
445 (let* ((sym-chars "a-zA-Z0-9:_=<>/+-")
446 (sym (concat "[" sym-chars "]"))
447 (non-sym (concat "[^" sym-chars "]")) (limit (point)))
448 (when (re-search-backward non-sym nil 'move) (forward-char 1))
449 (if (or (eolp) (looking-at non-sym))
450 ""
451 (re-search-forward (concat sym "+") limit)
452 (buffer-substring-no-properties (match-beginning 0) (match-end 0))))))
453
454 (defun FM-list-candidates (candidates)
455 "List in help buffer CANDIDATES."
456 (let ((conf (current-window-configuration)) (buf " *Candidates*"))
457 (with-output-to-temp-buffer buf
458 (display-completion-list candidates)
459 (set-buffer buf)
460 (forward-line 3)
461 (while (search-backward "completion" nil 'move)
462 (replace-match "candidate")))))
463
464 ;;; Example code (see comment header):
465
466 ;;;###autoload
467 (defun lisp-spell-symbol (prompt)
468 "Perform spell checking on Lisp symbol preceding point.
469 With prefix arg(s) and only one candidate, ask the user before replacing.
470 With double prefix arg (\\[universal-argument] \\[universal-argument]), use \
471 `FM-all-fuzzy-matches' rather than
472 `FM-all-close-matches' to find Lisp symbol candidates. This is useful if the
473 Lisp symbol stub is only partially complete.
474
475 To minimize matching effort and results, the first character of the
476 symbol is assumed to be correct. See also `FM-offer-corrections'."
477 (interactive "p")
478 (let ((symbol (FM-symbol-name-before-point)))
479 (if (string= "" symbol)
480 (message "Not after a symbol")
481 (let ((symbols (all-completions (substring symbol 0 1) obarray))
482 (fuzzy-matcher (if (= prompt 16)
483 'FM-all-fuzzy-matches
484 'FM-all-close-matches)))
485 (message "Checking symbol `%s'" symbol)
486 (FM-offer-corrections symbol
487 (funcall fuzzy-matcher symbol symbols)
488 (/= prompt 1))))))
489
490
491 (provide 'fuzzy-match)
492
493 ;;; fuzzy-match.el ends here