1/*
2** 2012 April 10
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12**
13** This module implements the spellfix1 VIRTUAL TABLE that can be used
14** to search a large vocabulary for close matches. See separate
15** documentation (http://www.sqlite.org/spellfix1.html) for details.
16*/
17#include "sqlite3ext.h"
18SQLITE_EXTENSION_INIT1
19
20#ifndef SQLITE_AMALGAMATION
21# if !defined(NDEBUG) && !defined(SQLITE_DEBUG)
22# define NDEBUG 1
23# endif
24# if defined(NDEBUG) && defined(SQLITE_DEBUG)
25# undef NDEBUG
26# endif
27# include <string.h>
28# include <stdio.h>
29# include <stdlib.h>
30# include <assert.h>
31# define ALWAYS(X) 1
32# define NEVER(X) 0
33typedef unsigned char u8;
34typedef unsigned short u16;
35#endif
36#include <ctype.h>
37
38#ifndef SQLITE_OMIT_VIRTUALTABLE
39
/*
** Character classes for ASCII characters.  Each row gives the class
** number, the symbol that stands for the class in a phonetic hash, and
** the characters that belong to the class:
**
**   0   ''   Silent letters: H (and the apostrophe when it is not the
**            first character).  Silent characters emit nothing into the
**            hash; className[] holds '.' for this class as a placeholder.
**   1   'A'  Any vowel: A E I O U (Y)
**   2   'B'  A bilabial stop or fricative: B F P V W
**   3   'C'  Other fricatives or back stops: C G J K Q S X Z
**   4   'D'  Alveolar stops: D T
**   5   'H'  Letter H at the beginning of a word
**   6   'L'  Glide: L
**   7   'R'  Semivowel: R
**   8   'M'  Nasals: M N
**   9   'Y'  Letter Y at the beginning of a word.
**  10   '9'  Digits: 0 1 2 3 4 5 6 7 8 9
**  11   ' '  White space
**  12   '?'  Other.
**
** NOTE(review): the midClass[] and initClass[] tables below never assign
** CCLASS_H; both map H/h to CCLASS_SILENT.  W/w is CCLASS_B in both tables.
*/
#define CCLASS_SILENT 0
#define CCLASS_VOWEL 1
#define CCLASS_B 2
#define CCLASS_C 3
#define CCLASS_D 4
#define CCLASS_H 5
#define CCLASS_L 6
#define CCLASS_R 7
#define CCLASS_M 8
#define CCLASS_Y 9
#define CCLASS_DIGIT 10
#define CCLASS_SPACE 11
#define CCLASS_OTHER 12
70
/*
** The following table gives the character class for non-initial ASCII
** characters.
**
** The table has 128 entries and is indexed by the 7-bit character code;
** users of the table mask the character with 0x7f before the lookup.
*/
static const unsigned char midClass[] = {
  /* NUL */ CCLASS_OTHER,  /* SOH */ CCLASS_OTHER,  /* STX */ CCLASS_OTHER,
  /* ETX */ CCLASS_OTHER,  /* EOT */ CCLASS_OTHER,  /* ENQ */ CCLASS_OTHER,
  /* ACK */ CCLASS_OTHER,  /* BEL */ CCLASS_OTHER,  /* BS  */ CCLASS_OTHER,
  /* HT  */ CCLASS_SPACE,  /* LF  */ CCLASS_OTHER,  /* VT  */ CCLASS_OTHER,
  /* FF  */ CCLASS_SPACE,  /* CR  */ CCLASS_SPACE,  /* SO  */ CCLASS_OTHER,
  /* SI  */ CCLASS_OTHER,  /* DLE */ CCLASS_OTHER,  /* DC1 */ CCLASS_OTHER,
  /* DC2 */ CCLASS_OTHER,  /* DC3 */ CCLASS_OTHER,  /* DC4 */ CCLASS_OTHER,
  /* NAK */ CCLASS_OTHER,  /* SYN */ CCLASS_OTHER,  /* ETB */ CCLASS_OTHER,
  /* CAN */ CCLASS_OTHER,  /* EM  */ CCLASS_OTHER,  /* SUB */ CCLASS_OTHER,
  /* ESC */ CCLASS_OTHER,  /* FS  */ CCLASS_OTHER,  /* GS  */ CCLASS_OTHER,
  /* RS  */ CCLASS_OTHER,  /* US  */ CCLASS_OTHER,  /* SP  */ CCLASS_SPACE,
  /* ! */ CCLASS_OTHER,    /* " */ CCLASS_OTHER,    /* # */ CCLASS_OTHER,
  /* $ */ CCLASS_OTHER,    /* % */ CCLASS_OTHER,    /* & */ CCLASS_OTHER,
  /* ' */ CCLASS_SILENT,   /* ( */ CCLASS_OTHER,    /* ) */ CCLASS_OTHER,
  /* * */ CCLASS_OTHER,    /* + */ CCLASS_OTHER,    /* , */ CCLASS_OTHER,
  /* - */ CCLASS_OTHER,    /* . */ CCLASS_OTHER,    /* / */ CCLASS_OTHER,
  /* 0 */ CCLASS_DIGIT,    /* 1 */ CCLASS_DIGIT,    /* 2 */ CCLASS_DIGIT,
  /* 3 */ CCLASS_DIGIT,    /* 4 */ CCLASS_DIGIT,    /* 5 */ CCLASS_DIGIT,
  /* 6 */ CCLASS_DIGIT,    /* 7 */ CCLASS_DIGIT,    /* 8 */ CCLASS_DIGIT,
  /* 9 */ CCLASS_DIGIT,    /* : */ CCLASS_OTHER,    /* ; */ CCLASS_OTHER,
  /* < */ CCLASS_OTHER,    /* = */ CCLASS_OTHER,    /* > */ CCLASS_OTHER,
  /* ? */ CCLASS_OTHER,    /* @ */ CCLASS_OTHER,    /* A */ CCLASS_VOWEL,
  /* B */ CCLASS_B,        /* C */ CCLASS_C,        /* D */ CCLASS_D,
  /* E */ CCLASS_VOWEL,    /* F */ CCLASS_B,        /* G */ CCLASS_C,
  /* H */ CCLASS_SILENT,   /* I */ CCLASS_VOWEL,    /* J */ CCLASS_C,
  /* K */ CCLASS_C,        /* L */ CCLASS_L,        /* M */ CCLASS_M,
  /* N */ CCLASS_M,        /* O */ CCLASS_VOWEL,    /* P */ CCLASS_B,
  /* Q */ CCLASS_C,        /* R */ CCLASS_R,        /* S */ CCLASS_C,
  /* T */ CCLASS_D,        /* U */ CCLASS_VOWEL,    /* V */ CCLASS_B,
  /* W */ CCLASS_B,        /* X */ CCLASS_C,        /* Y */ CCLASS_VOWEL,
  /* Z */ CCLASS_C,        /* [ */ CCLASS_OTHER,    /* \ */ CCLASS_OTHER,
  /* ] */ CCLASS_OTHER,    /* ^ */ CCLASS_OTHER,    /* _ */ CCLASS_OTHER,
  /* ` */ CCLASS_OTHER,    /* a */ CCLASS_VOWEL,    /* b */ CCLASS_B,
  /* c */ CCLASS_C,        /* d */ CCLASS_D,        /* e */ CCLASS_VOWEL,
  /* f */ CCLASS_B,        /* g */ CCLASS_C,        /* h */ CCLASS_SILENT,
  /* i */ CCLASS_VOWEL,    /* j */ CCLASS_C,        /* k */ CCLASS_C,
  /* l */ CCLASS_L,        /* m */ CCLASS_M,        /* n */ CCLASS_M,
  /* o */ CCLASS_VOWEL,    /* p */ CCLASS_B,        /* q */ CCLASS_C,
  /* r */ CCLASS_R,        /* s */ CCLASS_C,        /* t */ CCLASS_D,
  /* u */ CCLASS_VOWEL,    /* v */ CCLASS_B,        /* w */ CCLASS_B,
  /* x */ CCLASS_C,        /* y */ CCLASS_VOWEL,    /* z */ CCLASS_C,
  /* { */ CCLASS_OTHER,    /* | */ CCLASS_OTHER,    /* } */ CCLASS_OTHER,
  /* ~ */ CCLASS_OTHER,    /* DEL */ CCLASS_OTHER,
};
/*
** This table gives the character class for ASCII characters that form the
** initial character of a word.  As written, it differs from midClass[] in
** exactly two places: the apostrophe is CCLASS_OTHER here (CCLASS_SILENT
** in midClass[]) and Y/y is CCLASS_Y here (CCLASS_VOWEL in midClass[]).
** H/h and W/w have the same class in both tables.
**
** Like midClass[], the table has 128 entries indexed by the 7-bit
** character code.
*/
static const unsigned char initClass[] = {
  /* NUL */ CCLASS_OTHER,  /* SOH */ CCLASS_OTHER,  /* STX */ CCLASS_OTHER,
  /* ETX */ CCLASS_OTHER,  /* EOT */ CCLASS_OTHER,  /* ENQ */ CCLASS_OTHER,
  /* ACK */ CCLASS_OTHER,  /* BEL */ CCLASS_OTHER,  /* BS  */ CCLASS_OTHER,
  /* HT  */ CCLASS_SPACE,  /* LF  */ CCLASS_OTHER,  /* VT  */ CCLASS_OTHER,
  /* FF  */ CCLASS_SPACE,  /* CR  */ CCLASS_SPACE,  /* SO  */ CCLASS_OTHER,
  /* SI  */ CCLASS_OTHER,  /* DLE */ CCLASS_OTHER,  /* DC1 */ CCLASS_OTHER,
  /* DC2 */ CCLASS_OTHER,  /* DC3 */ CCLASS_OTHER,  /* DC4 */ CCLASS_OTHER,
  /* NAK */ CCLASS_OTHER,  /* SYN */ CCLASS_OTHER,  /* ETB */ CCLASS_OTHER,
  /* CAN */ CCLASS_OTHER,  /* EM  */ CCLASS_OTHER,  /* SUB */ CCLASS_OTHER,
  /* ESC */ CCLASS_OTHER,  /* FS  */ CCLASS_OTHER,  /* GS  */ CCLASS_OTHER,
  /* RS  */ CCLASS_OTHER,  /* US  */ CCLASS_OTHER,  /* SP  */ CCLASS_SPACE,
  /* ! */ CCLASS_OTHER,    /* " */ CCLASS_OTHER,    /* # */ CCLASS_OTHER,
  /* $ */ CCLASS_OTHER,    /* % */ CCLASS_OTHER,    /* & */ CCLASS_OTHER,
  /* ' */ CCLASS_OTHER,    /* ( */ CCLASS_OTHER,    /* ) */ CCLASS_OTHER,
  /* * */ CCLASS_OTHER,    /* + */ CCLASS_OTHER,    /* , */ CCLASS_OTHER,
  /* - */ CCLASS_OTHER,    /* . */ CCLASS_OTHER,    /* / */ CCLASS_OTHER,
  /* 0 */ CCLASS_DIGIT,    /* 1 */ CCLASS_DIGIT,    /* 2 */ CCLASS_DIGIT,
  /* 3 */ CCLASS_DIGIT,    /* 4 */ CCLASS_DIGIT,    /* 5 */ CCLASS_DIGIT,
  /* 6 */ CCLASS_DIGIT,    /* 7 */ CCLASS_DIGIT,    /* 8 */ CCLASS_DIGIT,
  /* 9 */ CCLASS_DIGIT,    /* : */ CCLASS_OTHER,    /* ; */ CCLASS_OTHER,
  /* < */ CCLASS_OTHER,    /* = */ CCLASS_OTHER,    /* > */ CCLASS_OTHER,
  /* ? */ CCLASS_OTHER,    /* @ */ CCLASS_OTHER,    /* A */ CCLASS_VOWEL,
  /* B */ CCLASS_B,        /* C */ CCLASS_C,        /* D */ CCLASS_D,
  /* E */ CCLASS_VOWEL,    /* F */ CCLASS_B,        /* G */ CCLASS_C,
  /* H */ CCLASS_SILENT,   /* I */ CCLASS_VOWEL,    /* J */ CCLASS_C,
  /* K */ CCLASS_C,        /* L */ CCLASS_L,        /* M */ CCLASS_M,
  /* N */ CCLASS_M,        /* O */ CCLASS_VOWEL,    /* P */ CCLASS_B,
  /* Q */ CCLASS_C,        /* R */ CCLASS_R,        /* S */ CCLASS_C,
  /* T */ CCLASS_D,        /* U */ CCLASS_VOWEL,    /* V */ CCLASS_B,
  /* W */ CCLASS_B,        /* X */ CCLASS_C,        /* Y */ CCLASS_Y,
  /* Z */ CCLASS_C,        /* [ */ CCLASS_OTHER,    /* \ */ CCLASS_OTHER,
  /* ] */ CCLASS_OTHER,    /* ^ */ CCLASS_OTHER,    /* _ */ CCLASS_OTHER,
  /* ` */ CCLASS_OTHER,    /* a */ CCLASS_VOWEL,    /* b */ CCLASS_B,
  /* c */ CCLASS_C,        /* d */ CCLASS_D,        /* e */ CCLASS_VOWEL,
  /* f */ CCLASS_B,        /* g */ CCLASS_C,        /* h */ CCLASS_SILENT,
  /* i */ CCLASS_VOWEL,    /* j */ CCLASS_C,        /* k */ CCLASS_C,
  /* l */ CCLASS_L,        /* m */ CCLASS_M,        /* n */ CCLASS_M,
  /* o */ CCLASS_VOWEL,    /* p */ CCLASS_B,        /* q */ CCLASS_C,
  /* r */ CCLASS_R,        /* s */ CCLASS_C,        /* t */ CCLASS_D,
  /* u */ CCLASS_VOWEL,    /* v */ CCLASS_B,        /* w */ CCLASS_B,
  /* x */ CCLASS_C,        /* y */ CCLASS_Y,        /* z */ CCLASS_C,
  /* { */ CCLASS_OTHER,    /* | */ CCLASS_OTHER,    /* } */ CCLASS_OTHER,
  /* ~ */ CCLASS_OTHER,    /* DEL */ CCLASS_OTHER,
};
170
/*
** Mapping from the character class number (0-12) to a symbol for each
** character class.  The string has one symbol per CCLASS_* value, in
** numeric order.
**
** initClass[] can be used to map a class symbol back into its class
** number.  NOTE(review): with the tables as written this round trip holds
** for every symbol except '.' (class 0) and 'H' (class 5), which
** initClass[] maps to CCLASS_OTHER and CCLASS_SILENT respectively.
*/
static const unsigned char className[] = ".ABCDHLRMY9 ?";
177
178/*
179** Generate a "phonetic hash" from a string of ASCII characters
180** in zIn[0..nIn-1].
181**
182** * Map characters by character class as defined above.
183** * Omit double-letters
184** * Omit vowels beside R and L
185** * Omit T when followed by CH
186** * Omit W when followed by R
187** * Omit D when followed by J or G
188** * Omit K in KN or G in GN at the beginning of a word
189**
190** Space to hold the result is obtained from sqlite3_malloc()
191**
192** Return NULL if memory allocation fails.
193*/
194static unsigned char *phoneticHash(const unsigned char *zIn, int nIn)
195{
196 unsigned char *zOut = sqlite3_malloc64( nIn + 1 );
197 int i;
198 int nOut = 0;
199 char cPrev = 0x77;
200 char cPrevX = 0x77;
201 const unsigned char *aClass = initClass;
202
203 if( zOut==0 ) return 0;
204 if( nIn>2 ) {
205 switch( zIn[0] ) {
206 case 'g':
207 case 'k': {
208 if( zIn[1]=='n' ) {
209 zIn++;
210 nIn--;
211 }
212 break;
213 }
214 }
215 }
216 for(i=0; i<nIn; i++) {
217 unsigned char c = zIn[i];
218 if( i+1<nIn ) {
219 if( c=='w' && zIn[i+1]=='r' ) continue;
220 if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue;
221 if( i+2<nIn ) {
222 if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue;
223 }
224 }
225 c = aClass[c&0x7f];
226 if( c==CCLASS_SPACE ) continue;
227 if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue;
228 aClass = midClass;
229 if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ) {
230 continue; /* No vowels beside L or R */
231 }
232 if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ) {
233 nOut--; /* No vowels beside L or R */
234 }
235 cPrev = c;
236 if( c==CCLASS_SILENT ) continue;
237 cPrevX = c;
238 c = className[c];
239 assert( nOut>=0 );
240 if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c;
241 }
242 zOut[nOut] = 0;
243 return zOut;
244}
245
246/*
247** This is an SQL function wrapper around phoneticHash(). See
248** the description of phoneticHash() for additional information.
249*/
250static void phoneticHashSqlFunc(
251 sqlite3_context *context,
252 int argc,
253 sqlite3_value **argv
254)
255{
256 const unsigned char *zIn;
257 unsigned char *zOut;
258
259 zIn = sqlite3_value_text(argv[0]);
260 if( zIn==0 ) return;
261 zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0]));
262 if( zOut==0 ) {
263 sqlite3_result_error_nomem(context);
264 } else {
265 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
266 }
267}
268
269/*
270** Return the character class number for a character given its
271** context.
272*/
273static char characterClass(char cPrev, char c)
274{
275 return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f];
276}
277
278/*
279** Return the cost of inserting or deleting character c immediately
280** following character cPrev. If cPrev==0, that means c is the first
281** character of the word.
282*/
283static int insertOrDeleteCost(char cPrev, char c, char cNext)
284{
285 char classC = characterClass(cPrev, c);
286 char classCprev;
287
288 if( classC==CCLASS_SILENT ) {
289 /* Insert or delete "silent" characters such as H or W */
290 return 1;
291 }
292 if( cPrev==c ) {
293 /* Repeated characters, or miss a repeat */
294 return 10;
295 }
296 if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ) {
297 return 20; /* Insert a vowel before or after 'r' */
298 }
299 classCprev = characterClass(cPrev, cPrev);
300 if( classC==classCprev ) {
301 if( classC==CCLASS_VOWEL ) {
302 /* Remove or add a new vowel to a vowel cluster */
303 return 15;
304 } else {
305 /* Remove or add a consonant not in the same class */
306 return 50;
307 }
308 }
309
310 /* any other character insertion or deletion */
311 return 100;
312}
313
/*
** Divide the insertion cost by this factor when appending to the
** end of the word.  editdist1() applies it (using integer division) to
** insertions made once the last character of zA has been reached, and to
** every insertion when nothing of zA remains after the common prefix.
*/
#define FINAL_INS_COST_DIV 4
319
320/*
321** Return the cost of substituting cTo in place of cFrom assuming
322** the previous character is cPrev. If cPrev==0 then cTo is the first
323** character of the word.
324*/
325static int substituteCost(char cPrev, char cFrom, char cTo)
326{
327 char classFrom, classTo;
328 if( cFrom==cTo ) {
329 /* Exact match */
330 return 0;
331 }
332 if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ) {
333 /* differ only in case */
334 return 0;
335 }
336 classFrom = characterClass(cPrev, cFrom);
337 classTo = characterClass(cPrev, cTo);
338 if( classFrom==classTo ) {
339 /* Same character class */
340 return 40;
341 }
342 if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
343 && classTo>=CCLASS_B && classTo<=CCLASS_Y ) {
344 /* Convert from one consonant to another, but in a different class */
345 return 75;
346 }
347 /* Any other subsitution */
348 return 100;
349}
350
/*
** Given two strings zA and zB which are pure ASCII, return the cost
** of transforming zA into zB.  If zA ends with '*' assume that it is
** a prefix of zB and give only minimal penalty for extra characters
** on the end of zB.
**
** Smaller numbers mean a closer match.
**
** Negative values indicate an error:
**    -1  One of the inputs is NULL
**    -2  Non-ASCII characters on input
**    -3  Unable to allocate memory
**
** If pnMatch is not NULL, *pnMatch is first set to the number of bytes in
** the common prefix of zA and zB.  When zA ends in '*', the final scan may
** then raise it to the number of bytes in the prefix of zB that was deemed
** to match zA.
**
** NOTE(review): every early return (identical strings, an input that is
** empty after the common prefix, a bare "*" remainder, errors) leaves
** *pnMatch at the common-prefix length.  When the main computation runs
** for a zA that does not end in '*', pnMatch must be NULL (see the
** assert() near the end).
**
** Implementation: a Wagner-style edit distance that keeps a single row
** m[0..nB] of costs, updated in place for each character of zA.  A
** parallel array cx[0..nB] remembers, for each cell, the character that
** the best path to that cell ended on; it supplies the "previous
** character" context to the cost functions.
*/
static int editdist1(const char *zA, const char *zB, int *pnMatch){
  int nA, nB;            /* Number of characters in zA[] and zB[] */
  int xA, xB;            /* Loop counters for zA[] and zB[] */
  char cA = 0, cB;       /* Current character of zA and zB */
  char cAprev, cBprev;   /* Previous character of zA and zB */
  char cAnext, cBnext;   /* Next character in zA and zB */
  int d;                 /* North-west cost value */
  int dc = 0;            /* North-west character value */
  int res;               /* Final result */
  int *m;                /* The cost matrix */
  char *cx;              /* Corresponding character values */
  int *toFree = 0;       /* Malloced space */
  int nMatch = 0;        /* Length of the common prefix of zA and zB */
  int mStack[60+15];     /* Stack space to use if not too much is needed */

  /* Early out if either input is NULL */
  if( zA==0 || zB==0 ) return -1;

  /* Skip any common prefix.  dc remembers the last skipped character so
  ** that it can serve as the "previous character" for what follows. */
  while( zA[0] && zA[0]==zB[0] ){
    dc = zA[0];
    zA++;
    zB++;
    nMatch++;
  }
  if( pnMatch ) *pnMatch = nMatch;
  if( zA[0]==0 && zB[0]==0 ) return 0;

#if 0
  printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
#endif

  /* Verify input strings and measure their lengths */
  for(nA=0; zA[nA]; nA++){
    if( zA[nA]&0x80 ) return -2;
  }
  for(nB=0; zB[nB]; nB++){
    if( zB[nB]&0x80 ) return -2;
  }

  /* Special processing if either string is empty.  With nothing left of
  ** zA, the rest of zB is all insertions at the end of the word, so each
  ** one is discounted by FINAL_INS_COST_DIV. */
  if( nA==0 ){
    cBprev = (char)dc;
    for(xB=res=0; (cB = zB[xB])!=0; xB++){
      res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV;
      cBprev = cB;
    }
    return res;
  }
  /* With nothing left of zB, the rest of zA is deleted at full cost */
  if( nB==0 ){
    cAprev = (char)dc;
    for(xA=res=0; (cA = zA[xA])!=0; xA++){
      res += insertOrDeleteCost(cAprev, cA, zA[xA+1]);
      cAprev = cA;
    }
    return res;
  }

  /* A is a prefix of B */
  if( zA[0]=='*' && zA[1]==0 ) return 0;

  /* Allocate and initialize the Wagner matrix.  The buffer holds nB+1
  ** ints (m[]) followed by nB+1 chars (cx[]).  The stack buffer is used
  ** when nB is less than 60. */
  if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){
    m = mStack;
  }else{
    m = toFree = sqlite3_malloc64( (nB+1)*5*sizeof(m[0])/4 );
    if( m==0 ) return -3;
  }
  cx = (char*)&m[nB+1];

  /* Compute the Wagner edit distance */
  /* Row 0: cost of building each prefix of zB purely by insertion */
  m[0] = 0;
  cx[0] = (char)dc;
  cBprev = (char)dc;
  for(xB=1; xB<=nB; xB++){
    cBnext = zB[xB];
    cB = zB[xB-1];
    cx[xB] = cB;
    m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext);
    cBprev = cB;
  }
  /* One pass per character of zA, overwriting the row in place */
  cAprev = (char)dc;
  for(xA=1; xA<=nA; xA++){
    int lastA = (xA==nA);
    cA = zA[xA-1];
    cAnext = zA[xA];
    /* A trailing '*' is a prefix marker, not a character to match */
    if( cA=='*' && lastA ) break;
    /* d/dc hold the previous row's value for column xB-1 (north-west) */
    d = m[0];
    dc = cx[0];
    m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
    cBprev = 0;
    for(xB=1; xB<=nB; xB++){
      int totalCost, insCost, delCost, subCost, ncx;
      cB = zB[xB-1];
      cBnext = zB[xB];

      /* Cost to insert cB */
      insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext);
      if( lastA ) insCost /= FINAL_INS_COST_DIV;

      /* Cost to delete cA */
      delCost = insertOrDeleteCost(cx[xB], cA, cBnext);

      /* Cost to substitute cA->cB */
      subCost = substituteCost(cx[xB-1], cA, cB);

      /* Best cost.  m[xB-1] already belongs to the current row (west);
      ** m[xB] still holds the previous row's value (north). */
      totalCost = insCost + m[xB-1];
      ncx = cB;
      if( (delCost + m[xB])<totalCost ){
        totalCost = delCost + m[xB];
        ncx = cA;
      }
      if( (subCost + d)<totalCost ){
        totalCost = subCost + d;
      }

#if 0
      printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
             " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
             xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
             insCost, delCost, subCost, totalCost, ncx?ncx:' ');
#endif

      /* Update the matrix.  Save the old cell as the next north-west
      ** value before overwriting it. */
      d = m[xB];
      dc = cx[xB];
      m[xB] = totalCost;
      cx[xB] = (char)ncx;
      cBprev = cB;
    }
    cAprev = cA;
  }

  /* Free the wagner matrix and return the result */
  if( cA=='*' ){
    /* Prefix match: take the cheapest cell among columns 1..nB.  *pnMatch
    ** is only updated when a column strictly cheaper than m[1] is found. */
    res = m[1];
    for(xB=1; xB<=nB; xB++){
      if( m[xB]<res ){
        res = m[xB];
        if( pnMatch ) *pnMatch = xB+nMatch;
      }
    }
  }else{
    res = m[nB];
    /* In the current implementation, pnMatch is always NULL if zA does
    ** not end in "*" */
    assert( pnMatch==0 );
  }
  sqlite3_free(toFree);
  return res;
}
523
524/*
525** Function: editdist(A,B)
526**
527** Return the cost of transforming string A into string B. Both strings
528** must be pure ASCII text. If A ends with '*' then it is assumed to be
529** a prefix of B and extra characters on the end of B have minimal additional
530** cost.
531*/
532static void editdistSqlFunc(
533 sqlite3_context *context,
534 int argc,
535 sqlite3_value **argv
536)
537{
538 int res = editdist1(
539 (const char*)sqlite3_value_text(argv[0]),
540 (const char*)sqlite3_value_text(argv[1]),
541 0);
542 if( res<0 ) {
543 if( res==(-3) ) {
544 sqlite3_result_error_nomem(context);
545 } else if( res==(-2) ) {
546 sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
547 } else {
548 sqlite3_result_error(context, "NULL input to editdist()", -1);
549 }
550 } else {
551 sqlite3_result_int(context, res);
552 }
553}
554
555/* End of the fixed-cost edit distance implementation
556******************************************************************************
557*****************************************************************************
558** Begin: Configurable cost unicode edit distance routines
559*/
/* Forward declaration of structures */
typedef struct EditDist3Cost EditDist3Cost;             /* One edit cost table entry */
typedef struct EditDist3Config EditDist3Config;         /* Complete configuration */
typedef struct EditDist3Point EditDist3Point;
typedef struct EditDist3From EditDist3From;             /* Per-character FROM info */
typedef struct EditDist3FromString EditDist3FromString; /* Precompiled FROM string */
typedef struct EditDist3To EditDist3To;                 /* Per-character TO info */
typedef struct EditDist3ToString EditDist3ToString;     /* Precompiled TO string */
typedef struct EditDist3Lang EditDist3Lang;             /* Costs for one language ID */
569
/*
** An entry in the edit cost table.
**
** The FROM text occupies a[0..nFrom-1] and the TO text follows it at
** a[nFrom..nFrom+nTo-1].  When nFrom+nTo exceeds 4, the entry is allocated
** with extra trailing bytes so that a[] can hold both strings.  Entries
** are chained through pNext to form the cost list of one language.
*/
struct EditDist3Cost {
  EditDist3Cost *pNext;     /* Next cost element */
  u8 nFrom;                 /* Number of bytes in aFrom */
  u8 nTo;                   /* Number of bytes in aTo */
  u16 iCost;                /* Cost of this transformation */
  char a[4] ;               /* FROM string followed by TO string */
  /* Additional TO and FROM string bytes appended as necessary */
};
581
/*
** Edit costs for a particular language ID.
**
** The three default costs apply when no specific entry in pCost matches.
** editDist3ConfigLoad() initializes them to 100, 100 and 150 and sorts
** pCost into order of increasing FROM text once loading has succeeded.
*/
struct EditDist3Lang {
  int iLang;             /* Language ID */
  int iInsCost;          /* Default insertion cost */
  int iDelCost;          /* Default deletion cost */
  int iSubCost;          /* Default substitution cost */
  EditDist3Cost *pCost;  /* Costs */
};
592
/*
** The default EditDist3Lang object, with default costs: language ID 0,
** insertion cost 100, deletion cost 100, substitution cost 150, and no
** specific cost entries.
*/
static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 };
597
/*
** Complete configuration: the edit costs of every language ID that was
** loaded.  The a[] array is heap allocated and is released, together with
** all cost entries it refers to, by editDist3ConfigClear().
*/
struct EditDist3Config {
  int nLang;             /* Number of language IDs.  Size of a[] */
  EditDist3Lang *a;      /* One for each distinct language ID */
};
605
/*
** Extra information about each character in the FROM string.
**
** editDist3FromStringNew() fills in one of these for every byte offset
** of the FROM string.  apSubst and apDel are heap arrays of pointers to
** cost entries whose FROM text matches at that offset; entries with an
** empty TO text go into apDel, all others into apSubst.
*/
struct EditDist3From {
  int nSubst;              /* Number of substitution cost entries */
  int nDel;                /* Number of deletion cost entries */
  int nByte;               /* Number of bytes in this character */
  EditDist3Cost **apSubst; /* Array of substitution costs for this element */
  EditDist3Cost **apDel;   /* Array of deletion cost entries */
};
616
/*
** A precompiled FROM string.
**
** In the common case we expect the FROM string to be reused multiple times.
** In other words, the common case will be to measure the edit distance
** from a single origin string to multiple target strings.
**
** As built by editDist3FromStringNew(), the object, its a[] array and the
** copy of the text in z all live in a single allocation.  n counts bytes
** and excludes a trailing '*', which is recorded in isPrefix instead.
*/
struct EditDist3FromString {
  char *z;                 /* The complete text of the FROM string */
  int n;                   /* Number of bytes in the FROM string */
  int isPrefix;            /* True if ends with '*' character */
  EditDist3From *a;        /* Extra info about each char of the FROM string */
};
630
/*
** Extra information about each character in the TO string.
*/
struct EditDist3To {
  int nIns;                /* Number of insertion cost entries */
  int nByte;               /* Number of bytes in this character */
  EditDist3Cost **apIns;   /* Array of insertion cost entries */
};
639
/*
** A precompiled TO string
*/
struct EditDist3ToString {
  char *z;                 /* The complete text of the TO string */
  int n;                   /* Number of characters in the TO string */
  EditDist3To *a;          /* Extra info about each char of the TO string */
};
648
649/*
650** Clear or delete an instance of the object that records all edit-distance
651** weights.
652*/
653static void editDist3ConfigClear(EditDist3Config *p)
654{
655 int i;
656 if( p==0 ) return;
657 for(i=0; i<p->nLang; i++) {
658 EditDist3Cost *pCost, *pNext;
659 pCost = p->a[i].pCost;
660 while( pCost ) {
661 pNext = pCost->pNext;
662 sqlite3_free(pCost);
663 pCost = pNext;
664 }
665 }
666 sqlite3_free(p->a);
667 memset(p, 0, sizeof(*p));
668}
669static void editDist3ConfigDelete(void *pIn)
670{
671 EditDist3Config *p = (EditDist3Config*)pIn;
672 editDist3ConfigClear(p);
673 sqlite3_free(p);
674}
675
676/* Compare the FROM values of two EditDist3Cost objects, for sorting.
677** Return negative, zero, or positive if the A is less than, equal to,
678** or greater than B.
679*/
680static int editDist3CostCompare(EditDist3Cost *pA, EditDist3Cost *pB)
681{
682 int n = pA->nFrom;
683 int rc;
684 if( n>pB->nFrom ) n = pB->nFrom;
685 rc = strncmp(pA->a, pB->a, n);
686 if( rc==0 ) rc = pA->nFrom - pB->nFrom;
687 return rc;
688}
689
690/*
691** Merge together two sorted lists of EditDist3Cost objects, in order
692** of increasing FROM.
693*/
694static EditDist3Cost *editDist3CostMerge(
695 EditDist3Cost *pA,
696 EditDist3Cost *pB
697)
698{
699 EditDist3Cost *pHead = 0;
700 EditDist3Cost **ppTail = &pHead;
701 EditDist3Cost *p;
702 while( pA && pB ) {
703 if( editDist3CostCompare(pA,pB)<=0 ) {
704 p = pA;
705 pA = pA->pNext;
706 } else {
707 p = pB;
708 pB = pB->pNext;
709 }
710 *ppTail = p;
711 ppTail = &p->pNext;
712 }
713 if( pA ) {
714 *ppTail = pA;
715 } else {
716 *ppTail = pB;
717 }
718 return pHead;
719}
720
721/*
722** Sort a list of EditDist3Cost objects into order of increasing FROM
723*/
724static EditDist3Cost *editDist3CostSort(EditDist3Cost *pList)
725{
726 EditDist3Cost *ap[60], *p;
727 int i;
728 int mx = 0;
729 ap[0] = 0;
730 ap[1] = 0;
731 while( pList ) {
732 p = pList;
733 pList = p->pNext;
734 p->pNext = 0;
735 for(i=0; ap[i]; i++) {
736 p = editDist3CostMerge(ap[i],p);
737 ap[i] = 0;
738 }
739 ap[i] = p;
740 if( i>mx ) {
741 mx = i;
742 ap[i+1] = 0;
743 }
744 }
745 p = 0;
746 for(i=0; i<=mx; i++) {
747 if( ap[i] ) p = editDist3CostMerge(p,ap[i]);
748 }
749 return p;
750}
751
752/*
753** Load all edit-distance weights from a table.
754*/
755static int editDist3ConfigLoad(
756 EditDist3Config *p, /* The edit distance configuration to load */
757 sqlite3 *db, /* Load from this database */
758 const char *zTable /* Name of the table from which to load */
759)
760{
761 sqlite3_stmt *pStmt;
762 int rc, rc2;
763 char *zSql;
764 int iLangPrev = -9999;
765 EditDist3Lang *pLang = 0;
766
767 zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
768 " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable);
769 if( zSql==0 ) return SQLITE_NOMEM;
770 rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
771 sqlite3_free(zSql);
772 if( rc ) return rc;
773 editDist3ConfigClear(p);
774 while( sqlite3_step(pStmt)==SQLITE_ROW ) {
775 int iLang = sqlite3_column_int(pStmt, 0);
776 const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1);
777 int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0;
778 const char *zTo = (const char*)sqlite3_column_text(pStmt, 2);
779 int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0;
780 int iCost = sqlite3_column_int(pStmt, 3);
781
782 assert( zFrom!=0 || nFrom==0 );
783 assert( zTo!=0 || nTo==0 );
784 if( nFrom>100 || nTo>100 ) continue;
785 if( iCost<0 ) continue;
786 if( iCost>=10000 ) continue; /* Costs above 10K are considered infinite */
787 if( pLang==0 || iLang!=iLangPrev ) {
788 EditDist3Lang *pNew;
789 pNew = sqlite3_realloc64(p->a, (p->nLang+1)*sizeof(p->a[0]));
790 if( pNew==0 ) {
791 rc = SQLITE_NOMEM;
792 break;
793 }
794 p->a = pNew;
795 pLang = &p->a[p->nLang];
796 p->nLang++;
797 pLang->iLang = iLang;
798 pLang->iInsCost = 100;
799 pLang->iDelCost = 100;
800 pLang->iSubCost = 150;
801 pLang->pCost = 0;
802 iLangPrev = iLang;
803 }
804 if( nFrom==1 && zFrom[0]=='?' && nTo==0 ) {
805 pLang->iDelCost = iCost;
806 } else if( nFrom==0 && nTo==1 && zTo[0]=='?' ) {
807 pLang->iInsCost = iCost;
808 } else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ) {
809 pLang->iSubCost = iCost;
810 } else {
811 EditDist3Cost *pCost;
812 int nExtra = nFrom + nTo - 4;
813 if( nExtra<0 ) nExtra = 0;
814 pCost = sqlite3_malloc64( sizeof(*pCost) + nExtra );
815 if( pCost==0 ) {
816 rc = SQLITE_NOMEM;
817 break;
818 }
819 pCost->nFrom = (u8)nFrom;
820 pCost->nTo = (u8)nTo;
821 pCost->iCost = (u16)iCost;
822 memcpy(pCost->a, zFrom, nFrom);
823 memcpy(pCost->a + nFrom, zTo, nTo);
824 pCost->pNext = pLang->pCost;
825 pLang->pCost = pCost;
826 }
827 }
828 rc2 = sqlite3_finalize(pStmt);
829 if( rc==SQLITE_OK ) rc = rc2;
830 if( rc==SQLITE_OK ) {
831 int iLang;
832 for(iLang=0; iLang<p->nLang; iLang++) {
833 p->a[iLang].pCost = editDist3CostSort(p->a[iLang].pCost);
834 }
835 }
836 return rc;
837}
838
/*
** Return the length in bytes of the UTF-8 character whose first byte is
** c, but never more than N.
**
** The length is taken from the lead byte alone: 1 for 0x00-0x7f, 2 for
** 110xxxxx, 3 for 1110xxxx, and 4 for every other byte value.
*/
static int utf8Len(unsigned char c, int N){
  int nByte;
  if( c<=0x7f ){
    nByte = 1;
  }else if( (c&0xe0)==0xc0 ){
    nByte = 2;
  }else if( (c&0xf0)==0xe0 ){
    nByte = 3;
  }else{
    nByte = 4;
  }
  return nByte>N ? N : nByte;
}
858
859/*
860** Return TRUE (non-zero) if the To side of the given cost matches
861** the given string.
862*/
863static int matchTo(EditDist3Cost *p, const char *z, int n)
864{
865 assert( n>0 );
866 if( p->a[p->nFrom]!=z[0] ) return 0;
867 if( p->nTo>n ) return 0;
868 if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0;
869 return 1;
870}
871
872/*
873** Return TRUE (non-zero) if the From side of the given cost matches
874** the given string.
875*/
876static int matchFrom(EditDist3Cost *p, const char *z, int n)
877{
878 assert( p->nFrom<=n );
879 if( p->nFrom ) {
880 if( p->a[0]!=z[0] ) return 0;
881 if( strncmp(p->a, z, p->nFrom)!=0 ) return 0;
882 }
883 return 1;
884}
885
886/*
887** Return TRUE (non-zero) of the next FROM character and the next TO
888** character are the same.
889*/
890static int matchFromTo(
891 EditDist3FromString *pStr, /* Left hand string */
892 int n1, /* Index of comparison character on the left */
893 const char *z2, /* Right-handl comparison character */
894 int n2 /* Bytes remaining in z2[] */
895)
896{
897 int b1 = pStr->a[n1].nByte;
898 if( b1>n2 ) return 0;
899 assert( b1>0 );
900 if( pStr->z[n1]!=z2[0] ) return 0;
901 if( strncmp(pStr->z+n1, z2, b1)!=0 ) return 0;
902 return 1;
903}
904
905/*
906** Delete an EditDist3FromString objecct
907*/
908static void editDist3FromStringDelete(EditDist3FromString *p)
909{
910 int i;
911 if( p ) {
912 for(i=0; i<p->n; i++) {
913 sqlite3_free(p->a[i].apDel);
914 sqlite3_free(p->a[i].apSubst);
915 }
916 sqlite3_free(p);
917 }
918}
919
920/*
921** Create a EditDist3FromString object.
922*/
923static EditDist3FromString *editDist3FromStringNew(
924 const EditDist3Lang *pLang,
925 const char *z,
926 int n
927)
928{
929 EditDist3FromString *pStr;
930 EditDist3Cost *p;
931 int i;
932
933 if( z==0 ) return 0;
934 if( n<0 ) n = (int)strlen(z);
935 pStr = sqlite3_malloc64( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 );
936 if( pStr==0 ) return 0;
937 pStr->a = (EditDist3From*)&pStr[1];
938 memset(pStr->a, 0, sizeof(pStr->a[0])*n);
939 pStr->n = n;
940 pStr->z = (char*)&pStr->a[n];
941 memcpy(pStr->z, z, n+1);
942 if( n && z[n-1]=='*' ) {
943 pStr->isPrefix = 1;
944 n--;
945 pStr->n--;
946 pStr->z[n] = 0;
947 } else {
948 pStr->isPrefix = 0;
949 }
950
951 for(i=0; i<n; i++) {
952 EditDist3From *pFrom = &pStr->a[i];
953 memset(pFrom, 0, sizeof(*pFrom));
954 pFrom->nByte = utf8Len((unsigned char)z[i], n-i);
955 for(p=pLang->pCost; p; p=p->pNext) {
956 EditDist3Cost **apNew;
957 if( i+p->nFrom>n ) continue;
958 if( matchFrom(p, z+i, n-i)==0 ) continue;
959 if( p->nTo==0 ) {
960 apNew = sqlite3_realloc64(pFrom->apDel,
961 sizeof(*apNew)*(pFrom->nDel+1));
962 if( apNew==0 ) break;
963 pFrom->apDel = apNew;
964 apNew[pFrom->nDel++] = p;
965 } else {
966 apNew = sqlite3_realloc64(pFrom->apSubst,
967 sizeof(*apNew)*(pFrom->nSubst+1));
968 if( apNew==0 ) break;
969 pFrom->apSubst = apNew;
970 apNew[pFrom->nSubst++] = p;
971 }
972 }
973 if( p ) {
974 editDist3FromStringDelete(pStr);
975 pStr = 0;
976 break;
977 }
978 }
979 return pStr;
980}
981
/*
** Lower m[i] to m[j]+iCost if that sum is smaller than the current value
** of m[i]; otherwise leave m[i] unchanged.  iCost must be in the range
** 0..9999.
*/
static void updateCost(
  unsigned int *m,
  int i,
  int j,
  int iCost
){
  unsigned int cand;
  assert( iCost>=0 );
  assert( iCost<10000 );
  cand = m[j] + iCost;
  if( m[i]>cand ) m[i] = cand;
}
999
/*
** How much stack space (in bytes) to use for the Wagner matrix in
** editDist3Core().  If more space than this is required, the entire
** matrix is taken from the heap.  To reduce the load on the memory
** allocator, make this value as large as practical for the
** architecture in use.
**
** The value may be overridden at compile time by defining
** SQLITE_SPELLFIX_STACKALLOC_SZ before this point.
*/
#ifndef SQLITE_SPELLFIX_STACKALLOC_SZ
# define SQLITE_SPELLFIX_STACKALLOC_SZ (1024)
#endif
1010
1011/* Compute the edit distance between two strings.
1012**
1013** If an error occurs, return a negative number which is the error code.
1014**
1015** If pnMatch is not NULL, then *pnMatch is set to the number of characters
1016** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
1017** not contain the pattern for a prefix-search, then this is always the number
1018** of characters in z2. If pFrom does contain a prefix search pattern, then
1019** it is the number of characters in the prefix of z2 that was deemed to
1020** match pFrom.
1021*/
1022static int editDist3Core(
1023 EditDist3FromString *pFrom, /* The FROM string */
1024 const char *z2, /* The TO string */
1025 int n2, /* Length of the TO string */
1026 const EditDist3Lang *pLang, /* Edit weights for a particular language ID */
1027 int *pnMatch /* OUT: Characters in matched prefix */
1028)
1029{
1030 int k, n;
1031 int i1, b1;
1032 int i2, b2;
1033 EditDist3FromString f = *pFrom;
1034 EditDist3To *a2;
1035 unsigned int *m;
1036 unsigned int *pToFree;
1037 int szRow;
1038 EditDist3Cost *p;
1039 int res;
1040 sqlite3_uint64 nByte;
1041 unsigned int stackSpace[SQLITE_SPELLFIX_STACKALLOC_SZ/sizeof(unsigned int)];
1042
1043 /* allocate the Wagner matrix and the aTo[] array for the TO string */
1044 n = (f.n+1)*(n2+1);
1045 n = (n+1)&~1;
1046 nByte = n*sizeof(m[0]) + sizeof(a2[0])*n2;
1047 if( nByte<=sizeof(stackSpace) ) {
1048 m = stackSpace;
1049 pToFree = 0;
1050 } else {
1051 m = pToFree = sqlite3_malloc64( nByte );
1052 if( m==0 ) return -1; /* Out of memory */
1053 }
1054 a2 = (EditDist3To*)&m[n];
1055 memset(a2, 0, sizeof(a2[0])*n2);
1056
1057 /* Fill in the a1[] matrix for all characters of the TO string */
1058 for(i2=0; i2<n2; i2++) {
1059 a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2);
1060 for(p=pLang->pCost; p; p=p->pNext) {
1061 EditDist3Cost **apNew;
1062 if( p->nFrom>0 ) break;
1063 if( i2+p->nTo>n2 ) continue;
1064 if( p->a[0]>z2[i2] ) break;
1065 if( matchTo(p, z2+i2, n2-i2)==0 ) continue;
1066 a2[i2].nIns++;
1067 apNew = sqlite3_realloc64(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns);
1068 if( apNew==0 ) {
1069 res = -1; /* Out of memory */
1070 goto editDist3Abort;
1071 }
1072 a2[i2].apIns = apNew;
1073 a2[i2].apIns[a2[i2].nIns-1] = p;
1074 }
1075 }
1076
1077 /* Prepare to compute the minimum edit distance */
1078 szRow = f.n+1;
1079 memset(m, 0x01, (n2+1)*szRow*sizeof(m[0]));
1080 m[0] = 0;
1081
1082 /* First fill in the top-row of the matrix with FROM deletion costs */
1083 for(i1=0; i1<f.n; i1 += b1) {
1084 b1 = f.a[i1].nByte;
1085 updateCost(m, i1+b1, i1, pLang->iDelCost);
1086 for(k=0; k<f.a[i1].nDel; k++) {
1087 p = f.a[i1].apDel[k];
1088 updateCost(m, i1+p->nFrom, i1, p->iCost);
1089 }
1090 }
1091
1092 /* Fill in all subsequent rows, top-to-bottom, left-to-right */
1093 for(i2=0; i2<n2; i2 += b2) {
1094 int rx; /* Starting index for current row */
1095 int rxp; /* Starting index for previous row */
1096 b2 = a2[i2].nByte;
1097 rx = szRow*(i2+b2);
1098 rxp = szRow*i2;
1099 updateCost(m, rx, rxp, pLang->iInsCost);
1100 for(k=0; k<a2[i2].nIns; k++) {
1101 p = a2[i2].apIns[k];
1102 updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost);
1103 }
1104 for(i1=0; i1<f.n; i1+=b1) {
1105 int cx; /* Index of current cell */
1106 int cxp; /* Index of cell immediately to the left */
1107 int cxd; /* Index of cell to the left and one row above */
1108 int cxu; /* Index of cell immediately above */
1109 b1 = f.a[i1].nByte;
1110 cxp = rx + i1;
1111 cx = cxp + b1;
1112 cxd = rxp + i1;
1113 cxu = cxd + b1;
1114 updateCost(m, cx, cxp, pLang->iDelCost);
1115 for(k=0; k<f.a[i1].nDel; k++) {
1116 p = f.a[i1].apDel[k];
1117 updateCost(m, cxp+p->nFrom, cxp, p->iCost);
1118 }
1119 updateCost(m, cx, cxu, pLang->iInsCost);
1120 if( matchFromTo(&f, i1, z2+i2, n2-i2) ) {
1121 updateCost(m, cx, cxd, 0);
1122 }
1123 updateCost(m, cx, cxd, pLang->iSubCost);
1124 for(k=0; k<f.a[i1].nSubst; k++) {
1125 p = f.a[i1].apSubst[k];
1126 if( matchTo(p, z2+i2, n2-i2) ) {
1127 updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost);
1128 }
1129 }
1130 }
1131 }
1132
1133#if 0 /* Enable for debugging */
1134 printf(" ^");
1135 for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff);
1136 printf("\n ^:");
1137 for(i1=0; i1<szRow; i1++) {
1138 int v = m[i1];
1139 if( v>9999 ) printf(" ****");
1140 else printf(" %4d", v);
1141 }
1142 printf("\n");
1143 for(i2=0; i2<n2; i2++) {
1144 printf("%c-%02x:", z2[i2], z2[i2]&0xff);
1145 for(i1=0; i1<szRow; i1++) {
1146 int v = m[(i2+1)*szRow+i1];
1147 if( v>9999 ) printf(" ****");
1148 else printf(" %4d", v);
1149 }
1150 printf("\n");
1151 }
1152#endif
1153
1154 /* Free memory allocations and return the result */
1155 res = (int)m[szRow*(n2+1)-1];
1156 n = n2;
1157 if( f.isPrefix ) {
1158 for(i2=1; i2<=n2; i2++) {
1159 int b = m[szRow*i2-1];
1160 if( b<=res ) {
1161 res = b;
1162 n = i2 - 1;
1163 }
1164 }
1165 }
1166 if( pnMatch ) {
1167 int nExtra = 0;
1168 for(k=0; k<n; k++) {
1169 if( (z2[k] & 0xc0)==0x80 ) nExtra++;
1170 }
1171 *pnMatch = n - nExtra;
1172 }
1173
1174editDist3Abort:
1175 for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
1176 sqlite3_free(pToFree);
1177 return res;
1178}
1179
1180/*
1181** Get an appropriate EditDist3Lang object.
1182*/
1183static const EditDist3Lang *editDist3FindLang(
1184 EditDist3Config *pConfig,
1185 int iLang
1186)
1187{
1188 int i;
1189 for(i=0; i<pConfig->nLang; i++) {
1190 if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i];
1191 }
1192 return &editDist3Lang;
1193}
1194
1195/*
1196** Function: editdist3(A,B,iLang)
1197** editdist3(tablename)
1198**
1199** Return the cost of transforming string A into string B using edit
1200** weights for iLang.
1201**
1202** The second form loads edit weights into memory from a table.
1203*/
1204static void editDist3SqlFunc(
1205 sqlite3_context *context,
1206 int argc,
1207 sqlite3_value **argv
1208)
1209{
1210 EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context);
1211 sqlite3 *db = sqlite3_context_db_handle(context);
1212 int rc;
1213 if( argc==1 ) {
1214 const char *zTable = (const char*)sqlite3_value_text(argv[0]);
1215 rc = editDist3ConfigLoad(pConfig, db, zTable);
1216 if( rc ) sqlite3_result_error_code(context, rc);
1217 } else {
1218 const char *zA = (const char*)sqlite3_value_text(argv[0]);
1219 const char *zB = (const char*)sqlite3_value_text(argv[1]);
1220 int nA = sqlite3_value_bytes(argv[0]);
1221 int nB = sqlite3_value_bytes(argv[1]);
1222 int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0;
1223 const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang);
1224 EditDist3FromString *pFrom;
1225 int dist;
1226
1227 pFrom = editDist3FromStringNew(pLang, zA, nA);
1228 if( pFrom==0 ) {
1229 sqlite3_result_error_nomem(context);
1230 return;
1231 }
1232 dist = editDist3Core(pFrom, zB, nB, pLang, 0);
1233 editDist3FromStringDelete(pFrom);
1234 if( dist==(-1) ) {
1235 sqlite3_result_error_nomem(context);
1236 } else {
1237 sqlite3_result_int(context, dist);
1238 }
1239 }
1240}
1241
1242/*
1243** Register the editDist3 function with SQLite
1244*/
1245static int editDist3Install(sqlite3 *db)
1246{
1247 int rc;
1248 EditDist3Config *pConfig = sqlite3_malloc64( sizeof(*pConfig) );
1249 if( pConfig==0 ) return SQLITE_NOMEM;
1250 memset(pConfig, 0, sizeof(*pConfig));
1251 rc = sqlite3_create_function_v2(db, "editdist3",
1252 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1253 editDist3SqlFunc, 0, 0, 0);
1254 if( rc==SQLITE_OK ) {
1255 rc = sqlite3_create_function_v2(db, "editdist3",
1256 3, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1257 editDist3SqlFunc, 0, 0, 0);
1258 }
1259 if( rc==SQLITE_OK ) {
1260 rc = sqlite3_create_function_v2(db, "editdist3",
1261 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1262 editDist3SqlFunc, 0, 0, editDist3ConfigDelete);
1263 } else {
1264 sqlite3_free(pConfig);
1265 }
1266 return rc;
1267}
1268/* End configurable cost unicode edit distance routines
1269******************************************************************************
1270******************************************************************************
1271** Begin transliterate unicode-to-ascii implementation
1272*/
1273
#if !SQLITE_AMALGAMATION
/*
** This lookup table is used to help decode the first byte of
** a multi-byte UTF8 character.
**
** utf8Read() indexes the table with (first byte - 0xc0), so the 64
** entries correspond to lead bytes 0xc0 through 0xff.  Each entry is the
** lead byte with its length-marker bits removed:
**
**     0xc0..0xdf   low five bits  (0x00..0x1f)
**     0xe0..0xef   low four bits  (0x00..0x0f)
**     0xf0..0xf7   low three bits (0x00..0x07)
**     0xf8..0xfb   low two bits   (0x00..0x03)
**     0xfc..0xfd   low bit        (0x00..0x01)
**     0xfe..0xff   zero
**
** The table is compiled only when SQLITE_AMALGAMATION is undefined or
** zero (presumably the amalgamation build supplies its own copy).
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
#endif
1290
1291/*
1292** Return the value of the first UTF-8 character in the string.
1293*/
1294static int utf8Read(const unsigned char *z, int n, int *pSize)
1295{
1296 int c, i;
1297
1298 /* All callers to this routine (in the current implementation)
1299 ** always have n>0. */
1300 if( NEVER(n==0) ) {
1301 c = i = 0;
1302 } else {
1303 c = z[0];
1304 i = 1;
1305 if( c>=0xc0 ) {
1306 c = sqlite3Utf8Trans1[c-0xc0];
1307 while( i<n && (z[i] & 0xc0)==0x80 ) {
1308 c = (c<<6) + (0x3f & z[i++]);
1309 }
1310 }
1311 }
1312 *pSize = i;
1313 return c;
1314}
1315
/*
** Count the characters in the UTF-8 text held in the nIn bytes starting
** at zIn.  The buffer is walked one character at a time using utf8Read();
** each call advances by the size it reports and adds one to the count.
*/
static int utf8Charlen(const char *zIn, int nIn)
{
  const unsigned char *zBytes = (const unsigned char *)zIn;
  int nChar = 0;   /* Characters seen so far */
  int iOff = 0;    /* Byte offset of the next character */

  while( iOff<nIn ) {
    int nThis;
    (void)utf8Read(&zBytes[iOff], nIn-iOff, &nThis);
    iOff += nThis;
    nChar++;
  }
  return nChar;
}
1331
/*
** One row of the translit[] table: a unicode character and the ASCII
** bytes that replace it.  transliterate() always emits cTo0 and then
** each following cTo byte until the first one that is zero.
*/
typedef struct Transliteration Transliteration;
struct Transliteration {
  unsigned short int cFrom;              /* Unicode character to translate */
  unsigned char cTo0, cTo1, cTo2, cTo3;  /* Replacement bytes; 0x00 ends the list */
#ifdef SQLITE_SPELLFIX_5BYTE_MAPPINGS
  unsigned char cTo4;                    /* Optional fifth replacement byte */
#endif
};
1340
/*
** Table of translations from unicode characters into ASCII.
**
** Each row is { cFrom, cTo0, cTo1, cTo2, cTo3 }: the unicode character
** followed by up to four replacement bytes, with unused trailing bytes
** set to 0x00.
**
** The rows are listed in ascending cFrom order.  transliterate() and
** translen_to_charlen() find a row by binary search on cFrom, so any
** new row must be inserted at its sorted position.
*/
static const Transliteration translit[] = {
  { 0x00A0, 0x20, 0x00, 0x00, 0x00 }, /* (no-break space) to (space) */
  { 0x00B5, 0x75, 0x00, 0x00, 0x00 }, /* µ to u */
  { 0x00C0, 0x41, 0x00, 0x00, 0x00 }, /* À to A */
  { 0x00C1, 0x41, 0x00, 0x00, 0x00 }, /* Á to A */
  { 0x00C2, 0x41, 0x00, 0x00, 0x00 }, /* Â to A */
  { 0x00C3, 0x41, 0x00, 0x00, 0x00 }, /* Ã to A */
  { 0x00C4, 0x41, 0x65, 0x00, 0x00 }, /* Ä to Ae */
  { 0x00C5, 0x41, 0x61, 0x00, 0x00 }, /* Å to Aa */
  { 0x00C6, 0x41, 0x45, 0x00, 0x00 }, /* Æ to AE */
  { 0x00C7, 0x43, 0x00, 0x00, 0x00 }, /* Ç to C */
  { 0x00C8, 0x45, 0x00, 0x00, 0x00 }, /* È to E */
  { 0x00C9, 0x45, 0x00, 0x00, 0x00 }, /* É to E */
  { 0x00CA, 0x45, 0x00, 0x00, 0x00 }, /* Ê to E */
  { 0x00CB, 0x45, 0x00, 0x00, 0x00 }, /* Ë to E */
  { 0x00CC, 0x49, 0x00, 0x00, 0x00 }, /* Ì to I */
  { 0x00CD, 0x49, 0x00, 0x00, 0x00 }, /* Í to I */
  { 0x00CE, 0x49, 0x00, 0x00, 0x00 }, /* Î to I */
  { 0x00CF, 0x49, 0x00, 0x00, 0x00 }, /* Ï to I */
  { 0x00D0, 0x44, 0x00, 0x00, 0x00 }, /* Ð to D */
  { 0x00D1, 0x4E, 0x00, 0x00, 0x00 }, /* Ñ to N */
  { 0x00D2, 0x4F, 0x00, 0x00, 0x00 }, /* Ò to O */
  { 0x00D3, 0x4F, 0x00, 0x00, 0x00 }, /* Ó to O */
  { 0x00D4, 0x4F, 0x00, 0x00, 0x00 }, /* Ô to O */
  { 0x00D5, 0x4F, 0x00, 0x00, 0x00 }, /* Õ to O */
  { 0x00D6, 0x4F, 0x65, 0x00, 0x00 }, /* Ö to Oe */
  { 0x00D7, 0x78, 0x00, 0x00, 0x00 }, /* × to x */
  { 0x00D8, 0x4F, 0x00, 0x00, 0x00 }, /* Ø to O */
  { 0x00D9, 0x55, 0x00, 0x00, 0x00 }, /* Ù to U */
  { 0x00DA, 0x55, 0x00, 0x00, 0x00 }, /* Ú to U */
  { 0x00DB, 0x55, 0x00, 0x00, 0x00 }, /* Û to U */
  { 0x00DC, 0x55, 0x65, 0x00, 0x00 }, /* Ü to Ue */
  { 0x00DD, 0x59, 0x00, 0x00, 0x00 }, /* Ý to Y */
  { 0x00DE, 0x54, 0x68, 0x00, 0x00 }, /* Þ to Th */
  { 0x00DF, 0x73, 0x73, 0x00, 0x00 }, /* ß to ss */
  { 0x00E0, 0x61, 0x00, 0x00, 0x00 }, /* à to a */
  { 0x00E1, 0x61, 0x00, 0x00, 0x00 }, /* á to a */
  { 0x00E2, 0x61, 0x00, 0x00, 0x00 }, /* â to a */
  { 0x00E3, 0x61, 0x00, 0x00, 0x00 }, /* ã to a */
  { 0x00E4, 0x61, 0x65, 0x00, 0x00 }, /* ä to ae */
  { 0x00E5, 0x61, 0x61, 0x00, 0x00 }, /* å to aa */
  { 0x00E6, 0x61, 0x65, 0x00, 0x00 }, /* æ to ae */
  { 0x00E7, 0x63, 0x00, 0x00, 0x00 }, /* ç to c */
  { 0x00E8, 0x65, 0x00, 0x00, 0x00 }, /* è to e */
  { 0x00E9, 0x65, 0x00, 0x00, 0x00 }, /* é to e */
  { 0x00EA, 0x65, 0x00, 0x00, 0x00 }, /* ê to e */
  { 0x00EB, 0x65, 0x00, 0x00, 0x00 }, /* ë to e */
  { 0x00EC, 0x69, 0x00, 0x00, 0x00 }, /* ì to i */
  { 0x00ED, 0x69, 0x00, 0x00, 0x00 }, /* í to i */
  { 0x00EE, 0x69, 0x00, 0x00, 0x00 }, /* î to i */
  { 0x00EF, 0x69, 0x00, 0x00, 0x00 }, /* ï to i */
  { 0x00F0, 0x64, 0x00, 0x00, 0x00 }, /* ð to d */
  { 0x00F1, 0x6E, 0x00, 0x00, 0x00 }, /* ñ to n */
  { 0x00F2, 0x6F, 0x00, 0x00, 0x00 }, /* ò to o */
  { 0x00F3, 0x6F, 0x00, 0x00, 0x00 }, /* ó to o */
  { 0x00F4, 0x6F, 0x00, 0x00, 0x00 }, /* ô to o */
  { 0x00F5, 0x6F, 0x00, 0x00, 0x00 }, /* õ to o */
  { 0x00F6, 0x6F, 0x65, 0x00, 0x00 }, /* ö to oe */
  { 0x00F7, 0x3A, 0x00, 0x00, 0x00 }, /* ÷ to : */
  { 0x00F8, 0x6F, 0x00, 0x00, 0x00 }, /* ø to o */
  { 0x00F9, 0x75, 0x00, 0x00, 0x00 }, /* ù to u */
  { 0x00FA, 0x75, 0x00, 0x00, 0x00 }, /* ú to u */
  { 0x00FB, 0x75, 0x00, 0x00, 0x00 }, /* û to u */
  { 0x00FC, 0x75, 0x65, 0x00, 0x00 }, /* ü to ue */
  { 0x00FD, 0x79, 0x00, 0x00, 0x00 }, /* ý to y */
  { 0x00FE, 0x74, 0x68, 0x00, 0x00 }, /* þ to th */
  { 0x00FF, 0x79, 0x00, 0x00, 0x00 }, /* ÿ to y */
  { 0x0100, 0x41, 0x00, 0x00, 0x00 }, /* Ā to A */
  { 0x0101, 0x61, 0x00, 0x00, 0x00 }, /* ā to a */
  { 0x0102, 0x41, 0x00, 0x00, 0x00 }, /* Ă to A */
  { 0x0103, 0x61, 0x00, 0x00, 0x00 }, /* ă to a */
  { 0x0104, 0x41, 0x00, 0x00, 0x00 }, /* Ą to A */
  { 0x0105, 0x61, 0x00, 0x00, 0x00 }, /* ą to a */
  { 0x0106, 0x43, 0x00, 0x00, 0x00 }, /* Ć to C */
  { 0x0107, 0x63, 0x00, 0x00, 0x00 }, /* ć to c */
  { 0x0108, 0x43, 0x68, 0x00, 0x00 }, /* Ĉ to Ch */
  { 0x0109, 0x63, 0x68, 0x00, 0x00 }, /* ĉ to ch */
  { 0x010A, 0x43, 0x00, 0x00, 0x00 }, /* Ċ to C */
  { 0x010B, 0x63, 0x00, 0x00, 0x00 }, /* ċ to c */
  { 0x010C, 0x43, 0x00, 0x00, 0x00 }, /* Č to C */
  { 0x010D, 0x63, 0x00, 0x00, 0x00 }, /* č to c */
  { 0x010E, 0x44, 0x00, 0x00, 0x00 }, /* Ď to D */
  { 0x010F, 0x64, 0x00, 0x00, 0x00 }, /* ď to d */
  { 0x0110, 0x44, 0x00, 0x00, 0x00 }, /* Đ to D */
  { 0x0111, 0x64, 0x00, 0x00, 0x00 }, /* đ to d */
  { 0x0112, 0x45, 0x00, 0x00, 0x00 }, /* Ē to E */
  { 0x0113, 0x65, 0x00, 0x00, 0x00 }, /* ē to e */
  { 0x0114, 0x45, 0x00, 0x00, 0x00 }, /* Ĕ to E */
  { 0x0115, 0x65, 0x00, 0x00, 0x00 }, /* ĕ to e */
  { 0x0116, 0x45, 0x00, 0x00, 0x00 }, /* Ė to E */
  { 0x0117, 0x65, 0x00, 0x00, 0x00 }, /* ė to e */
  { 0x0118, 0x45, 0x00, 0x00, 0x00 }, /* Ę to E */
  { 0x0119, 0x65, 0x00, 0x00, 0x00 }, /* ę to e */
  { 0x011A, 0x45, 0x00, 0x00, 0x00 }, /* Ě to E */
  { 0x011B, 0x65, 0x00, 0x00, 0x00 }, /* ě to e */
  { 0x011C, 0x47, 0x68, 0x00, 0x00 }, /* Ĝ to Gh */
  { 0x011D, 0x67, 0x68, 0x00, 0x00 }, /* ĝ to gh */
  { 0x011E, 0x47, 0x00, 0x00, 0x00 }, /* Ğ to G */
  { 0x011F, 0x67, 0x00, 0x00, 0x00 }, /* ğ to g */
  { 0x0120, 0x47, 0x00, 0x00, 0x00 }, /* Ġ to G */
  { 0x0121, 0x67, 0x00, 0x00, 0x00 }, /* ġ to g */
  { 0x0122, 0x47, 0x00, 0x00, 0x00 }, /* Ģ to G */
  { 0x0123, 0x67, 0x00, 0x00, 0x00 }, /* ģ to g */
  { 0x0124, 0x48, 0x68, 0x00, 0x00 }, /* Ĥ to Hh */
  { 0x0125, 0x68, 0x68, 0x00, 0x00 }, /* ĥ to hh */
  { 0x0126, 0x48, 0x00, 0x00, 0x00 }, /* Ħ to H */
  { 0x0127, 0x68, 0x00, 0x00, 0x00 }, /* ħ to h */
  { 0x0128, 0x49, 0x00, 0x00, 0x00 }, /* Ĩ to I */
  { 0x0129, 0x69, 0x00, 0x00, 0x00 }, /* ĩ to i */
  { 0x012A, 0x49, 0x00, 0x00, 0x00 }, /* Ī to I */
  { 0x012B, 0x69, 0x00, 0x00, 0x00 }, /* ī to i */
  { 0x012C, 0x49, 0x00, 0x00, 0x00 }, /* Ĭ to I */
  { 0x012D, 0x69, 0x00, 0x00, 0x00 }, /* ĭ to i */
  { 0x012E, 0x49, 0x00, 0x00, 0x00 }, /* Į to I */
  { 0x012F, 0x69, 0x00, 0x00, 0x00 }, /* į to i */
  { 0x0130, 0x49, 0x00, 0x00, 0x00 }, /* İ to I */
  { 0x0131, 0x69, 0x00, 0x00, 0x00 }, /* ı to i */
  { 0x0132, 0x49, 0x4A, 0x00, 0x00 }, /* Ĳ to IJ */
  { 0x0133, 0x69, 0x6A, 0x00, 0x00 }, /* ĳ to ij */
  { 0x0134, 0x4A, 0x68, 0x00, 0x00 }, /* Ĵ to Jh */
  { 0x0135, 0x6A, 0x68, 0x00, 0x00 }, /* ĵ to jh */
  { 0x0136, 0x4B, 0x00, 0x00, 0x00 }, /* Ķ to K */
  { 0x0137, 0x6B, 0x00, 0x00, 0x00 }, /* ķ to k */
  { 0x0138, 0x6B, 0x00, 0x00, 0x00 }, /* ĸ to k */
  { 0x0139, 0x4C, 0x00, 0x00, 0x00 }, /* Ĺ to L */
  { 0x013A, 0x6C, 0x00, 0x00, 0x00 }, /* ĺ to l */
  { 0x013B, 0x4C, 0x00, 0x00, 0x00 }, /* Ļ to L */
  { 0x013C, 0x6C, 0x00, 0x00, 0x00 }, /* ļ to l */
  { 0x013D, 0x4C, 0x00, 0x00, 0x00 }, /* Ľ to L */
  { 0x013E, 0x6C, 0x00, 0x00, 0x00 }, /* ľ to l */
  { 0x013F, 0x4C, 0x2E, 0x00, 0x00 }, /* Ŀ to L. */
  { 0x0140, 0x6C, 0x2E, 0x00, 0x00 }, /* ŀ to l. */
  { 0x0141, 0x4C, 0x00, 0x00, 0x00 }, /* Ł to L */
  { 0x0142, 0x6C, 0x00, 0x00, 0x00 }, /* ł to l */
  { 0x0143, 0x4E, 0x00, 0x00, 0x00 }, /* Ń to N */
  { 0x0144, 0x6E, 0x00, 0x00, 0x00 }, /* ń to n */
  { 0x0145, 0x4E, 0x00, 0x00, 0x00 }, /* Ņ to N */
  { 0x0146, 0x6E, 0x00, 0x00, 0x00 }, /* ņ to n */
  { 0x0147, 0x4E, 0x00, 0x00, 0x00 }, /* Ň to N */
  { 0x0148, 0x6E, 0x00, 0x00, 0x00 }, /* ň to n */
  { 0x0149, 0x27, 0x6E, 0x00, 0x00 }, /* ŉ to 'n */
  { 0x014A, 0x4E, 0x47, 0x00, 0x00 }, /* Ŋ to NG */
  { 0x014B, 0x6E, 0x67, 0x00, 0x00 }, /* ŋ to ng */
  { 0x014C, 0x4F, 0x00, 0x00, 0x00 }, /* Ō to O */
  { 0x014D, 0x6F, 0x00, 0x00, 0x00 }, /* ō to o */
  { 0x014E, 0x4F, 0x00, 0x00, 0x00 }, /* Ŏ to O */
  { 0x014F, 0x6F, 0x00, 0x00, 0x00 }, /* ŏ to o */
  { 0x0150, 0x4F, 0x00, 0x00, 0x00 }, /* Ő to O */
  { 0x0151, 0x6F, 0x00, 0x00, 0x00 }, /* ő to o */
  { 0x0152, 0x4F, 0x45, 0x00, 0x00 }, /* Œ to OE */
  { 0x0153, 0x6F, 0x65, 0x00, 0x00 }, /* œ to oe */
  { 0x0154, 0x52, 0x00, 0x00, 0x00 }, /* Ŕ to R */
  { 0x0155, 0x72, 0x00, 0x00, 0x00 }, /* ŕ to r */
  { 0x0156, 0x52, 0x00, 0x00, 0x00 }, /* Ŗ to R */
  { 0x0157, 0x72, 0x00, 0x00, 0x00 }, /* ŗ to r */
  { 0x0158, 0x52, 0x00, 0x00, 0x00 }, /* Ř to R */
  { 0x0159, 0x72, 0x00, 0x00, 0x00 }, /* ř to r */
  { 0x015A, 0x53, 0x00, 0x00, 0x00 }, /* Ś to S */
  { 0x015B, 0x73, 0x00, 0x00, 0x00 }, /* ś to s */
  { 0x015C, 0x53, 0x68, 0x00, 0x00 }, /* Ŝ to Sh */
  { 0x015D, 0x73, 0x68, 0x00, 0x00 }, /* ŝ to sh */
  { 0x015E, 0x53, 0x00, 0x00, 0x00 }, /* Ş to S */
  { 0x015F, 0x73, 0x00, 0x00, 0x00 }, /* ş to s */
  { 0x0160, 0x53, 0x00, 0x00, 0x00 }, /* Š to S */
  { 0x0161, 0x73, 0x00, 0x00, 0x00 }, /* š to s */
  { 0x0162, 0x54, 0x00, 0x00, 0x00 }, /* Ţ to T */
  { 0x0163, 0x74, 0x00, 0x00, 0x00 }, /* ţ to t */
  { 0x0164, 0x54, 0x00, 0x00, 0x00 }, /* Ť to T */
  { 0x0165, 0x74, 0x00, 0x00, 0x00 }, /* ť to t */
  { 0x0166, 0x54, 0x00, 0x00, 0x00 }, /* Ŧ to T */
  { 0x0167, 0x74, 0x00, 0x00, 0x00 }, /* ŧ to t */
  { 0x0168, 0x55, 0x00, 0x00, 0x00 }, /* Ũ to U */
  { 0x0169, 0x75, 0x00, 0x00, 0x00 }, /* ũ to u */
  { 0x016A, 0x55, 0x00, 0x00, 0x00 }, /* Ū to U */
  { 0x016B, 0x75, 0x00, 0x00, 0x00 }, /* ū to u */
  { 0x016C, 0x55, 0x00, 0x00, 0x00 }, /* Ŭ to U */
  { 0x016D, 0x75, 0x00, 0x00, 0x00 }, /* ŭ to u */
  { 0x016E, 0x55, 0x00, 0x00, 0x00 }, /* Ů to U */
  { 0x016F, 0x75, 0x00, 0x00, 0x00 }, /* ů to u */
  { 0x0170, 0x55, 0x00, 0x00, 0x00 }, /* Ű to U */
  { 0x0171, 0x75, 0x00, 0x00, 0x00 }, /* ű to u */
  { 0x0172, 0x55, 0x00, 0x00, 0x00 }, /* Ų to U */
  { 0x0173, 0x75, 0x00, 0x00, 0x00 }, /* ų to u */
  { 0x0174, 0x57, 0x00, 0x00, 0x00 }, /* Ŵ to W */
  { 0x0175, 0x77, 0x00, 0x00, 0x00 }, /* ŵ to w */
  { 0x0176, 0x59, 0x00, 0x00, 0x00 }, /* Ŷ to Y */
  { 0x0177, 0x79, 0x00, 0x00, 0x00 }, /* ŷ to y */
  { 0x0178, 0x59, 0x00, 0x00, 0x00 }, /* Ÿ to Y */
  { 0x0179, 0x5A, 0x00, 0x00, 0x00 }, /* Ź to Z */
  { 0x017A, 0x7A, 0x00, 0x00, 0x00 }, /* ź to z */
  { 0x017B, 0x5A, 0x00, 0x00, 0x00 }, /* Ż to Z */
  { 0x017C, 0x7A, 0x00, 0x00, 0x00 }, /* ż to z */
  { 0x017D, 0x5A, 0x00, 0x00, 0x00 }, /* Ž to Z */
  { 0x017E, 0x7A, 0x00, 0x00, 0x00 }, /* ž to z */
  { 0x017F, 0x73, 0x00, 0x00, 0x00 }, /* ſ to s */
  { 0x0192, 0x66, 0x00, 0x00, 0x00 }, /* ƒ to f */
  { 0x0218, 0x53, 0x00, 0x00, 0x00 }, /* Ș to S */
  { 0x0219, 0x73, 0x00, 0x00, 0x00 }, /* ș to s */
  { 0x021A, 0x54, 0x00, 0x00, 0x00 }, /* Ț to T */
  { 0x021B, 0x74, 0x00, 0x00, 0x00 }, /* ț to t */
  { 0x0386, 0x41, 0x00, 0x00, 0x00 }, /* Ά to A */
  { 0x0388, 0x45, 0x00, 0x00, 0x00 }, /* Έ to E */
  { 0x0389, 0x49, 0x00, 0x00, 0x00 }, /* Ή to I */
  { 0x038A, 0x49, 0x00, 0x00, 0x00 }, /* Ί to I */
  { 0x038C, 0x4f, 0x00, 0x00, 0x00 }, /* Ό to O */
  { 0x038E, 0x59, 0x00, 0x00, 0x00 }, /* Ύ to Y */
  { 0x038F, 0x4f, 0x00, 0x00, 0x00 }, /* Ώ to O */
  { 0x0390, 0x69, 0x00, 0x00, 0x00 }, /* ΐ to i */
  { 0x0391, 0x41, 0x00, 0x00, 0x00 }, /* Α to A */
  { 0x0392, 0x42, 0x00, 0x00, 0x00 }, /* Β to B */
  { 0x0393, 0x47, 0x00, 0x00, 0x00 }, /* Γ to G */
  { 0x0394, 0x44, 0x00, 0x00, 0x00 }, /* Δ to D */
  { 0x0395, 0x45, 0x00, 0x00, 0x00 }, /* Ε to E */
  { 0x0396, 0x5a, 0x00, 0x00, 0x00 }, /* Ζ to Z */
  { 0x0397, 0x49, 0x00, 0x00, 0x00 }, /* Η to I */
  { 0x0398, 0x54, 0x68, 0x00, 0x00 }, /* Θ to Th */
  { 0x0399, 0x49, 0x00, 0x00, 0x00 }, /* Ι to I */
  { 0x039A, 0x4b, 0x00, 0x00, 0x00 }, /* Κ to K */
  { 0x039B, 0x4c, 0x00, 0x00, 0x00 }, /* Λ to L */
  { 0x039C, 0x4d, 0x00, 0x00, 0x00 }, /* Μ to M */
  { 0x039D, 0x4e, 0x00, 0x00, 0x00 }, /* Ν to N */
  { 0x039E, 0x58, 0x00, 0x00, 0x00 }, /* Ξ to X */
  { 0x039F, 0x4f, 0x00, 0x00, 0x00 }, /* Ο to O */
  { 0x03A0, 0x50, 0x00, 0x00, 0x00 }, /* Π to P */
  { 0x03A1, 0x52, 0x00, 0x00, 0x00 }, /* Ρ to R */
  { 0x03A3, 0x53, 0x00, 0x00, 0x00 }, /* Σ to S */
  { 0x03A4, 0x54, 0x00, 0x00, 0x00 }, /* Τ to T */
  { 0x03A5, 0x59, 0x00, 0x00, 0x00 }, /* Υ to Y */
  { 0x03A6, 0x46, 0x00, 0x00, 0x00 }, /* Φ to F */
  { 0x03A7, 0x43, 0x68, 0x00, 0x00 }, /* Χ to Ch */
  { 0x03A8, 0x50, 0x73, 0x00, 0x00 }, /* Ψ to Ps */
  { 0x03A9, 0x4f, 0x00, 0x00, 0x00 }, /* Ω to O */
  { 0x03AA, 0x49, 0x00, 0x00, 0x00 }, /* Ϊ to I */
  { 0x03AB, 0x59, 0x00, 0x00, 0x00 }, /* Ϋ to Y */
  { 0x03AC, 0x61, 0x00, 0x00, 0x00 }, /* ά to a */
  { 0x03AD, 0x65, 0x00, 0x00, 0x00 }, /* έ to e */
  { 0x03AE, 0x69, 0x00, 0x00, 0x00 }, /* ή to i */
  { 0x03AF, 0x69, 0x00, 0x00, 0x00 }, /* ί to i */
  { 0x03B1, 0x61, 0x00, 0x00, 0x00 }, /* α to a */
  { 0x03B2, 0x62, 0x00, 0x00, 0x00 }, /* β to b */
  { 0x03B3, 0x67, 0x00, 0x00, 0x00 }, /* γ to g */
  { 0x03B4, 0x64, 0x00, 0x00, 0x00 }, /* δ to d */
  { 0x03B5, 0x65, 0x00, 0x00, 0x00 }, /* ε to e */
  { 0x03B6, 0x7a, 0x00, 0x00, 0x00 }, /* ζ to z */
  { 0x03B7, 0x69, 0x00, 0x00, 0x00 }, /* η to i */
  { 0x03B8, 0x74, 0x68, 0x00, 0x00 }, /* θ to th */
  { 0x03B9, 0x69, 0x00, 0x00, 0x00 }, /* ι to i */
  { 0x03BA, 0x6b, 0x00, 0x00, 0x00 }, /* κ to k */
  { 0x03BB, 0x6c, 0x00, 0x00, 0x00 }, /* λ to l */
  { 0x03BC, 0x6d, 0x00, 0x00, 0x00 }, /* μ to m */
  { 0x03BD, 0x6e, 0x00, 0x00, 0x00 }, /* ν to n */
  { 0x03BE, 0x78, 0x00, 0x00, 0x00 }, /* ξ to x */
  { 0x03BF, 0x6f, 0x00, 0x00, 0x00 }, /* ο to o */
  { 0x03C0, 0x70, 0x00, 0x00, 0x00 }, /* π to p */
  { 0x03C1, 0x72, 0x00, 0x00, 0x00 }, /* ρ to r */
  { 0x03C3, 0x73, 0x00, 0x00, 0x00 }, /* σ to s */
  { 0x03C4, 0x74, 0x00, 0x00, 0x00 }, /* τ to t */
  { 0x03C5, 0x79, 0x00, 0x00, 0x00 }, /* υ to y */
  { 0x03C6, 0x66, 0x00, 0x00, 0x00 }, /* φ to f */
  { 0x03C7, 0x63, 0x68, 0x00, 0x00 }, /* χ to ch */
  { 0x03C8, 0x70, 0x73, 0x00, 0x00 }, /* ψ to ps */
  { 0x03C9, 0x6f, 0x00, 0x00, 0x00 }, /* ω to o */
  { 0x03CA, 0x69, 0x00, 0x00, 0x00 }, /* ϊ to i */
  { 0x03CB, 0x79, 0x00, 0x00, 0x00 }, /* ϋ to y */
  { 0x03CC, 0x6f, 0x00, 0x00, 0x00 }, /* ό to o */
  { 0x03CD, 0x79, 0x00, 0x00, 0x00 }, /* ύ to y */
  { 0x03CE, 0x69, 0x00, 0x00, 0x00 }, /* ώ to i */
  { 0x0400, 0x45, 0x00, 0x00, 0x00 }, /* Ѐ to E */
  { 0x0401, 0x45, 0x00, 0x00, 0x00 }, /* Ё to E */
  { 0x0402, 0x44, 0x00, 0x00, 0x00 }, /* Ђ to D */
  { 0x0403, 0x47, 0x00, 0x00, 0x00 }, /* Ѓ to G */
  { 0x0404, 0x45, 0x00, 0x00, 0x00 }, /* Є to E */
  { 0x0405, 0x5a, 0x00, 0x00, 0x00 }, /* Ѕ to Z */
  { 0x0406, 0x49, 0x00, 0x00, 0x00 }, /* І to I */
  { 0x0407, 0x49, 0x00, 0x00, 0x00 }, /* Ї to I */
  { 0x0408, 0x4a, 0x00, 0x00, 0x00 }, /* Ј to J */
  { 0x0409, 0x49, 0x00, 0x00, 0x00 }, /* Љ to I */
  { 0x040A, 0x4e, 0x00, 0x00, 0x00 }, /* Њ to N */
  { 0x040B, 0x44, 0x00, 0x00, 0x00 }, /* Ћ to D */
  { 0x040C, 0x4b, 0x00, 0x00, 0x00 }, /* Ќ to K */
  { 0x040D, 0x49, 0x00, 0x00, 0x00 }, /* Ѝ to I */
  { 0x040E, 0x55, 0x00, 0x00, 0x00 }, /* Ў to U */
  { 0x040F, 0x44, 0x00, 0x00, 0x00 }, /* Џ to D */
  { 0x0410, 0x41, 0x00, 0x00, 0x00 }, /* А to A */
  { 0x0411, 0x42, 0x00, 0x00, 0x00 }, /* Б to B */
  { 0x0412, 0x56, 0x00, 0x00, 0x00 }, /* В to V */
  { 0x0413, 0x47, 0x00, 0x00, 0x00 }, /* Г to G */
  { 0x0414, 0x44, 0x00, 0x00, 0x00 }, /* Д to D */
  { 0x0415, 0x45, 0x00, 0x00, 0x00 }, /* Е to E */
  { 0x0416, 0x5a, 0x68, 0x00, 0x00 }, /* Ж to Zh */
  { 0x0417, 0x5a, 0x00, 0x00, 0x00 }, /* З to Z */
  { 0x0418, 0x49, 0x00, 0x00, 0x00 }, /* И to I */
  { 0x0419, 0x49, 0x00, 0x00, 0x00 }, /* Й to I */
  { 0x041A, 0x4b, 0x00, 0x00, 0x00 }, /* К to K */
  { 0x041B, 0x4c, 0x00, 0x00, 0x00 }, /* Л to L */
  { 0x041C, 0x4d, 0x00, 0x00, 0x00 }, /* М to M */
  { 0x041D, 0x4e, 0x00, 0x00, 0x00 }, /* Н to N */
  { 0x041E, 0x4f, 0x00, 0x00, 0x00 }, /* О to O */
  { 0x041F, 0x50, 0x00, 0x00, 0x00 }, /* П to P */
  { 0x0420, 0x52, 0x00, 0x00, 0x00 }, /* Р to R */
  { 0x0421, 0x53, 0x00, 0x00, 0x00 }, /* С to S */
  { 0x0422, 0x54, 0x00, 0x00, 0x00 }, /* Т to T */
  { 0x0423, 0x55, 0x00, 0x00, 0x00 }, /* У to U */
  { 0x0424, 0x46, 0x00, 0x00, 0x00 }, /* Ф to F */
  { 0x0425, 0x4b, 0x68, 0x00, 0x00 }, /* Х to Kh */
  { 0x0426, 0x54, 0x63, 0x00, 0x00 }, /* Ц to Tc */
  { 0x0427, 0x43, 0x68, 0x00, 0x00 }, /* Ч to Ch */
  { 0x0428, 0x53, 0x68, 0x00, 0x00 }, /* Ш to Sh */
  { 0x0429, 0x53, 0x68, 0x63, 0x68 }, /* Щ to Shch */
  { 0x042A, 0x61, 0x00, 0x00, 0x00 }, /* Ъ to a (sic: lowercase) */
  { 0x042B, 0x59, 0x00, 0x00, 0x00 }, /* Ы to Y */
  { 0x042C, 0x59, 0x00, 0x00, 0x00 }, /* Ь to Y */
  { 0x042D, 0x45, 0x00, 0x00, 0x00 }, /* Э to E */
  { 0x042E, 0x49, 0x75, 0x00, 0x00 }, /* Ю to Iu */
  { 0x042F, 0x49, 0x61, 0x00, 0x00 }, /* Я to Ia */
  { 0x0430, 0x61, 0x00, 0x00, 0x00 }, /* а to a */
  { 0x0431, 0x62, 0x00, 0x00, 0x00 }, /* б to b */
  { 0x0432, 0x76, 0x00, 0x00, 0x00 }, /* в to v */
  { 0x0433, 0x67, 0x00, 0x00, 0x00 }, /* г to g */
  { 0x0434, 0x64, 0x00, 0x00, 0x00 }, /* д to d */
  { 0x0435, 0x65, 0x00, 0x00, 0x00 }, /* е to e */
  { 0x0436, 0x7a, 0x68, 0x00, 0x00 }, /* ж to zh */
  { 0x0437, 0x7a, 0x00, 0x00, 0x00 }, /* з to z */
  { 0x0438, 0x69, 0x00, 0x00, 0x00 }, /* и to i */
  { 0x0439, 0x69, 0x00, 0x00, 0x00 }, /* й to i */
  { 0x043A, 0x6b, 0x00, 0x00, 0x00 }, /* к to k */
  { 0x043B, 0x6c, 0x00, 0x00, 0x00 }, /* л to l */
  { 0x043C, 0x6d, 0x00, 0x00, 0x00 }, /* м to m */
  { 0x043D, 0x6e, 0x00, 0x00, 0x00 }, /* н to n */
  { 0x043E, 0x6f, 0x00, 0x00, 0x00 }, /* о to o */
  { 0x043F, 0x70, 0x00, 0x00, 0x00 }, /* п to p */
  { 0x0440, 0x72, 0x00, 0x00, 0x00 }, /* р to r */
  { 0x0441, 0x73, 0x00, 0x00, 0x00 }, /* с to s */
  { 0x0442, 0x74, 0x00, 0x00, 0x00 }, /* т to t */
  { 0x0443, 0x75, 0x00, 0x00, 0x00 }, /* у to u */
  { 0x0444, 0x66, 0x00, 0x00, 0x00 }, /* ф to f */
  { 0x0445, 0x6b, 0x68, 0x00, 0x00 }, /* х to kh */
  { 0x0446, 0x74, 0x63, 0x00, 0x00 }, /* ц to tc */
  { 0x0447, 0x63, 0x68, 0x00, 0x00 }, /* ч to ch */
  { 0x0448, 0x73, 0x68, 0x00, 0x00 }, /* ш to sh */
  { 0x0449, 0x73, 0x68, 0x63, 0x68 }, /* щ to shch */
  { 0x044A, 0x61, 0x00, 0x00, 0x00 }, /* ъ to a */
  { 0x044B, 0x79, 0x00, 0x00, 0x00 }, /* ы to y */
  { 0x044C, 0x79, 0x00, 0x00, 0x00 }, /* ь to y */
  { 0x044D, 0x65, 0x00, 0x00, 0x00 }, /* э to e */
  { 0x044E, 0x69, 0x75, 0x00, 0x00 }, /* ю to iu */
  { 0x044F, 0x69, 0x61, 0x00, 0x00 }, /* я to ia */
  { 0x0450, 0x65, 0x00, 0x00, 0x00 }, /* ѐ to e */
  { 0x0451, 0x65, 0x00, 0x00, 0x00 }, /* ё to e */
  { 0x0452, 0x64, 0x00, 0x00, 0x00 }, /* ђ to d */
  { 0x0453, 0x67, 0x00, 0x00, 0x00 }, /* ѓ to g */
  { 0x0454, 0x65, 0x00, 0x00, 0x00 }, /* є to e */
  { 0x0455, 0x7a, 0x00, 0x00, 0x00 }, /* ѕ to z */
  { 0x0456, 0x69, 0x00, 0x00, 0x00 }, /* і to i */
  { 0x0457, 0x69, 0x00, 0x00, 0x00 }, /* ї to i */
  { 0x0458, 0x6a, 0x00, 0x00, 0x00 }, /* ј to j */
  { 0x0459, 0x69, 0x00, 0x00, 0x00 }, /* љ to i */
  { 0x045A, 0x6e, 0x00, 0x00, 0x00 }, /* њ to n */
  { 0x045B, 0x64, 0x00, 0x00, 0x00 }, /* ћ to d */
  { 0x045C, 0x6b, 0x00, 0x00, 0x00 }, /* ќ to k */
  { 0x045D, 0x69, 0x00, 0x00, 0x00 }, /* ѝ to i */
  { 0x045E, 0x75, 0x00, 0x00, 0x00 }, /* ў to u */
  { 0x045F, 0x64, 0x00, 0x00, 0x00 }, /* џ to d */
  { 0x1E02, 0x42, 0x00, 0x00, 0x00 }, /* Ḃ to B */
  { 0x1E03, 0x62, 0x00, 0x00, 0x00 }, /* ḃ to b */
  { 0x1E0A, 0x44, 0x00, 0x00, 0x00 }, /* Ḋ to D */
  { 0x1E0B, 0x64, 0x00, 0x00, 0x00 }, /* ḋ to d */
  { 0x1E1E, 0x46, 0x00, 0x00, 0x00 }, /* Ḟ to F */
  { 0x1E1F, 0x66, 0x00, 0x00, 0x00 }, /* ḟ to f */
  { 0x1E40, 0x4D, 0x00, 0x00, 0x00 }, /* Ṁ to M */
  { 0x1E41, 0x6D, 0x00, 0x00, 0x00 }, /* ṁ to m */
  { 0x1E56, 0x50, 0x00, 0x00, 0x00 }, /* Ṗ to P */
  { 0x1E57, 0x70, 0x00, 0x00, 0x00 }, /* ṗ to p */
  { 0x1E60, 0x53, 0x00, 0x00, 0x00 }, /* Ṡ to S */
  { 0x1E61, 0x73, 0x00, 0x00, 0x00 }, /* ṡ to s */
  { 0x1E6A, 0x54, 0x00, 0x00, 0x00 }, /* Ṫ to T */
  { 0x1E6B, 0x74, 0x00, 0x00, 0x00 }, /* ṫ to t */
  { 0x1E80, 0x57, 0x00, 0x00, 0x00 }, /* Ẁ to W */
  { 0x1E81, 0x77, 0x00, 0x00, 0x00 }, /* ẁ to w */
  { 0x1E82, 0x57, 0x00, 0x00, 0x00 }, /* Ẃ to W */
  { 0x1E83, 0x77, 0x00, 0x00, 0x00 }, /* ẃ to w */
  { 0x1E84, 0x57, 0x00, 0x00, 0x00 }, /* Ẅ to W */
  { 0x1E85, 0x77, 0x00, 0x00, 0x00 }, /* ẅ to w */
  { 0x1EF2, 0x59, 0x00, 0x00, 0x00 }, /* Ỳ to Y */
  { 0x1EF3, 0x79, 0x00, 0x00, 0x00 }, /* ỳ to y */
  { 0xFB00, 0x66, 0x66, 0x00, 0x00 }, /* ﬀ to ff */
  { 0xFB01, 0x66, 0x69, 0x00, 0x00 }, /* ﬁ to fi */
  { 0xFB02, 0x66, 0x6C, 0x00, 0x00 }, /* ﬂ to fl */
  { 0xFB05, 0x73, 0x74, 0x00, 0x00 }, /* ﬅ to st */
  { 0xFB06, 0x73, 0x74, 0x00, 0x00 }, /* ﬆ to st */
};
1735
1736static const Transliteration *spellfixFindTranslit(int c, int *pxTop)
1737{
1738 *pxTop = (sizeof(translit)/sizeof(translit[0])) - 1;
1739 return translit;
1740}
1741
1742/*
1743** Convert the input string from UTF-8 into pure ASCII by converting
1744** all non-ASCII characters to some combination of characters in the
1745** ASCII subset.
1746**
1747** The returned string might contain more characters than the input.
1748**
1749** Space to hold the returned string comes from sqlite3_malloc() and
1750** should be freed by the caller.
1751*/
1752static unsigned char *transliterate(const unsigned char *zIn, int nIn)
1753{
1754#ifdef SQLITE_SPELLFIX_5BYTE_MAPPINGS
1755 unsigned char *zOut = sqlite3_malloc64( nIn*5 + 1 );
1756#else
1757 unsigned char *zOut = sqlite3_malloc64( nIn*4 + 1 );
1758#endif
1759 int c, sz, nOut;
1760 if( zOut==0 ) return 0;
1761 nOut = 0;
1762 while( nIn>0 ) {
1763 c = utf8Read(zIn, nIn, &sz);
1764 zIn += sz;
1765 nIn -= sz;
1766 if( c<=127 ) {
1767 zOut[nOut++] = (unsigned char)c;
1768 } else {
1769 int xTop, xBtm, x;
1770 const Transliteration *tbl = spellfixFindTranslit(c, &xTop);
1771 xBtm = 0;
1772 while( xTop>=xBtm ) {
1773 x = (xTop + xBtm)/2;
1774 if( tbl[x].cFrom==c ) {
1775 zOut[nOut++] = tbl[x].cTo0;
1776 if( tbl[x].cTo1 ) {
1777 zOut[nOut++] = tbl[x].cTo1;
1778 if( tbl[x].cTo2 ) {
1779 zOut[nOut++] = tbl[x].cTo2;
1780 if( tbl[x].cTo3 ) {
1781 zOut[nOut++] = tbl[x].cTo3;
1782#ifdef SQLITE_SPELLFIX_5BYTE_MAPPINGS
1783 if( tbl[x].cTo4 ) {
1784 zOut[nOut++] = tbl[x].cTo4;
1785 }
1786#endif /* SQLITE_SPELLFIX_5BYTE_MAPPINGS */
1787 }
1788 }
1789 }
1790 c = 0;
1791 break;
1792 } else if( tbl[x].cFrom>c ) {
1793 xTop = x-1;
1794 } else {
1795 xBtm = x+1;
1796 }
1797 }
1798 if( c ) zOut[nOut++] = '?';
1799 }
1800 }
1801 zOut[nOut] = 0;
1802 return zOut;
1803}
1804
1805/*
1806** Return the number of characters in the shortest prefix of the input
1807** string that transliterates to an ASCII string nTrans bytes or longer.
1808** Or, if the transliteration of the input string is less than nTrans
1809** bytes in size, return the number of characters in the input string.
1810*/
1811static int translen_to_charlen(const char *zIn, int nIn, int nTrans)
1812{
1813 int i, c, sz, nOut;
1814 int nChar;
1815
1816 i = nOut = 0;
1817 for(nChar=0; i<nIn && nOut<nTrans; nChar++) {
1818 c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1819 i += sz;
1820
1821 nOut++;
1822 if( c>=128 ) {
1823 int xTop, xBtm, x;
1824 const Transliteration *tbl = spellfixFindTranslit(c, &xTop);
1825 xBtm = 0;
1826 while( xTop>=xBtm ) {
1827 x = (xTop + xBtm)/2;
1828 if( tbl[x].cFrom==c ) {
1829 if( tbl[x].cTo1 ) {
1830 nOut++;
1831 if( tbl[x].cTo2 ) {
1832 nOut++;
1833 if( tbl[x].cTo3 ) {
1834 nOut++;
1835 }
1836 }
1837 }
1838 break;
1839 } else if( tbl[x].cFrom>c ) {
1840 xTop = x-1;
1841 } else {
1842 xBtm = x+1;
1843 }
1844 }
1845 }
1846 }
1847
1848 return nChar;
1849}
1850
1851/*
1852** spellfix1_translit(X)
1853**
1854** Convert a string that contains non-ASCII Roman characters into
1855** pure ASCII.
1856*/
1857static void transliterateSqlFunc(
1858 sqlite3_context *context,
1859 int argc,
1860 sqlite3_value **argv
1861)
1862{
1863 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1864 int nIn = sqlite3_value_bytes(argv[0]);
1865 unsigned char *zOut = transliterate(zIn, nIn);
1866 if( zOut==0 ) {
1867 sqlite3_result_error_nomem(context);
1868 } else {
1869 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
1870 }
1871}
1872
1873/*
1874** spellfix1_scriptcode(X)
1875**
1876** Try to determine the dominant script used by the word X and return
1877** its ISO 15924 numeric code.
1878**
1879** The current implementation only understands the following scripts:
1880**
1881** 215 (Latin)
1882** 220 (Cyrillic)
1883** 200 (Greek)
1884**
1885** This routine will return 998 if the input X contains characters from
1886** two or more of the above scripts or 999 if X contains no characters
1887** from any of the above scripts.
1888*/
1889static void scriptCodeSqlFunc(
1890 sqlite3_context *context,
1891 int argc,
1892 sqlite3_value **argv
1893)
1894{
1895 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1896 int nIn = sqlite3_value_bytes(argv[0]);
1897 int c, sz;
1898 int scriptMask = 0;
1899 int res;
1900 int seenDigit = 0;
1901# define SCRIPT_LATIN 0x0001
1902# define SCRIPT_CYRILLIC 0x0002
1903# define SCRIPT_GREEK 0x0004
1904# define SCRIPT_HEBREW 0x0008
1905# define SCRIPT_ARABIC 0x0010
1906
1907 while( nIn>0 ) {
1908 c = utf8Read(zIn, nIn, &sz);
1909 zIn += sz;
1910 nIn -= sz;
1911 if( c<0x02af ) {
1912 if( c>=0x80 || midClass[c&0x7f]<CCLASS_DIGIT ) {
1913 scriptMask |= SCRIPT_LATIN;
1914 } else if( c>='0' && c<='9' ) {
1915 seenDigit = 1;
1916 }
1917 } else if( c>=0x0400 && c<=0x04ff ) {
1918 scriptMask |= SCRIPT_CYRILLIC;
1919 } else if( c>=0x0386 && c<=0x03ce ) {
1920 scriptMask |= SCRIPT_GREEK;
1921 } else if( c>=0x0590 && c<=0x05ff ) {
1922 scriptMask |= SCRIPT_HEBREW;
1923 } else if( c>=0x0600 && c<=0x06ff ) {
1924 scriptMask |= SCRIPT_ARABIC;
1925 }
1926 }
1927 if( scriptMask==0 && seenDigit ) scriptMask = SCRIPT_LATIN;
1928 switch( scriptMask ) {
1929 case 0:
1930 res = 999;
1931 break;
1932 case SCRIPT_LATIN:
1933 res = 215;
1934 break;
1935 case SCRIPT_CYRILLIC:
1936 res = 220;
1937 break;
1938 case SCRIPT_GREEK:
1939 res = 200;
1940 break;
1941 case SCRIPT_HEBREW:
1942 res = 125;
1943 break;
1944 case SCRIPT_ARABIC:
1945 res = 160;
1946 break;
1947 default:
1948 res = 998;
1949 break;
1950 }
1951 sqlite3_result_int(context, res);
1952}
1953
1954/* End transliterate
1955******************************************************************************
1956******************************************************************************
1957** Begin spellfix1 virtual table.
1958*/
1959
/*
** Size in bytes of every buffer that holds a phonehash used for querying
** the shadow table.  Hashes are truncated to SPELLFIX_MX_HASH-2 characters
** so that one extra character and the zero terminator still fit.
*/
#define SPELLFIX_MX_HASH  32

/*
** Maximum number of distinct hash strings examined per query.  This is
** also the number of slots in the MatchQuery.azPrior[] array.
*/
#define SPELLFIX_MX_RUN   1
1965
1966typedef struct spellfix1_vtab spellfix1_vtab;
1967typedef struct spellfix1_cursor spellfix1_cursor;
1968
1969/* Fuzzy-search virtual table object */
1970struct spellfix1_vtab {
1971 sqlite3_vtab base; /* Base class - must be first */
1972 sqlite3 *db; /* Database connection */
1973 char *zDbName; /* Name of database holding this table */
1974 char *zTableName; /* Name of the virtual table */
1975 char *zCostTable; /* Table holding edit-distance cost numbers */
1976 EditDist3Config *pConfig3; /* Parsed edit distance costs */
1977};
1978
1979/* Fuzzy-search cursor object */
1980struct spellfix1_cursor {
1981 sqlite3_vtab_cursor base; /* Base class - must be first */
1982 spellfix1_vtab *pVTab; /* The table to which this cursor belongs */
1983 char *zPattern; /* rhs of MATCH clause */
1984 int idxNum; /* idxNum value passed to xFilter() */
1985 int nRow; /* Number of rows of content */
1986 int nAlloc; /* Number of allocated rows */
1987 int iRow; /* Current row of content */
1988 int iLang; /* Value of the langid= constraint */
1989 int iTop; /* Value of the top= constraint */
1990 int iScope; /* Value of the scope= constraint */
1991 int nSearch; /* Number of vocabulary items checked */
1992 sqlite3_stmt *pFullScan; /* Shadow query for a full table scan */
1993 struct spellfix1_row { /* For each row of content */
1994 sqlite3_int64 iRowid; /* Rowid for this row */
1995 char *zWord; /* Text for this row */
1996 int iRank; /* Rank for this row */
1997 int iDistance; /* Distance from pattern for this row */
1998 int iScore; /* Score for sorting */
1999 int iMatchlen; /* Value of matchlen column (or -1) */
2000 char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
2001 } *a;
2002};
2003
2004/*
2005** Construct one or more SQL statements from the format string given
2006** and then evaluate those statements. The success code is written
2007** into *pRc.
2008**
2009** If *pRc is initially non-zero then this routine is a no-op.
2010*/
2011static void spellfix1DbExec(
2012 int *pRc, /* Success code */
2013 sqlite3 *db, /* Database in which to run SQL */
2014 const char *zFormat, /* Format string for SQL */
2015 ... /* Arguments to the format string */
2016)
2017{
2018 va_list ap;
2019 char *zSql;
2020 if( *pRc ) return;
2021 va_start(ap, zFormat);
2022 zSql = sqlite3_vmprintf(zFormat, ap);
2023 va_end(ap);
2024 if( zSql==0 ) {
2025 *pRc = SQLITE_NOMEM;
2026 } else {
2027 *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
2028 sqlite3_free(zSql);
2029 }
2030}
2031
2032/*
2033** xDisconnect/xDestroy method for the fuzzy-search module.
2034*/
2035static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab)
2036{
2037 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2038 int rc = SQLITE_OK;
2039 if( isDestroy ) {
2040 sqlite3 *db = p->db;
2041 spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
2042 p->zDbName, p->zTableName);
2043 }
2044 if( rc==SQLITE_OK ) {
2045 sqlite3_free(p->zTableName);
2046 editDist3ConfigDelete(p->pConfig3);
2047 sqlite3_free(p->zCostTable);
2048 sqlite3_free(p);
2049 }
2050 return rc;
2051}
2052static int spellfix1Disconnect(sqlite3_vtab *pVTab)
2053{
2054 return spellfix1Uninit(0, pVTab);
2055}
2056static int spellfix1Destroy(sqlite3_vtab *pVTab)
2057{
2058 return spellfix1Uninit(1, pVTab);
2059}
2060
/*
** Return a copy of zIn with leading whitespace removed and, if the first
** remaining character is a single or double quote, with that quoting
** undone: the opening quote is dropped, each doubled quote character
** becomes a single one, and the text ends at the closing quote.
**
** Trailing whitespace is not trimmed; the parser never leaves spaces at
** the end of an argument.
**
** The copy is obtained from sqlite3_mprintf() and must be released by the
** caller.  NULL is returned if the allocation fails.
*/
static char *spellfix1Dequote(const char *zIn){
  char *zOut;
  char cQuote;
  int iRead;
  int iWrite;

  while( isspace((unsigned char)zIn[0]) ) zIn++;
  zOut = sqlite3_mprintf("%s", zIn);
  if( zOut==0 ) return 0;

  cQuote = zOut[0];
  if( cQuote!='\'' && cQuote!='"' ) return zOut;

  /* Compact the string in place.  iWrite always trails iRead, so no
  ** unread input is overwritten. */
  iWrite = 0;
  for(iRead=1; ALWAYS(zOut[iRead]); iRead++){
    zOut[iWrite++] = zOut[iRead];
    if( zOut[iRead]!=cQuote ) continue;
    if( zOut[iRead+1]==cQuote ){
      iRead++;                    /* Doubled quote: keep one, skip one */
    }else{
      zOut[iWrite-1] = 0;         /* Closing quote: terminate here */
      break;
    }
  }
  return zOut;
}
2096
2097/*
2098** xConnect/xCreate method for the spellfix1 module. Arguments are:
2099**
2100** argv[0] -> module name ("spellfix1")
2101** argv[1] -> database name
2102** argv[2] -> table name
2103** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
2104*/
2105static int spellfix1Init(
2106 int isCreate,
2107 sqlite3 *db,
2108 void *pAux,
2109 int argc, const char *const*argv,
2110 sqlite3_vtab **ppVTab,
2111 char **pzErr
2112)
2113{
2114 spellfix1_vtab *pNew = 0;
2115 /* const char *zModule = argv[0]; // not used */
2116 const char *zDbName = argv[1];
2117 const char *zTableName = argv[2];
2118 int nDbName;
2119 int rc = SQLITE_OK;
2120 int i;
2121
2122 nDbName = (int)strlen(zDbName);
2123 pNew = sqlite3_malloc64( sizeof(*pNew) + nDbName + 1);
2124 if( pNew==0 ) {
2125 rc = SQLITE_NOMEM;
2126 } else {
2127 memset(pNew, 0, sizeof(*pNew));
2128 pNew->zDbName = (char*)&pNew[1];
2129 memcpy(pNew->zDbName, zDbName, nDbName+1);
2130 pNew->zTableName = sqlite3_mprintf("%s", zTableName);
2131 pNew->db = db;
2132 if( pNew->zTableName==0 ) {
2133 rc = SQLITE_NOMEM;
2134 } else {
2135 sqlite3_vtab_config(db, SQLITE_VTAB_INNOCUOUS);
2136 rc = sqlite3_declare_vtab(db,
2137 "CREATE TABLE x(word,rank,distance,langid, "
2138 "score, matchlen, phonehash HIDDEN, "
2139 "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
2140 "soundslike HIDDEN, command HIDDEN)"
2141 );
2142#define SPELLFIX_COL_WORD 0
2143#define SPELLFIX_COL_RANK 1
2144#define SPELLFIX_COL_DISTANCE 2
2145#define SPELLFIX_COL_LANGID 3
2146#define SPELLFIX_COL_SCORE 4
2147#define SPELLFIX_COL_MATCHLEN 5
2148#define SPELLFIX_COL_PHONEHASH 6
2149#define SPELLFIX_COL_TOP 7
2150#define SPELLFIX_COL_SCOPE 8
2151#define SPELLFIX_COL_SRCHCNT 9
2152#define SPELLFIX_COL_SOUNDSLIKE 10
2153#define SPELLFIX_COL_COMMAND 11
2154 }
2155 if( rc==SQLITE_OK && isCreate ) {
2156 spellfix1DbExec(&rc, db,
2157 "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
2158 " id INTEGER PRIMARY KEY,\n"
2159 " rank INT,\n"
2160 " langid INT,\n"
2161 " word TEXT,\n"
2162 " k1 TEXT,\n"
2163 " k2 TEXT\n"
2164 ");\n",
2165 zDbName, zTableName
2166 );
2167 spellfix1DbExec(&rc, db,
2168 "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" "
2169 "ON \"%w_vocab\"(langid,k2);",
2170 zDbName, zTableName, zTableName
2171 );
2172 }
2173 for(i=3; rc==SQLITE_OK && i<argc; i++) {
2174 if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ) {
2175 pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
2176 if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
2177 continue;
2178 }
2179 *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]);
2180 rc = SQLITE_ERROR;
2181 }
2182 }
2183
2184 if( rc && pNew ) {
2185 *ppVTab = 0;
2186 spellfix1Uninit(0, &pNew->base);
2187 } else {
2188 *ppVTab = (sqlite3_vtab *)pNew;
2189 }
2190 return rc;
2191}
2192
2193/*
2194** The xConnect and xCreate methods
2195*/
2196static int spellfix1Connect(
2197 sqlite3 *db,
2198 void *pAux,
2199 int argc, const char *const*argv,
2200 sqlite3_vtab **ppVTab,
2201 char **pzErr
2202)
2203{
2204 return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr);
2205}
2206static int spellfix1Create(
2207 sqlite3 *db,
2208 void *pAux,
2209 int argc, const char *const*argv,
2210 sqlite3_vtab **ppVTab,
2211 char **pzErr
2212)
2213{
2214 return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr);
2215}
2216
2217/*
2218** Clear all of the content from a cursor.
2219*/
2220static void spellfix1ResetCursor(spellfix1_cursor *pCur)
2221{
2222 int i;
2223 for(i=0; i<pCur->nRow; i++) {
2224 sqlite3_free(pCur->a[i].zWord);
2225 }
2226 pCur->nRow = 0;
2227 pCur->iRow = 0;
2228 pCur->nSearch = 0;
2229 if( pCur->pFullScan ) {
2230 sqlite3_finalize(pCur->pFullScan);
2231 pCur->pFullScan = 0;
2232 }
2233}
2234
2235/*
2236** Resize the cursor to hold up to N rows of content
2237*/
2238static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N)
2239{
2240 struct spellfix1_row *aNew;
2241 assert( N>=pCur->nRow );
2242 aNew = sqlite3_realloc64(pCur->a, sizeof(pCur->a[0])*N);
2243 if( aNew==0 && N>0 ) {
2244 spellfix1ResetCursor(pCur);
2245 sqlite3_free(pCur->a);
2246 pCur->nAlloc = 0;
2247 pCur->a = 0;
2248 } else {
2249 pCur->nAlloc = N;
2250 pCur->a = aNew;
2251 }
2252}
2253
2254/*
2255** Close a fuzzy-search cursor.
2256*/
2257static int spellfix1Close(sqlite3_vtab_cursor *cur)
2258{
2259 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2260 spellfix1ResetCursor(pCur);
2261 spellfix1ResizeCursor(pCur, 0);
2262 sqlite3_free(pCur->zPattern);
2263 sqlite3_free(pCur);
2264 return SQLITE_OK;
2265}
2266
/*
** Bits of the query plan number (idxNum) that xBestIndex hands to
** xFilter.  Each bit records one kind of WHERE-clause term that the plan
** makes use of.
*/
#define SPELLFIX_IDXNUM_MATCH  0x01         /* word MATCH $str */
#define SPELLFIX_IDXNUM_LANGID 0x02         /* langid == $langid */
#define SPELLFIX_IDXNUM_TOP    0x04         /* top = $top */
#define SPELLFIX_IDXNUM_SCOPE  0x08         /* scope = $scope */
#define SPELLFIX_IDXNUM_DISTLT 0x10         /* distance < $distance */
#define SPELLFIX_IDXNUM_DISTLE 0x20         /* distance <= $distance */
#define SPELLFIX_IDXNUM_ROWID  0x40         /* rowid = $rowid */
/* Either kind of distance limit: DISTLT or DISTLE */
#define SPELLFIX_IDXNUM_DIST   (SPELLFIX_IDXNUM_DISTLT|SPELLFIX_IDXNUM_DISTLE)
2275
2276/*
2277**
2278** The plan number is a bitmask of the SPELLFIX_IDXNUM_* values defined
2279** above.
2280**
2281** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid
2282** if specified and in that order.
2283*/
2284static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo)
2285{
2286 int iPlan = 0;
2287 int iLangTerm = -1;
2288 int iTopTerm = -1;
2289 int iScopeTerm = -1;
2290 int iDistTerm = -1;
2291 int iRowidTerm = -1;
2292 int i;
2293 const struct sqlite3_index_constraint *pConstraint;
2294 pConstraint = pIdxInfo->aConstraint;
2295 for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++) {
2296 if( pConstraint->usable==0 ) continue;
2297
2298 /* Terms of the form: word MATCH $str */
2299 if( (iPlan & SPELLFIX_IDXNUM_MATCH)==0
2300 && pConstraint->iColumn==SPELLFIX_COL_WORD
2301 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH
2302 ) {
2303 iPlan |= SPELLFIX_IDXNUM_MATCH;
2304 pIdxInfo->aConstraintUsage[i].argvIndex = 1;
2305 pIdxInfo->aConstraintUsage[i].omit = 1;
2306 }
2307
2308 /* Terms of the form: langid = $langid */
2309 if( (iPlan & SPELLFIX_IDXNUM_LANGID)==0
2310 && pConstraint->iColumn==SPELLFIX_COL_LANGID
2311 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2312 ) {
2313 iPlan |= SPELLFIX_IDXNUM_LANGID;
2314 iLangTerm = i;
2315 }
2316
2317 /* Terms of the form: top = $top */
2318 if( (iPlan & SPELLFIX_IDXNUM_TOP)==0
2319 && pConstraint->iColumn==SPELLFIX_COL_TOP
2320 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2321 ) {
2322 iPlan |= SPELLFIX_IDXNUM_TOP;
2323 iTopTerm = i;
2324 }
2325
2326 /* Terms of the form: scope = $scope */
2327 if( (iPlan & SPELLFIX_IDXNUM_SCOPE)==0
2328 && pConstraint->iColumn==SPELLFIX_COL_SCOPE
2329 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2330 ) {
2331 iPlan |= SPELLFIX_IDXNUM_SCOPE;
2332 iScopeTerm = i;
2333 }
2334
2335 /* Terms of the form: distance < $dist or distance <= $dist */
2336 if( (iPlan & SPELLFIX_IDXNUM_DIST)==0
2337 && pConstraint->iColumn==SPELLFIX_COL_DISTANCE
2338 && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT
2339 || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE)
2340 ) {
2341 if( pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ) {
2342 iPlan |= SPELLFIX_IDXNUM_DISTLT;
2343 } else {
2344 iPlan |= SPELLFIX_IDXNUM_DISTLE;
2345 }
2346 iDistTerm = i;
2347 }
2348
2349 /* Terms of the form: distance < $dist or distance <= $dist */
2350 if( (iPlan & SPELLFIX_IDXNUM_ROWID)==0
2351 && pConstraint->iColumn<0
2352 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2353 ) {
2354 iPlan |= SPELLFIX_IDXNUM_ROWID;
2355 iRowidTerm = i;
2356 }
2357 }
2358 if( iPlan&SPELLFIX_IDXNUM_MATCH ) {
2359 int idx = 2;
2360 pIdxInfo->idxNum = iPlan;
2361 if( pIdxInfo->nOrderBy==1
2362 && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE
2363 && pIdxInfo->aOrderBy[0].desc==0
2364 ) {
2365 pIdxInfo->orderByConsumed = 1; /* Default order by iScore */
2366 }
2367 if( iPlan&SPELLFIX_IDXNUM_LANGID ) {
2368 pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
2369 pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
2370 }
2371 if( iPlan&SPELLFIX_IDXNUM_TOP ) {
2372 pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++;
2373 pIdxInfo->aConstraintUsage[iTopTerm].omit = 1;
2374 }
2375 if( iPlan&SPELLFIX_IDXNUM_SCOPE ) {
2376 pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++;
2377 pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1;
2378 }
2379 if( iPlan&SPELLFIX_IDXNUM_DIST ) {
2380 pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++;
2381 pIdxInfo->aConstraintUsage[iDistTerm].omit = 1;
2382 }
2383 pIdxInfo->estimatedCost = 1e5;
2384 } else if( (iPlan & SPELLFIX_IDXNUM_ROWID) ) {
2385 pIdxInfo->idxNum = SPELLFIX_IDXNUM_ROWID;
2386 pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1;
2387 pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1;
2388 pIdxInfo->estimatedCost = 5;
2389 } else {
2390 pIdxInfo->idxNum = 0;
2391 pIdxInfo->estimatedCost = 1e50;
2392 }
2393 return SQLITE_OK;
2394}
2395
2396/*
2397** Open a new fuzzy-search cursor.
2398*/
2399static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor)
2400{
2401 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2402 spellfix1_cursor *pCur;
2403 pCur = sqlite3_malloc64( sizeof(*pCur) );
2404 if( pCur==0 ) return SQLITE_NOMEM;
2405 memset(pCur, 0, sizeof(*pCur));
2406 pCur->pVTab = p;
2407 *ppCursor = &pCur->base;
2408 return SQLITE_OK;
2409}
2410
/*
** Combine an edit distance with a word's rank so that common words are
** preferred: the result is iDistance + 32 - B, where B is the number of
** bits needed to represent iRank (0 when iRank is zero or negative).
** A higher rank therefore gives a lower, that is better, score.
*/
static int spellfix1Score(int iDistance, int iRank){
  int nBit = 0;
  while( iRank>0 ){
    iRank >>= 1;
    nBit++;
  }
  return iDistance + 32 - nBit;
}
2421
2422/*
2423** Compare two spellfix1_row objects for sorting purposes in qsort() such
2424** that they sort in order of increasing distance.
2425*/
2426static int SQLITE_CDECL spellfix1RowCompare(const void *A, const void *B)
2427{
2428 const struct spellfix1_row *a = (const struct spellfix1_row*)A;
2429 const struct spellfix1_row *b = (const struct spellfix1_row*)B;
2430 return a->iScore - b->iScore;
2431}
2432
2433/*
2434** A structure used to pass information from spellfix1FilterForMatch()
2435** into spellfix1RunQuery().
2436*/
2437typedef struct MatchQuery {
2438 spellfix1_cursor *pCur; /* The cursor being queried */
2439 sqlite3_stmt *pStmt; /* shadow table query statment */
2440 char zHash[SPELLFIX_MX_HASH]; /* The current phonehash for zPattern */
2441 const char *zPattern; /* Transliterated input string */
2442 int nPattern; /* Length of zPattern */
2443 EditDist3FromString *pMatchStr3; /* Original unicode string */
2444 EditDist3Config *pConfig3; /* Edit-distance cost coefficients */
2445 const EditDist3Lang *pLang; /* The selected language coefficients */
2446 int iLang; /* The language id */
2447 int iScope; /* Default scope */
2448 int iMaxDist; /* Maximum allowed edit distance, or -1 */
2449 int rc; /* Error code */
2450 int nRun; /* Number of prior runs for the same zPattern */
2451 char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH]; /* Prior hashes */
2452} MatchQuery;
2453
2454/*
2455** Run a query looking for the best matches against zPattern using
2456** zHash as the character class seed hash.
2457*/
2458static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery)
2459{
2460 const char *zK1;
2461 const char *zWord;
2462 int iDist;
2463 int iRank;
2464 int iScore;
2465 int iWorst = 0;
2466 int idx;
2467 int idxWorst = -1;
2468 int i;
2469 int iScope = p->iScope;
2470 spellfix1_cursor *pCur = p->pCur;
2471 sqlite3_stmt *pStmt = p->pStmt;
2472 char zHash1[SPELLFIX_MX_HASH];
2473 char zHash2[SPELLFIX_MX_HASH];
2474 char *zClass;
2475 int nClass;
2476 int rc;
2477
2478 if( pCur->a==0 || p->rc ) return; /* Prior memory allocation failure */
2479 zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery);
2480 if( zClass==0 ) {
2481 p->rc = SQLITE_NOMEM;
2482 return;
2483 }
2484 nClass = (int)strlen(zClass);
2485 if( nClass>SPELLFIX_MX_HASH-2 ) {
2486 nClass = SPELLFIX_MX_HASH-2;
2487 zClass[nClass] = 0;
2488 }
2489 if( nClass<=iScope ) {
2490 if( nClass>2 ) {
2491 iScope = nClass-1;
2492 } else {
2493 iScope = nClass;
2494 }
2495 }
2496 memcpy(zHash1, zClass, iScope);
2497 sqlite3_free(zClass);
2498 zHash1[iScope] = 0;
2499 memcpy(zHash2, zHash1, iScope);
2500 zHash2[iScope] = 'Z';
2501 zHash2[iScope+1] = 0;
2502#if SPELLFIX_MX_RUN>1
2503 for(i=0; i<p->nRun; i++) {
2504 if( strcmp(p->azPrior[i], zHash1)==0 ) return;
2505 }
2506#endif
2507 assert( p->nRun<SPELLFIX_MX_RUN );
2508 memcpy(p->azPrior[p->nRun++], zHash1, iScope+1);
2509 if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM
2510 || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM
2511 ) {
2512 p->rc = SQLITE_NOMEM;
2513 return;
2514 }
2515#if SPELLFIX_MX_RUN>1
2516 for(i=0; i<pCur->nRow; i++) {
2517 if( pCur->a[i].iScore>iWorst ) {
2518 iWorst = pCur->a[i].iScore;
2519 idxWorst = i;
2520 }
2521 }
2522#endif
2523 while( sqlite3_step(pStmt)==SQLITE_ROW ) {
2524 int iMatchlen = -1;
2525 iRank = sqlite3_column_int(pStmt, 2);
2526 if( p->pMatchStr3 ) {
2527 int nWord = sqlite3_column_bytes(pStmt, 1);
2528 zWord = (const char*)sqlite3_column_text(pStmt, 1);
2529 iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
2530 } else {
2531 zK1 = (const char*)sqlite3_column_text(pStmt, 3);
2532 if( zK1==0 ) continue;
2533 iDist = editdist1(p->zPattern, zK1, 0);
2534 }
2535 if( iDist<0 ) {
2536 p->rc = SQLITE_NOMEM;
2537 break;
2538 }
2539 pCur->nSearch++;
2540
2541 /* If there is a "distance < $dist" or "distance <= $dist" constraint,
2542 ** check if this row meets it. If not, jump back up to the top of the
2543 ** loop to process the next row. Otherwise, if the row does match the
2544 ** distance constraint, check if the pCur->a[] array is already full.
2545 ** If it is and no explicit "top = ?" constraint was present in the
2546 ** query, grow the array to ensure there is room for the new entry. */
2547 assert( (p->iMaxDist>=0)==((pCur->idxNum & SPELLFIX_IDXNUM_DIST) ? 1 : 0) );
2548 if( p->iMaxDist>=0 ) {
2549 if( iDist>p->iMaxDist ) continue;
2550 if( pCur->nRow>=pCur->nAlloc && (pCur->idxNum & SPELLFIX_IDXNUM_TOP)==0 ) {
2551 spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
2552 if( pCur->a==0 ) break;
2553 }
2554 }
2555
2556 iScore = spellfix1Score(iDist,iRank);
2557 if( pCur->nRow<pCur->nAlloc ) {
2558 idx = pCur->nRow;
2559 } else if( iScore<iWorst ) {
2560 idx = idxWorst;
2561 sqlite3_free(pCur->a[idx].zWord);
2562 } else {
2563 continue;
2564 }
2565
2566 pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
2567 if( pCur->a[idx].zWord==0 ) {
2568 p->rc = SQLITE_NOMEM;
2569 break;
2570 }
2571 pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
2572 pCur->a[idx].iRank = iRank;
2573 pCur->a[idx].iDistance = iDist;
2574 pCur->a[idx].iScore = iScore;
2575 pCur->a[idx].iMatchlen = iMatchlen;
2576 memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
2577 if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
2578 if( pCur->nRow==pCur->nAlloc ) {
2579 iWorst = pCur->a[0].iScore;
2580 idxWorst = 0;
2581 for(i=1; i<pCur->nRow; i++) {
2582 iScore = pCur->a[i].iScore;
2583 if( iWorst<iScore ) {
2584 iWorst = iScore;
2585 idxWorst = i;
2586 }
2587 }
2588 }
2589 }
2590 rc = sqlite3_reset(pStmt);
2591 if( rc ) p->rc = rc;
2592}
2593
2594/*
2595** This version of the xFilter method work if the MATCH term is present
2596** and we are doing a scan.
2597*/
2598static int spellfix1FilterForMatch(
2599 spellfix1_cursor *pCur,
2600 int argc,
2601 sqlite3_value **argv
2602)
2603{
2604 int idxNum = pCur->idxNum;
2605 const unsigned char *zMatchThis; /* RHS of the MATCH operator */
2606 EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */
2607 char *zPattern; /* Transliteration of zMatchThis */
2608 int nPattern; /* Length of zPattern */
2609 int iLimit = 20; /* Max number of rows of output */
2610 int iScope = 3; /* Use this many characters of zClass */
2611 int iLang = 0; /* Language code */
2612 char *zSql; /* SQL of shadow table query */
2613 sqlite3_stmt *pStmt = 0; /* Shadow table query */
2614 int rc; /* Result code */
2615 int idx = 1; /* Next available filter parameter */
2616 spellfix1_vtab *p = pCur->pVTab; /* The virtual table that owns pCur */
2617 MatchQuery x; /* For passing info to RunQuery() */
2618
2619 /* Load the cost table if we have not already done so */
2620 if( p->zCostTable!=0 && p->pConfig3==0 ) {
2621 p->pConfig3 = sqlite3_malloc64( sizeof(p->pConfig3[0]) );
2622 if( p->pConfig3==0 ) return SQLITE_NOMEM;
2623 memset(p->pConfig3, 0, sizeof(p->pConfig3[0]));
2624 rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable);
2625 if( rc ) return rc;
2626 }
2627 memset(&x, 0, sizeof(x));
2628 x.iScope = 3; /* Default scope if none specified by "WHERE scope=N" */
2629 x.iMaxDist = -1; /* Maximum allowed edit distance */
2630
2631 if( idxNum&2 ) {
2632 iLang = sqlite3_value_int(argv[idx++]);
2633 }
2634 if( idxNum&4 ) {
2635 iLimit = sqlite3_value_int(argv[idx++]);
2636 if( iLimit<1 ) iLimit = 1;
2637 }
2638 if( idxNum&8 ) {
2639 x.iScope = sqlite3_value_int(argv[idx++]);
2640 if( x.iScope<1 ) x.iScope = 1;
2641 if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2;
2642 }
2643 if( idxNum&(16|32) ) {
2644 x.iMaxDist = sqlite3_value_int(argv[idx++]);
2645 if( idxNum&16 ) x.iMaxDist--;
2646 if( x.iMaxDist<0 ) x.iMaxDist = 0;
2647 }
2648 spellfix1ResetCursor(pCur);
2649 spellfix1ResizeCursor(pCur, iLimit);
2650 zMatchThis = sqlite3_value_text(argv[0]);
2651 if( zMatchThis==0 ) return SQLITE_OK;
2652 if( p->pConfig3 ) {
2653 x.pLang = editDist3FindLang(p->pConfig3, iLang);
2654 pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
2655 if( pMatchStr3==0 ) {
2656 x.rc = SQLITE_NOMEM;
2657 goto filter_exit;
2658 }
2659 } else {
2660 x.pLang = 0;
2661 }
2662 zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
2663 sqlite3_free(pCur->zPattern);
2664 pCur->zPattern = zPattern;
2665 if( zPattern==0 ) {
2666 x.rc = SQLITE_NOMEM;
2667 goto filter_exit;
2668 }
2669 nPattern = (int)strlen(zPattern);
2670 if( zPattern[nPattern-1]=='*' ) nPattern--;
2671 zSql = sqlite3_mprintf(
2672 "SELECT id, word, rank, coalesce(k1,word)"
2673 " FROM \"%w\".\"%w_vocab\""
2674 " WHERE langid=%d AND k2>=?1 AND k2<?2",
2675 p->zDbName, p->zTableName, iLang
2676 );
2677 if( zSql==0 ) {
2678 x.rc = SQLITE_NOMEM;
2679 pStmt = 0;
2680 goto filter_exit;
2681 }
2682 rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
2683 sqlite3_free(zSql);
2684 pCur->iLang = iLang;
2685 x.pCur = pCur;
2686 x.pStmt = pStmt;
2687 x.zPattern = zPattern;
2688 x.nPattern = nPattern;
2689 x.pMatchStr3 = pMatchStr3;
2690 x.iLang = iLang;
2691 x.rc = rc;
2692 x.pConfig3 = p->pConfig3;
2693 if( x.rc==SQLITE_OK ) {
2694 spellfix1RunQuery(&x, zPattern, nPattern);
2695 }
2696
2697 if( pCur->a ) {
2698 qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
2699 pCur->iTop = iLimit;
2700 pCur->iScope = iScope;
2701 } else {
2702 x.rc = SQLITE_NOMEM;
2703 }
2704
2705filter_exit:
2706 sqlite3_finalize(pStmt);
2707 editDist3FromStringDelete(pMatchStr3);
2708 return x.rc;
2709}
2710
2711/*
2712** This version of xFilter handles a full-table scan case
2713*/
2714static int spellfix1FilterForFullScan(
2715 spellfix1_cursor *pCur,
2716 int argc,
2717 sqlite3_value **argv
2718)
2719{
2720 int rc = SQLITE_OK;
2721 int idxNum = pCur->idxNum;
2722 char *zSql;
2723 spellfix1_vtab *pVTab = pCur->pVTab;
2724 spellfix1ResetCursor(pCur);
2725 assert( idxNum==0 || idxNum==64 );
2726 zSql = sqlite3_mprintf(
2727 "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s",
2728 pVTab->zDbName, pVTab->zTableName,
2729 ((idxNum & 64) ? " WHERE rowid=?" : "")
2730 );
2731 if( zSql==0 ) return SQLITE_NOMEM;
2732 rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0);
2733 sqlite3_free(zSql);
2734 if( rc==SQLITE_OK && (idxNum & 64) ) {
2735 assert( argc==1 );
2736 rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]);
2737 }
2738 pCur->nRow = pCur->iRow = 0;
2739 if( rc==SQLITE_OK ) {
2740 rc = sqlite3_step(pCur->pFullScan);
2741 if( rc==SQLITE_ROW ) {
2742 pCur->iRow = -1;
2743 rc = SQLITE_OK;
2744 }
2745 if( rc==SQLITE_DONE ) {
2746 rc = SQLITE_OK;
2747 }
2748 } else {
2749 pCur->iRow = 0;
2750 }
2751 return rc;
2752}
2753
2754/*
2755** Called to "rewind" a cursor back to the beginning so that
2756** it starts its output over again. Always called at least once
2757** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
2758*/
2759static int spellfix1Filter(
2760 sqlite3_vtab_cursor *cur,
2761 int idxNum, const char *idxStr,
2762 int argc, sqlite3_value **argv
2763)
2764{
2765 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2766 int rc;
2767 pCur->idxNum = idxNum;
2768 if( idxNum & 1 ) {
2769 rc = spellfix1FilterForMatch(pCur, argc, argv);
2770 } else {
2771 rc = spellfix1FilterForFullScan(pCur, argc, argv);
2772 }
2773 return rc;
2774}
2775
2776/*
2777** Advance a cursor to its next row of output
2778*/
2779static int spellfix1Next(sqlite3_vtab_cursor *cur)
2780{
2781 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2782 int rc = SQLITE_OK;
2783 if( pCur->iRow < pCur->nRow ) {
2784 if( pCur->pFullScan ) {
2785 rc = sqlite3_step(pCur->pFullScan);
2786 if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow;
2787 if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK;
2788 } else {
2789 pCur->iRow++;
2790 }
2791 }
2792 return rc;
2793}
2794
2795/*
2796** Return TRUE if we are at the end-of-file
2797*/
2798static int spellfix1Eof(sqlite3_vtab_cursor *cur)
2799{
2800 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2801 return pCur->iRow>=pCur->nRow;
2802}
2803
2804/*
2805** Return columns from the current row.
2806*/
2807static int spellfix1Column(
2808 sqlite3_vtab_cursor *cur,
2809 sqlite3_context *ctx,
2810 int i
2811)
2812{
2813 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2814 if( pCur->pFullScan ) {
2815 if( i<=SPELLFIX_COL_LANGID ) {
2816 sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i));
2817 } else {
2818 sqlite3_result_null(ctx);
2819 }
2820 return SQLITE_OK;
2821 }
2822 switch( i ) {
2823 case SPELLFIX_COL_WORD: {
2824 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
2825 break;
2826 }
2827 case SPELLFIX_COL_RANK: {
2828 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
2829 break;
2830 }
2831 case SPELLFIX_COL_DISTANCE: {
2832 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance);
2833 break;
2834 }
2835 case SPELLFIX_COL_LANGID: {
2836 sqlite3_result_int(ctx, pCur->iLang);
2837 break;
2838 }
2839 case SPELLFIX_COL_SCORE: {
2840 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
2841 break;
2842 }
2843 case SPELLFIX_COL_MATCHLEN: {
2844 int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
2845 if( iMatchlen<0 ) {
2846 int nPattern = (int)strlen(pCur->zPattern);
2847 char *zWord = pCur->a[pCur->iRow].zWord;
2848 int nWord = (int)strlen(zWord);
2849
2850 if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ) {
2851 char *zTranslit;
2852 int res;
2853 zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
2854 if( !zTranslit ) return SQLITE_NOMEM;
2855 res = editdist1(pCur->zPattern, zTranslit, &iMatchlen);
2856 sqlite3_free(zTranslit);
2857 if( res<0 ) return SQLITE_NOMEM;
2858 iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
2859 } else {
2860 iMatchlen = utf8Charlen(zWord, nWord);
2861 }
2862 }
2863
2864 sqlite3_result_int(ctx, iMatchlen);
2865 break;
2866 }
2867 case SPELLFIX_COL_PHONEHASH: {
2868 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
2869 break;
2870 }
2871 case SPELLFIX_COL_TOP: {
2872 sqlite3_result_int(ctx, pCur->iTop);
2873 break;
2874 }
2875 case SPELLFIX_COL_SCOPE: {
2876 sqlite3_result_int(ctx, pCur->iScope);
2877 break;
2878 }
2879 case SPELLFIX_COL_SRCHCNT: {
2880 sqlite3_result_int(ctx, pCur->nSearch);
2881 break;
2882 }
2883 default: {
2884 sqlite3_result_null(ctx);
2885 break;
2886 }
2887 }
2888 return SQLITE_OK;
2889}
2890
2891/*
2892** The rowid.
2893*/
2894static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid)
2895{
2896 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2897 if( pCur->pFullScan ) {
2898 *pRowid = sqlite3_column_int64(pCur->pFullScan, 4);
2899 } else {
2900 *pRowid = pCur->a[pCur->iRow].iRowid;
2901 }
2902 return SQLITE_OK;
2903}
2904
2905/*
2906** This function is called by the xUpdate() method. It returns a string
2907** containing the conflict mode that xUpdate() should use for the current
2908** operation. One of: "ROLLBACK", "IGNORE", "ABORT" or "REPLACE".
2909*/
2910static const char *spellfix1GetConflict(sqlite3 *db)
2911{
2912 static const char *azConflict[] = {
2913 /* Note: Instead of "FAIL" - "ABORT". */
2914 "ROLLBACK", "IGNORE", "ABORT", "ABORT", "REPLACE"
2915 };
2916 int eConflict = sqlite3_vtab_on_conflict(db);
2917
2918 assert( eConflict==SQLITE_ROLLBACK || eConflict==SQLITE_IGNORE
2919 || eConflict==SQLITE_FAIL || eConflict==SQLITE_ABORT
2920 || eConflict==SQLITE_REPLACE
2921 );
2922 assert( SQLITE_ROLLBACK==1 );
2923 assert( SQLITE_IGNORE==2 );
2924 assert( SQLITE_FAIL==3 );
2925 assert( SQLITE_ABORT==4 );
2926 assert( SQLITE_REPLACE==5 );
2927
2928 return azConflict[eConflict-1];
2929}
2930
2931/*
2932** The xUpdate() method.
2933*/
2934static int spellfix1Update(
2935 sqlite3_vtab *pVTab,
2936 int argc,
2937 sqlite3_value **argv,
2938 sqlite_int64 *pRowid
2939)
2940{
2941 int rc = SQLITE_OK;
2942 sqlite3_int64 rowid, newRowid;
2943 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2944 sqlite3 *db = p->db;
2945
2946 if( argc==1 ) {
2947 /* A delete operation on the rowid given by argv[0] */
2948 rowid = *pRowid = sqlite3_value_int64(argv[0]);
2949 spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" "
2950 " WHERE id=%lld",
2951 p->zDbName, p->zTableName, rowid);
2952 } else {
2953 const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]);
2954 int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]);
2955 int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]);
2956 int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]);
2957 const unsigned char *zSoundslike =
2958 sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2959 int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2960 char *zK1, *zK2;
2961 int i;
2962 char c;
2963 const char *zConflict = spellfix1GetConflict(db);
2964
2965 if( zWord==0 ) {
2966 /* Inserts of the form: INSERT INTO table(command) VALUES('xyzzy');
2967 ** cause zWord to be NULL, so we look at the "command" column to see
2968 ** what special actions to take */
2969 const char *zCmd =
2970 (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]);
2971 if( zCmd==0 ) {
2972 pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word",
2973 p->zTableName);
2974 return SQLITE_CONSTRAINT_NOTNULL;
2975 }
2976 if( strcmp(zCmd,"reset")==0 ) {
2977 /* Reset the edit cost table (if there is one). */
2978 editDist3ConfigDelete(p->pConfig3);
2979 p->pConfig3 = 0;
2980 return SQLITE_OK;
2981 }
2982 if( strncmp(zCmd,"edit_cost_table=",16)==0 ) {
2983 editDist3ConfigDelete(p->pConfig3);
2984 p->pConfig3 = 0;
2985 sqlite3_free(p->zCostTable);
2986 p->zCostTable = spellfix1Dequote(zCmd+16);
2987 if( p->zCostTable==0 ) return SQLITE_NOMEM;
2988 if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ) {
2989 sqlite3_free(p->zCostTable);
2990 p->zCostTable = 0;
2991 }
2992 return SQLITE_OK;
2993 }
2994 pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"",
2995 p->zTableName, zCmd);
2996 return SQLITE_ERROR;
2997 }
2998 if( iRank<1 ) iRank = 1;
2999 if( zSoundslike ) {
3000 zK1 = (char*)transliterate(zSoundslike, nSoundslike);
3001 } else {
3002 zK1 = (char*)transliterate(zWord, nWord);
3003 }
3004 if( zK1==0 ) return SQLITE_NOMEM;
3005 for(i=0; (c = zK1[i])!=0; i++) {
3006 if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A';
3007 }
3008 zK2 = (char*)phoneticHash((const unsigned char*)zK1, i);
3009 if( zK2==0 ) {
3010 sqlite3_free(zK1);
3011 return SQLITE_NOMEM;
3012 }
3013 if( sqlite3_value_type(argv[0])==SQLITE_NULL ) {
3014 if( sqlite3_value_type(argv[1])==SQLITE_NULL ) {
3015 spellfix1DbExec(&rc, db,
3016 "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
3017 "VALUES(%d,%d,%Q,nullif(%Q,%Q),%Q)",
3018 p->zDbName, p->zTableName,
3019 iRank, iLang, zWord, zK1, zWord, zK2
3020 );
3021 } else {
3022 newRowid = sqlite3_value_int64(argv[1]);
3023 spellfix1DbExec(&rc, db,
3024 "INSERT OR %s INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) "
3025 "VALUES(%lld,%d,%d,%Q,nullif(%Q,%Q),%Q)",
3026 zConflict, p->zDbName, p->zTableName,
3027 newRowid, iRank, iLang, zWord, zK1, zWord, zK2
3028 );
3029 }
3030 *pRowid = sqlite3_last_insert_rowid(db);
3031 } else {
3032 rowid = sqlite3_value_int64(argv[0]);
3033 newRowid = *pRowid = sqlite3_value_int64(argv[1]);
3034 spellfix1DbExec(&rc, db,
3035 "UPDATE OR %s \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
3036 " word=%Q, k1=nullif(%Q,%Q), k2=%Q WHERE id=%lld",
3037 zConflict, p->zDbName, p->zTableName, newRowid, iRank, iLang,
3038 zWord, zK1, zWord, zK2, rowid
3039 );
3040 }
3041 sqlite3_free(zK1);
3042 sqlite3_free(zK2);
3043 }
3044 return rc;
3045}
3046
3047/*
3048** Rename the spellfix1 table.
3049*/
3050static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew)
3051{
3052 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
3053 sqlite3 *db = p->db;
3054 int rc = SQLITE_OK;
3055 char *zNewName = sqlite3_mprintf("%s", zNew);
3056 if( zNewName==0 ) {
3057 return SQLITE_NOMEM;
3058 }
3059 spellfix1DbExec(&rc, db,
3060 "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
3061 p->zDbName, p->zTableName, zNewName
3062 );
3063 if( rc==SQLITE_OK ) {
3064 sqlite3_free(p->zTableName);
3065 p->zTableName = zNewName;
3066 } else {
3067 sqlite3_free(zNewName);
3068 }
3069 return rc;
3070}
3071
/*
** The method table for the spellfix1 virtual table module, which
** provides fuzzy search.  spellfix1Register() passes this table to
** sqlite3_create_module() under the module name "spellfix1".
**
** A slot holding 0 means this module supplies no implementation of that
** method.
*/
static sqlite3_module spellfix1Module = {
  0,                       /* iVersion */
  spellfix1Create,         /* xCreate - handle CREATE VIRTUAL TABLE */
  spellfix1Connect,        /* xConnect - reconnect to an existing table */
  spellfix1BestIndex,      /* xBestIndex - figure out how to do a query */
  spellfix1Disconnect,     /* xDisconnect - close a connection */
  spellfix1Destroy,        /* xDestroy - handle DROP TABLE */
  spellfix1Open,           /* xOpen - open a cursor */
  spellfix1Close,          /* xClose - close a cursor */
  spellfix1Filter,         /* xFilter - configure scan constraints */
  spellfix1Next,           /* xNext - advance a cursor */
  spellfix1Eof,            /* xEof - check for end of scan */
  spellfix1Column,         /* xColumn - read one column of the current row */
  spellfix1Rowid,          /* xRowid - read the rowid of the current row */
  spellfix1Update,         /* xUpdate - INSERT, UPDATE, DELETE and commands */
  0,                       /* xBegin */
  0,                       /* xSync */
  0,                       /* xCommit */
  0,                       /* xRollback */
  0,                       /* xFindMethod */
  spellfix1Rename,         /* xRename - rename the table */
  0,                       /* xSavepoint */
  0,                       /* xRelease */
  0,                       /* xRollbackTo */
  0,                       /* xShadowName */
  0                        /* xIntegrity */
};
3102
3103/*
3104** Register the various functions and the virtual table.
3105*/
3106static int spellfix1Register(sqlite3 *db)
3107{
3108 int rc = SQLITE_OK;
3109 int i;
3110 rc = sqlite3_create_function(db, "spellfix1_translit", 1,
3111 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3112 transliterateSqlFunc, 0, 0);
3113 if( rc==SQLITE_OK ) {
3114 rc = sqlite3_create_function(db, "spellfix1_editdist", 2,
3115 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3116 editdistSqlFunc, 0, 0);
3117 }
3118 if( rc==SQLITE_OK ) {
3119 rc = sqlite3_create_function(db, "spellfix1_phonehash", 1,
3120 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3121 phoneticHashSqlFunc, 0, 0);
3122 }
3123 if( rc==SQLITE_OK ) {
3124 rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1,
3125 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3126 scriptCodeSqlFunc, 0, 0);
3127 }
3128 if( rc==SQLITE_OK ) {
3129 rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
3130 }
3131 if( rc==SQLITE_OK ) {
3132 rc = editDist3Install(db);
3133 }
3134
3135 /* Verify sanity of the translit[] table */
3136 for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++) {
3137 assert( translit[i].cFrom<translit[i+1].cFrom );
3138 }
3139
3140 return rc;
3141}
3142
3143#endif /* SQLITE_OMIT_VIRTUALTABLE */
3144
3145/*
3146** Extension load function.
3147*/
3148#ifdef _WIN32
3149__declspec(dllexport)
3150#endif
3151int sqlite3_spellfix_init(
3152 sqlite3 *db,
3153 char **pzErrMsg,
3154 const sqlite3_api_routines *pApi
3155)
3156{
3157 SQLITE_EXTENSION_INIT2(pApi);
3158#ifndef SQLITE_OMIT_VIRTUALTABLE
3159 return spellfix1Register(db);
3160#endif
3161 return SQLITE_OK;
3162}