Zen: Gendict

Gendict - tsearch2 dictionary templates generator

This module generates templates for tsearch2 dictionaries. It has built-in support for Snowball stemmers.

Read http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/README.gendict for more details.

Snowball stemmer

Here is how to add a Portuguese stemmer:

   0. cd PGSQL_SRC/contrib/tsearch2/gendict

   1. Obtain stem.{c,h} files for Portuguese

      wget http://snowball.tartarus.org/portuguese/stem.c
      wget http://snowball.tartarus.org/portuguese/stem.h
   
   2. Create template files for Portuguese

      ./config.sh -n pt -s -p portuguese -v -C'Snowball stemmer for Portuguese'

      Note that the argument of the -p option must be *the same* as the name of
      the stemming function in stem.c (without _stem).

      A bunch of files will be generated and placed in the PGSQL_SRC/contrib/dict_pt
      directory.

   3. Compile and install dictionary

	cd ../../dict_pt
	make
	make install

   4. Test it 

	Sample Portuguese words with their stemmed forms are available
        from http://snowball.tartarus.org/portuguese/stemmer.html

 	createdb testdict
	psql testdict < /usr/local/pgsql/share/contrib/tsearch2.sql
	psql testdict < /usr/local/pgsql/share/contrib/dict_pt.sql
	psql -d testdict -c "select lexize('pt','bobagem');"
	 lexize  
	---------
	 {bobag}

IntDict example

The motivation for this dictionary is to control the indexing of integers (signed and unsigned) and, consequently, to minimize the number of unique words, which, in turn, greatly affects search performance.

The dictionary accepts two init options:
* The MAXLEN parameter specifies the maximum length of a number considered a 'good' integer. The default value is 6.
* The REJECTLONG parameter specifies whether a 'long' integer should be indexed or treated as a stop word.
** If REJECTLONG=false (default), then the dictionary returns the leading part of the integer, cut to MAXLEN characters.
** If REJECTLONG=true, then the dictionary considers the integer a stop word.

Examples:

12345678902132435454 - 'garbage'
123456 - 'good' integer number suitable for indexing

 cd tsearch2/gendict/
 ./config.sh -n intdict -v -i -C 'dictionary for integers'
 cd ../../dict_intdict

Now I can edit the file dict_tmpl.c generated by gendict. (See the explanatory notes below.)

/* 
 * example of dictionary 
 * Teodor Sigaev <teodor@sigaev.ru>
 */
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include "postgres.h"
#include "fmgr.h"

/* needed for 8.2+ */
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

#include "dict.h"
#include "common.h"

#include "subinclude.h"

 /* Per-dictionary settings: filled in by dinit_intdict, read by dlexize_intdict. */
 typedef struct {
        int     maxlen;         /* longest token length returned unchanged (default 6) */
        bool    rejectlong;     /* true: longer tokens become stop words; false: they are cut to maxlen */
 } DictInt;


 PG_FUNCTION_INFO_V1(dinit_intdict);
 Datum dinit_intdict(PG_FUNCTION_ARGS);

 Datum 
 dinit_intdict(PG_FUNCTION_ARGS) {
        DictInt *d = (DictInt*)malloc( sizeof(DictInt) );
        Map *cfg, *pcfg;
        text *in;
 
        if ( !d )
                elog(ERROR, "No memory");
        memset(d,0,sizeof(DictInt));
 
        /* Your INIT code */
/* defaults */
        d->maxlen = 6;
        d->rejectlong = false;

if ( PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL ) { /* no options */
        PG_RETURN_POINTER(d);
}
        in = PG_GETARG_TEXT_P(0);
        parse_cfgdict(in,&cfg);
        PG_FREE_IF_COPY(in, 0);
        pcfg=cfg;

        while (pcfg->key) {
                if ( strcasecmp("MAXLEN", pcfg->key) == 0 ) {
                        d->maxlen=atoi(pcfg->value);
                } else if ( strcasecmp("REJECTLONG", pcfg->key) == 0 ) {
                        if ( strcasecmp("true", pcfg->value) == 0 ) {
                                d->rejectlong=true;
                        } else if ( strcasecmp("false", pcfg->value) == 0 ) {
                                d->rejectlong=false;
                        } else {
                                elog(ERROR,"Unknown value: %s => %s", pcfg->key, pcfg->value);
                        }
                } else {
                        elog(ERROR,"Unknown option: %s => %s", pcfg->key, pcfg->value);
                }
                pfree(pcfg->key);
                pfree(pcfg->value);
                pcfg++;
        }
        pfree(cfg);     
 
        PG_RETURN_POINTER(d);
 }

PG_FUNCTION_INFO_V1(dlexize_intdict);
Datum dlexize_intdict(PG_FUNCTION_ARGS);

/*
 * dlexize_intdict - lexize one integer token.
 *
 * Arguments: 0 - dictionary state (DictInt), 1 - pointer to the token,
 * 2 - token length.
 *
 * Returns a two-element TSLexeme array; res[1].lexeme == NULL marks the end.
 * res[0].lexeme is a copy of the token when it is not longer than maxlen,
 * the copy cut to maxlen characters when it is longer and rejectlong is
 * false, or NULL (void array, i.e. stop word) when rejectlong is true.
 */
Datum
dlexize_intdict(PG_FUNCTION_ARGS) {
        DictInt   *dict   = (DictInt*)PG_GETARG_POINTER(0);
        char      *token  = (char*)PG_GETARG_POINTER(1);
        int        toklen = PG_GETARG_INT32(2);
        char      *copy   = pnstrdup(token, toklen);
        TSLexeme  *out    = palloc(sizeof(TSLexeme)*2);

        /* terminator entry; by default hand back the token copy as is */
        out[1].lexeme = NULL;
        out[0].lexeme = copy;

        /* short enough: nothing more to do */
        if ( toklen <= dict->maxlen )
                PG_RETURN_POINTER(out);

        if ( dict->rejectlong ) {
                /* stop word: return void array */
                pfree(copy);
                out[0].lexeme = NULL;
        } else {
                /* cut the integer to maxlen characters */
                copy[dict->maxlen] = '\0';
        }

        PG_RETURN_POINTER(out);
}

Specify default options in dict_intdict.sql.in

-- Register the intdict dictionary in pg_ts_dict.
insert into pg_ts_dict select
        -- dictionary name
        'intdict',
        -- OID of the init function
        (select oid from pg_proc where proname='dinit_intdict'),

        -- default init options, parsed by dinit_intdict
        'MAXLEN=6,REJECTLONG=false',

        -- OID of the lexize function
        (select oid from pg_proc where proname='dlexize_intdict'),
        -- dictionary description
        'dictionary for integers';

After that, I compile and install it.

make
make install

Test it

createdb qq 
psql qq < /usr/local/pgsql/share/contrib/tsearch2.sql 
psql qq < /usr/local/pgsql/share/contrib/dict_intdict.sql 
qq=# select dict_name, dict_initoption from pg_ts_dict where  dict_name='intdict';
 dict_name |      dict_initoption      
-----------+---------------------------
 intdict   | MAXLEN=6,REJECTLONG=false
qq=# select lexize('intdict','12345678');
  lexize  
----------
 {123456}
qq=# select lexize('intdict','123');
 lexize 
--------
 {123}

Now, change initoption:

qq=# update pg_ts_dict set dict_initoption='MAXLEN=6,REJECTLONG=true' where dict_name = 'intdict';
UPDATE 1
qq=# select dict_name, dict_initoption from pg_ts_dict where dict_name='intdict';
 dict_name |     dict_initoption      
-----------+--------------------------
 intdict   | MAXLEN=6,REJECTLONG=true
qq=# select lexize('intdict','12345678');
  lexize  
----------
 {123456}

It appears that changing the init options doesn't work :) For performance reasons, the init function of a dictionary is called only once per session, so you need to end the session and begin a new one, or use the function reset_tsearch().

qq=# select reset_tsearch();
NOTICE:  TSearch cache cleaned
 reset_tsearch 
---------------
 
(1 row)

qq=# select lexize('intdict','12345678');
 lexize 
--------
 {}

Now it works as expected - the long integer is treated as a stop word.

Using IntDict

Specify the intdict dictionary to process int and uint tokens (for simplicity, I did that for all configurations):

qq=# select * from pg_ts_cfgmap where tok_alias ~ 'int';
     ts_name     | tok_alias | dict_name 
-----------------+-----------+-----------
 default         | int       | {simple}
 default         | uint      | {simple}
 default_russian | int       | {simple}
 default_russian | uint      | {simple}
 simple          | int       | {simple}
 simple          | uint      | {simple}
qq=# update pg_ts_cfgmap set dict_name='{intdict}' where tok_alias ~ 'int';
UPDATE 6
qq=# select * from pg_ts_cfgmap where tok_alias ~ 'int';
     ts_name     | tok_alias | dict_name 
-----------------+-----------+-----------
 default         | int       | {intdict}
 default         | uint      | {intdict}
 default_russian | int       | {intdict}
 default_russian | uint      | {intdict}
 simple          | int       | {intdict}
 simple          | uint      | {intdict}

That's all.

DecDict example

The motivation for this dictionary is to control the indexing of decimal numbers and, consequently, to minimize the number of unique words, which, in turn, greatly affects search performance.

The dictionary accepts two init options:
* The MAXLENFRAC parameter specifies the maximum length of the fraction part considered a 'good' decimal. The default value is 3.
* The REJECTLONG parameter specifies whether a decimal number with a 'long' fraction part should be indexed or treated as a stop word.
** If REJECTLONG=false (default), then the dictionary returns the decimal number with its fraction part cut to MAXLENFRAC digits.
** If REJECTLONG=true, then the dictionary considers the number a stop word.

Examples:

13.345678902132435454 - 'garbage'
13.345 - 'good' decimal number suitable for indexing

Notice that REJECTLONG=false allows indexing of 'shortened' numbers, so search results will contain documents with the original 'garbage' numbers.

I leave the implementation of this dictionary to the reader :)

Dictionary API

The internal structure of a dictionary should be malloc-ed (or palloc-ed in TopMemoryContext).

 typedef struct {
        int     maxlen;         /* longest token length returned unchanged */
        bool    rejectlong;     /* true: longer tokens become stop words */
 } DictInt;
.............................
DictInt *d = (DictInt*)malloc( sizeof(DictInt) );

parse_cfgdict processes the text string "in" and returns a structure containing (key, value) pairs.

        in = PG_GETARG_TEXT_P(0);
        parse_cfgdict(in,&cfg);
        PG_FREE_IF_COPY(in, 0);

Input for the lexize function:
- the structure of the dictionary (DictInt)
- a pointer to the string and its length
The lexize function (dlexize_intdict) should return a pointer to an array of TSLexeme (tsearch2/dict.h). The interface to tsearch2 dictionaries was changed in 8.1 to support compound words; in older versions lexize returned an array of pointers to C strings. The intdict dictionary returns only one entry, so we use res[1] as an end marker (res[1].lexeme = NULL;) and use res[0] to return:
- a void array (res[0].lexeme is NULL), if the dictionary considers the input integer a stop word

                        res[0].lexeme = NULL;

- a NULL pointer instead of an array, if the dictionary can't resolve the input. We don't have such a case, because intdict is supposed to work only with integers (via pg_ts_cfgmap)

                        PG_RETURN_POINTER(NULL)

stemmed integer, which could be
- shortened number

                        txt[d->maxlen] = '\0';
                        res[0].lexeme = txt;

or unchanged

                        res[0].lexeme = txt;

Gendict