|
Документ взят из кэша поисковой машины. Адрес
оригинального документа
: http://www.sai.msu.su/~megera/wiki/tp_search_doc
Дата изменения: Unknown Дата индексирования: Sun Apr 10 17:49:17 2016 Кодировка: Поисковые слова: asteroid |
Check tp search en for explanation of fts technique implemented in this project.
Database: PostgreSQL 8.0x (also, tested with PostgreSQL 8.1dev) Default tsearch2 configuration is 'tps'.
Example of right configuration: (data dir - /home/olegb/tmp/data,tsearch2 installed into /home/olegb/tmp/lib/)
initdb --locale=en_US /home/olegb/tmp/data
(assume, postmaster started with en_US locale)
pg_ctl -D /home/olegb/tmp/data -o "-p 5433" start
createdb -U olegb -p 5433 --encoding=SQL_ASCII tp
sed -e 's|$libdir|/home/olegb/tmp/lib/|g' < tsearch2.sql | psql -p 5433 -U oleg
psql -p 5433 -U olegb tp < ~/app/dump/tables/sql/tscfg-tps.sql
tp=# select * from pg_ts_cfg;
ts_name | prs_name | locale
-----------------+----------+--------------
default | default | C
default_russian | default | ru_RU.KOI8-R
simple | default |
tps | default | en_US
(4 rows)
tp=# select to_tsquery('paris');
to_tsquery
------------
'paris'
(1 row)
Addtional modules:
We use two different configurations of fts for better flexibility (different sets of stop words).
Installation:
Please, check configuration files (tscfg-tps.cfg,tscfg-tpsh.cfg) for location of dictionary and stop-words files. There are 3 example files provided:
paris paris united united states states orlean orleans las las los los
Capital letters (A,B,C,D) denotes lexem class (by importance). D is a default class.
Following indices should exist:
-- check indices below !!! -- create index admin_searchalias_placeid on admin.searchalias(place_id); -- create index placedetail_us_state_abbrev on placedetail(us_state_abbrev); -- create index usastate_state_abbrev on usastate(state_abbrev); -- create index name_idx on place(name);
PLACE SEARCH:
place.fts_index = place.name_A ||
usa.state_name_B || usa.state_abbrev_B ||
country.name_C ||
mk_place_aliases(place.place_id)
UPDATE QUERY:
begin;
select place.*,
mk_tsvector_place(coalesce(place.name,''),coalesce(country.name,''),coalesce(usa.state_name,'')||' '|| coalesce(usa.state_abbrev,'')) ||
to_tsvector('tps',coalesce(mk_place_aliases(place.place_id),'')) as fts_index
into place_new
from place
inner join placedetail on
(place.place_id = placedetail.place_id)
left outer join country on
(placedetail.country_id = country.country_id)
LEFT OUTER JOIN usastate usa on
( placedetail.us_state_abbrev = usa.state_abbrev);
drop table place;
alter table place_new rename to place;
create index fts_index_idx on place using gist(fts_index);
HOTEL SEARCH:
property.fts_index = property.property_name_A ||
place.name_B ||
usa.state_name_B || usa.state_abbrev_B
country.name_C ||
mk_place_aliases(place.place_id)
UPDATE QUERY:
update property.property set fts_index =
CASE WHEN property.property.status_type_id is null OR
property.property.status_type_id > 0 then ''
ELSE
mk_tsvector_hotel(coalesce(place.name,''),
coalesce(country.name,''),
coalesce(usa.state_name,'')||' '||coalesce(usa.state_abbrev,''),
coalesce(property.property.property_name,'')
) || to_tsvector('tpsh',coalesce(mk_place_aliases(place.place_id),''))
END
from place
inner join placedetail on
(place.place_id = placedetail.place_id)
inner join country on
(placedetail.country_id = country.country_id)
inner JOIN usastate usa on
( placedetail.us_state_abbrev = usa.state_abbrev)
where place.place_id = property.property.place_id;
create index ftsp_index_idx on property.property using gist(fts_index);
LOG SEARCH:
vw_log.fts_notes = notes ||
caption_text_A ||
place.name_qualified_B
UPDATE QUERY:
alter table vw_log add column fti_notes tsvector;
update vw_log set fti_notes =
coalesce(to_tsvector('tps',notes),'')||' '||
coalesce( setweight( to_tsvector('tps',caption_text),'A'),'')||' '||
coalesce( setweight( to_tsvector('tps',p.name_qualified),'B'),'')
from place p
where p.place_id = vw_log.place_id;
create index vw_log_fti_notes_idx on vw_log using gist(fti_notes);
SUPPLEMENTARY TABLE 'names' used for query rewriting.
begin;
drop table names;
select foo.place_id, foo.cname as cname, to_tsquery('tps',replace(foo.txt,chr(176),chr(32))) as name_tsquery into names from (
select distinct search_term as cname, place_id, '\'\\\'' || replace(search_term,'\'','\\\'') || '\\\'\'' as txt
from admin.searchalias where admin.searchalias.search_term ~ ' '
union
select distinct name as cname, place_id, '\'\\\'' || replace(place.name,'\'','\\\'') || '\\\'\'' as txt
from place where name ~ ' ') as foo;
-- create index for names
create index cnames_idx on names using gist(name_tsquery gist_tp_tsquery_ops);
end;
-- use 'tps' tsearch2 configuration !
create or replace function mk_tsvector_place(text, text, text) returns tsvector
as $$
DECLARE
res tsvector;
place_name ALIAS FOR $1;
country_name ALIAS FOR $2;
state_name ALIAS FOR $3;
BEGIN
-- IF country_name = 'US' THEN
res = setweight(to_tsvector('tps',place_name),'A');
IF country_name = 'United States' THEN
res = res || setweight(to_tsvector('tps',state_name),'B');
END IF;
res = res || setweight(to_tsvector('tps',country_name),'C');
RETURN res;
END;
$$
LANGUAGE plpgsql;
---------------------------------------------------------------------------
-- returns aliases (no original name!) separated by '|'
create or replace function mk_place_aliases(integer) returns text
as $$
DECLARE
res text;
placeid ALIAS FOR $1;
rec record;
BEGIN
res = '';
FOR rec IN
SELECT replace(a.search_term,'\'','') AS place_alias
FROM place p, admin.searchalias a
WHERE a.place_id = p.place_id
and p.place_id = placeid
-- from place p left outer join admin.searchalias a on
-- ( p.place_id = a.place_id) where p.name = place_name
LOOP
IF res != '' THEN
res = res || '\|';
END IF;
res = res || '\'' || rec.place_alias || '\'';
END LOOP;
IF res = '' THEN
RETURN NULL;
END IF;
RETURN res;
END;
$$
LANGUAGE plpgsql;
---------------------------------------------------------------------------
create or replace function mk_tsvector_hotel(text, text, text,text) returns tsvector
as $$
DECLARE
res tsvector;
place_name ALIAS FOR $1;
country_name ALIAS FOR $2;
state_name ALIAS FOR $3;
property_name ALIAS FOR $4;
BEGIN
res = setweight(to_tsvector('tpsh',property_name),'A');
res = res || setweight(to_tsvector('tpsh',place_name),'B');
IF country_name = 'United States' THEN
res = res || setweight(to_tsvector('tpsh',state_name),'B');
END IF;
res = res || setweight(to_tsvector('tpsh',country_name),'C');
RETURN res;
END;
$$
LANGUAGE plpgsql;
NOTICE: logarithm slowdown search, use something another if performance is not satisfactory !!!
select p.population,p.name_qualified,
(rank('{1,0.2,0.4,1}',p.fts_index, query.r_query,0)
+ p.type_id/90.
+ log(coalesce(p.population,1000.0)+1)/7
)
as rank from place p,
(select tp_rewrite(ARRAY[query, n.name_tsquery]) as r_query from names n,
to_tsquery('new&orleans&hilton') as query where query @ n.name_tsquery) as query
where p.fts_index @@ query.r_query and p.type_id != 40
order by rank desc limit 10;
population | name_qualified | rank
------------+------------------------------------+-------------------
484670 | New Orleans, Louisiana | 1.62535215409395
5860 | Hilton, New York | 1.10576071576761
33860 | Hilton Head Island, South Carolina | 1.09939228000737
| Orleans Four Corners, New York | 0.977777779764599
| Orleans, New York | 0.977777779764599
| Hilton, New Jersey | 0.977777779764599
| Hilton, New Zealand | 0.911111117071576
| Hilton, Namibia | 0.844444454378552
| Hilton, Oklahoma | 0.844444454378552
| Hilton, Kansas | 0.844444454378552
(10 rows)
Time: 385.579 ms
We use query rewriting here and return results ordered by their full text rank and property rating
tp=# select place.name_qualified,pd.property_name,pr.rating_count,
(rank('{1,1,1,1}',pd.fts_index, query.r_query,0)
+ coalesce(pr.rating_count/500.0,0.0))
as rank from place, property.property pd, property.propertycounts pr,
(select tp_rewrite(ARRAY[query, n.name_tsquery]) as r_query from names n,
to_tsquery('tpsh','hilton&new&york') as query
where query @ n.name_tsquery) as query
where pd.fts_index @@ query.r_query
and pd.place_id = place.place_id
and pr.property_id = pd.property_id
order by rank desc limit 10;
name_qualified | property_name | rating_count | rank
------------------------+----------------------------------+--------------+------------------
Las Vegas, Nevada | New York-New York Hotel & Casino | 597 | 2.02733331346512
New York, New York | Hilton New York | 111 | 1.347
New York, New York | Hilton New York Times Square | 59 | 1.243
New York, New York | Millenium Hilton | 26 | 1.20014817905426
New York, New York | New York Marriott Marquis | 195 | 1.18166668653488
New York, New York | Hilton Waldorf Towers | 4 | 1.15614817905426
New York, New York | Hilton Waldorf=Astoria | 0 | 1.14814817905426
New Orleans, Louisiana | Hilton New Orleans Riverside | 151 | 1.052
Melville, New York | Hilton Huntington/Melville | 22 | 1.044
Rye Brook, New York | Hilton Rye Town | 16 | 1.032
(10 rows)
Time: 213.734 ms
tp=# select place.name_qualified,pd.property_name,pr.rating_count,
(rank('{1,1,1,1}',pd.fts_index,query,0)
+ coalesce(pr.rating_count/500.0,0.0))
as rank from place, property.property pd, property.propertycounts pr,
to_tsquery('tpsh','hilton&new&york') as query
where pd.fts_index @@ query
and pd.place_id = place.place_id
and pr.property_id = pd.property_id
order by rank desc limit 10;
name_qualified | property_name | rating_count | rank
----------------------------+------------------------------------+--------------+------------------
New York, New York | Hilton New York | 111 | 1.222
New York, New York | Hilton New York Times Square | 59 | 1.118
New York, New York | Millenium Hilton | 26 | 1.052
Melville, New York | Hilton Huntington/Melville | 22 | 1.04399642372131
Rye Brook, New York | Hilton Rye Town | 16 | 1.03188776445389
Lake Placid, New York | Hilton Lake Placid | 15 | 1.02988776445389
Tarrytown, New York | Hilton Tarrytown | 10 | 1.01998873472214
Liverpool, New York | Homewood Suites by Hilton Syracuse | 9 | 1.01798873472214
Saratoga Springs, New York | Hilton Garden Inn | 7 | 1.01396370077133
Pearl River, New York | Hilton Pearl River | 7 | 1.01388776445389
(10 rows)
Time: 57.542 ms
Rewrite query and order results by fts rank and updated_date
select foo.log_id, v.caption_text, headline(v.notes,foo.query),foo.rank
from vw_log v,
( select l.log_id,
query.r_query as query,
(rank('{0.1,1,0.5,1}', fti_notes,query.r_query,0) +
1 - log(8,(current_date-coalesce(updated_date,current_date-1)))/10)
as rank
from vw_log l,
( select
tp_rewrite(ARRAY[query, n.name_tsquery]) as r_query
from names n, to_tsquery('tps','new&york&hilton') as query
where query @ n.name_tsquery
) as query
where l.fti_notes @@ query.r_query
order by rank desc limit 10
) as foo
where foo.log_id = v.log_id;