-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.22 2007/10/22 03:37:04 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.23 2007/10/22 20:13:37 tgl Exp $ -->
<chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title>
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<para>
- <structname>word</> <type>text</> — the value of a lexeme
+ <replaceable>word</> <type>text</> — the value of a lexeme
</para>
</listitem>
<listitem>
<para>
- <structname>ndoc</> <type>integer</> — number of documents
+ <replaceable>ndoc</> <type>integer</> — number of documents
(<type>tsvector</>s) the word occurred in
</para>
</listitem>
<listitem>
<para>
- <structname>nentry</> <type>integer</> — total number of
+ <replaceable>nentry</> <type>integer</> — total number of
occurrences of the word
</para>
</listitem>
as the entire word and as each component:
<programlisting>
-SELECT "Alias", "Description", "Token" FROM ts_debug('foo-bar-beta1');
- Alias | Description | Token
+SELECT alias, description, token FROM ts_debug('foo-bar-beta1');
+ alias | description | token
-------------+-------------------------------+---------------
hword | Hyphenated word | foo-bar-beta1
lpart_hword | Latin part of hyphenated word | foo
instructive example:
<programlisting>
-SELECT "Alias", "Description", "Token" FROM ts_debug('http://foo.com/stuff/index.html');
- Alias | Description | Token
+SELECT alias, description, token FROM ts_debug('http://foo.com/stuff/index.html');
+ alias | description | token
----------+---------------+--------------------------
protocol | Protocol head | http://
url | URL | foo.com/stuff/index.html
synonym dictionary and put it before the <literal>english_stem</> dictionary:
<programlisting>
-SELECT * FROM ts_debug('english','Paris');
- Alias | Description | Token | Dictionaries | Lexized token
--------+-------------+-------+----------------+----------------------
- lword | Latin word | Paris | {english_stem} | english_stem: {pari}
-(1 row)
+SELECT * FROM ts_debug('english', 'Paris');
+ alias | description | token | dictionaries | dictionary | lexemes
+-------+-------------+-------+----------------+--------------+---------
+ lword | Latin word | Paris | {english_stem} | english_stem | {pari}
-CREATE TEXT SEARCH DICTIONARY synonym (
+CREATE TEXT SEARCH DICTIONARY my_synonym (
TEMPLATE = synonym,
SYNONYMS = my_synonyms
);
ALTER TEXT SEARCH CONFIGURATION english
- ALTER MAPPING FOR lword WITH synonym, english_stem;
+ ALTER MAPPING FOR lword WITH my_synonym, english_stem;
-SELECT * FROM ts_debug('english','Paris');
- Alias | Description | Token | Dictionaries | Lexized token
--------+-------------+-------+------------------------+------------------
- lword | Latin word | Paris | {synonym,english_stem} | synonym: {paris}
-(1 row)
+SELECT * FROM ts_debug('english', 'Paris');
+ alias | description | token | dictionaries | dictionary | lexemes
+-------+-------------+-------+---------------------------+------------+---------
+ lword | Latin word | Paris | {my_synonym,english_stem} | my_synonym | {paris}
</programlisting>
</para>
</indexterm>
<synopsis>
- ts_debug(<optional> <replaceable class="PARAMETER">config</replaceable> <type>regconfig</>, </optional> <replaceable class="PARAMETER">document</replaceable> <type>text</>) returns <type>setof ts_debug</>
+ ts_debug(<optional> <replaceable class="PARAMETER">config</replaceable> <type>regconfig</>, </optional> <replaceable class="PARAMETER">document</replaceable> <type>text</>,
+ OUT <replaceable class="PARAMETER">alias</> <type>text</>,
+ OUT <replaceable class="PARAMETER">description</> <type>text</>,
+ OUT <replaceable class="PARAMETER">token</> <type>text</>,
+ OUT <replaceable class="PARAMETER">dictionaries</> <type>regdictionary[]</>,
+ OUT <replaceable class="PARAMETER">dictionary</> <type>regdictionary</>,
+ OUT <replaceable class="PARAMETER">lexemes</> <type>text[]</>)
+ returns setof record
</synopsis>
<para>
</para>
<para>
- <function>ts_debug</>'s result row type is defined as:
+ <function>ts_debug</> returns one row for each token identified in the text
+ by the parser. The columns returned are:
-<programlisting>
-CREATE TYPE ts_debug AS (
- "Alias" text,
- "Description" text,
- "Token" text,
- "Dictionaries" regdictionary[],
- "Lexized token" text
-);
-</programlisting>
-
- One row is produced for each token identified by the parser.
- The first three columns describe the token, and the fourth lists
- the dictionaries selected by the configuration for that token's type.
- The last column shows the result of dictionary processing: which
- dictionary (if any) recognized the token, and what it produced.
+ <itemizedlist spacing="compact" mark="bullet">
+ <listitem>
+ <para>
+ <replaceable>alias</> <type>text</> — short name of the token type
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <replaceable>description</> <type>text</> — description of the
+ token type
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <replaceable>token</> <type>text</> — text of the token
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <replaceable>dictionaries</> <type>regdictionary[]</> — the
+ dictionaries selected by the configuration for this token type
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <replaceable>dictionary</> <type>regdictionary</> — the dictionary
+ that recognized the token, or <literal>NULL</> if none did
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <replaceable>lexemes</> <type>text[]</> — the lexeme(s) produced
+ by the dictionary that recognized the token, or <literal>NULL</> if
+ none did; an empty array (<literal>{}</>) means it was recognized as a
+ stop word
+ </para>
+ </listitem>
+ </itemizedlist>
</para>
<para>
<programlisting>
SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats');
- Alias | Description | Token | Dictionaries | Lexized token
--------+---------------+-------+--------------+----------------
- lword | Latin word | a | {english} | english: {}
- blank | Space symbols | | |
- lword | Latin word | fat | {english} | english: {fat}
- blank | Space symbols | | |
- lword | Latin word | cat | {english} | english: {cat}
- blank | Space symbols | | |
- lword | Latin word | sat | {english} | english: {sat}
- blank | Space symbols | | |
- lword | Latin word | on | {english} | english: {}
- blank | Space symbols | | |
- lword | Latin word | a | {english} | english: {}
- blank | Space symbols | | |
- lword | Latin word | mat | {english} | english: {mat}
- blank | Space symbols | | |
- blank | Space symbols | - | |
- lword | Latin word | it | {english} | english: {}
- blank | Space symbols | | |
- lword | Latin word | ate | {english} | english: {ate}
- blank | Space symbols | | |
- lword | Latin word | a | {english} | english: {}
- blank | Space symbols | | |
- lword | Latin word | fat | {english} | english: {fat}
- blank | Space symbols | | |
- lword | Latin word | rats | {english} | english: {rat}
- (24 rows)
+ alias | description | token | dictionaries | dictionary | lexemes
+-------+---------------+-------+----------------+--------------+---------
+ lword | Latin word | a | {english_stem} | english_stem | {}
+ blank | Space symbols | | {} | |
+ lword | Latin word | fat | {english_stem} | english_stem | {fat}
+ blank | Space symbols | | {} | |
+ lword | Latin word | cat | {english_stem} | english_stem | {cat}
+ blank | Space symbols | | {} | |
+ lword | Latin word | sat | {english_stem} | english_stem | {sat}
+ blank | Space symbols | | {} | |
+ lword | Latin word | on | {english_stem} | english_stem | {}
+ blank | Space symbols | | {} | |
+ lword | Latin word | a | {english_stem} | english_stem | {}
+ blank | Space symbols | | {} | |
+ lword | Latin word | mat | {english_stem} | english_stem | {mat}
+ blank | Space symbols | | {} | |
+ blank | Space symbols | - | {} | |
+ lword | Latin word | it | {english_stem} | english_stem | {}
+ blank | Space symbols | | {} | |
+ lword | Latin word | ate | {english_stem} | english_stem | {ate}
+ blank | Space symbols | | {} | |
+ lword | Latin word | a | {english_stem} | english_stem | {}
+ blank | Space symbols | | {} | |
+ lword | Latin word | fat | {english_stem} | english_stem | {fat}
+ blank | Space symbols | | {} | |
+ lword | Latin word | rats | {english_stem} | english_stem | {rat}
</programlisting>
</para>
<programlisting>
SELECT * FROM ts_debug('public.english','The Brightest supernovaes');
- Alias | Description | Token | Dictionaries | Lexized token
--------+---------------+-------------+-------------------------------------------------+-------------------------------------
- lword | Latin word | The | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {}
- blank | Space symbols | | |
- lword | Latin word | Brightest | {public.english_ispell,pg_catalog.english_stem} | public.english_ispell: {bright}
- blank | Space symbols | | |
- lword | Latin word | supernovaes | {public.english_ispell,pg_catalog.english_stem} | pg_catalog.english_stem: {supernova}
-(5 rows)
+ alias | description | token | dictionaries | dictionary | lexemes
+-------+---------------+-------------+-------------------------------+----------------+-------------
+ lword | Latin word | The | {english_ispell,english_stem} | english_ispell | {}
+ blank | Space symbols | | {} | |
+ lword | Latin word | Brightest | {english_ispell,english_stem} | english_ispell | {bright}
+ blank | Space symbols | | {} | |
+ lword | Latin word | supernovaes | {english_ispell,english_stem} | english_stem | {supernova}
</programlisting>
<para>
In this example, the word <literal>Brightest</> was recognized by the
parser as a <literal>Latin word</literal> (alias <literal>lword</literal>).
For this token type the dictionary list is
- <literal>public.english_ispell</> and
- <literal>pg_catalog.english_stem</literal>. The word was recognized by
- <literal>public.english_ispell</literal>, which reduced it to the noun
+ <literal>english_ispell</> and
+ <literal>english_stem</literal>. The word was recognized by
+ <literal>english_ispell</literal>, which reduced it to the base form
<literal>bright</literal>. The word <literal>supernovaes</literal> is
- unknown to the <literal>public.english_ispell</literal> dictionary so it
+ unknown to the <literal>english_ispell</literal> dictionary so it
was passed to the next dictionary, and, fortunately, was recognized (in
- fact, <literal>public.english_stem</literal> is a Snowball dictionary which
+ fact, <literal>english_stem</literal> is a Snowball dictionary which
recognizes everything; that is why it was placed at the end of the
dictionary list).
</para>
<para>
The word <literal>The</literal> was recognized by the
- <literal>public.english_ispell</literal> dictionary as a stop word (<xref
+ <literal>english_ispell</literal> dictionary as a stop word (<xref
linkend="textsearch-stopwords">) and will not be indexed.
The spaces are discarded too, since the configuration provides no
dictionaries at all for them.
you want to see:
<programlisting>
-SELECT "Alias", "Token", "Lexized token"
+SELECT alias, token, dictionary, lexemes
FROM ts_debug('public.english','The Brightest supernovaes');
- Alias | Token | Lexized token
--------+-------------+--------------------------------------
- lword | The | public.english_ispell: {}
- blank | |
- lword | Brightest | public.english_ispell: {bright}
- blank | |
- lword | supernovaes | pg_catalog.english_stem: {supernova}
-(5 rows)
+ alias | token | dictionary | lexemes
+-------+-------------+----------------+-------------
+ lword | The | english_ispell | {}
+ blank | | |
+ lword | Brightest | english_ispell | {bright}
+ blank | | |
+ lword | supernovaes | english_stem | {supernova}
</programlisting>
</para>
*
* Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
- * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.46 2007/09/25 20:03:37 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.47 2007/10/22 20:13:37 tgl Exp $
*/
CREATE VIEW pg_roles AS
pg_stat_get_buf_written_backend() AS buffers_backend,
pg_stat_get_buf_alloc() AS buffers_alloc;
--- Tsearch debug function. Defined here because it'd be pretty unwieldy
+-- Tsearch debug function. Defined here because it'd be pretty unwieldy
-- to put it into pg_proc.h
-CREATE TYPE ts_debug AS (
- "Alias" text,
- "Description" text,
- "Token" text,
- "Dictionaries" regdictionary[],
- "Lexized token" text
-);
-
-COMMENT ON TYPE ts_debug IS 'type returned from ts_debug() function';
-
-CREATE FUNCTION ts_debug(regconfig, text)
-RETURNS SETOF ts_debug AS
+CREATE FUNCTION ts_debug(IN config regconfig, IN document text,
+ OUT alias text,
+ OUT description text,
+ OUT token text,
+ OUT dictionaries regdictionary[],
+ OUT dictionary regdictionary,
+ OUT lexemes text[])
+RETURNS SETOF record AS
$$
SELECT
- tt.alias AS "Alias",
- tt.description AS "Description",
- parse.token AS "Token",
+ tt.alias AS alias,
+ tt.description AS description,
+ parse.token AS token,
ARRAY ( SELECT m.mapdict::pg_catalog.regdictionary
FROM pg_catalog.pg_ts_config_map AS m
WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid
ORDER BY m.mapseqno )
- AS "Dictionaries",
- (
- SELECT
- dl.mapdict::pg_catalog.regdictionary || ': ' || dl.lex::pg_catalog.text
- FROM
- ( SELECT mapdict, pg_catalog.ts_lexize(mapdict, parse.token) AS lex
- FROM pg_catalog.pg_ts_config_map AS m
- WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid
- ORDER BY pg_catalog.ts_lexize(mapdict, parse.token) IS NULL, m.mapseqno ) dl
- LIMIT 1
- ) AS "Lexized token"
+ AS dictionaries,
+ ( SELECT mapdict::pg_catalog.regdictionary
+ FROM pg_catalog.pg_ts_config_map AS m
+ WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid
+ ORDER BY pg_catalog.ts_lexize(mapdict, parse.token) IS NULL, m.mapseqno
+ LIMIT 1
+ ) AS dictionary,
+ ( SELECT pg_catalog.ts_lexize(mapdict, parse.token)
+ FROM pg_catalog.pg_ts_config_map AS m
+ WHERE m.mapcfg = $1 AND m.maptokentype = parse.tokid
+ ORDER BY pg_catalog.ts_lexize(mapdict, parse.token) IS NULL, m.mapseqno
+ LIMIT 1
+ ) AS lexemes
FROM pg_catalog.ts_parse(
(SELECT cfgparser FROM pg_catalog.pg_ts_config WHERE oid = $1 ), $2
) AS parse,
COMMENT ON FUNCTION ts_debug(regconfig,text) IS
'debug function for text search configuration';
-CREATE FUNCTION ts_debug(text)
-RETURNS SETOF ts_debug AS
+CREATE FUNCTION ts_debug(IN document text,
+ OUT alias text,
+ OUT description text,
+ OUT token text,
+ OUT dictionaries regdictionary[],
+ OUT dictionary regdictionary,
+ OUT lexemes text[])
+RETURNS SETOF record AS
$$
SELECT * FROM pg_catalog.ts_debug( pg_catalog.get_current_ts_config(), $1);
$$