cgi-bin/help-index.c

   1 /*
   2  * "$Id: help-index.c 6649 2007-07-11 21:46:42Z mike $"
   3  *
   4  *   On-line help index routines for the Common UNIX Printing System (CUPS).
   5  *
   6  *   Copyright 2007 by Apple Inc.
   7  *   Copyright 1997-2007 by Easy Software Products.
   8  *
   9  *   These coded instructions, statements, and computer programs are the
  10  *   property of Apple Inc. and are protected by Federal copyright
  11  *   law.  Distribution and use rights are outlined in the file "LICENSE.txt"
 *   which should have been included with this file.  If this file is
 *   missing or damaged, see the license at "http://www.cups.org/".
  14  *
  15  * Contents:
  16  *
  17  *   helpDeleteIndex()          - Delete an index, freeing all memory used.
  18  *   helpFindNode()             - Find a node in an index.
  19  *   helpLoadIndex()            - Load a help index from disk.
  20  *   helpSaveIndex()            - Save a help index to disk.
  21  *   helpSearchIndex()          - Search an index.
  22  *   help_add_word()            - Add a word to a node.
  23  *   help_compile_search()      - Convert a search string into a regular expression.
  24  *   help_delete_node()         - Free all memory used by a node.
  25  *   help_delete_word()         - Free all memory used by a word.
  26  *   help_load_directory()      - Load a directory of files into an index.
 *   help_load_file()           - Load an HTML file into an index.
  28  *   help_new_node()            - Create a new node and add it to an index.
 *   help_sort_by_name()        - Sort nodes by section, filename, and anchor.
 *   help_sort_by_score()       - Sort nodes by score and text.
  31  *   help_sort_words()          - Sort words alphabetically.
  32  */
  33
  34 /*
  35  * Include necessary headers...
  36  */
  37
  38 #include "cgi-private.h"
  39 #include <cups/dir.h>
  40
  41
  42 /*
  43  * List of common English words that should not be indexed...
  44  */
  45
  46 static char             help_common_words[][6] =
  47                         {
  48                           "about",
  49                           "all",
  50                           "an",
  51                           "and",
  52                           "are",
  53                           "as",
  54                           "at",
  55                           "be",
  56                           "been",
  57                           "but",
  58                           "by",
  59                           "call",
  60                           "can",
  61                           "come",
  62                           "could",
  63                           "day",
  64                           "did",
  65                           "do",
  66                           "down",
  67                           "each",
  68                           "find",
  69                           "first",
  70                           "for",
  71                           "from",
  72                           "go",
  73                           "had",
  74                           "has",
  75                           "have",
  76                           "he",
  77                           "her",
  78                           "him",
  79                           "his",
  80                           "hot",
  81                           "how",
  82                           "if",
  83                           "in",
  84                           "is",
  85                           "it",
  86                           "know",
  87                           "like",
  88                           "long",
  89                           "look",
  90                           "make",
  91                           "many",
  92                           "may",
  93                           "more",
  94                           "most",
  95                           "my",
  96                           "no",
  97                           "now",
  98                           "of",
  99                           "on",
 100                           "one",
 101                           "or",
 102                           "other",
 103                           "out",
 104                           "over",
 105                           "said",
 106                           "see",
 107                           "she",
 108                           "side",
 109                           "so",
 110                           "some",
 111                           "sound",
 112                           "than",
 113                           "that",
 114                           "the",
 115                           "their",
 116                           "them",
 117                           "then",
 118                           "there",
 119                           "these",
 120                           "they",
 121                           "thing",
 122                           "this",
 123                           "time",
 124                           "to",
 125                           "two",
 126                           "up",
 127                           "use",
 128                           "was",
 129                           "water",
 130                           "way",
 131                           "we",
 132                           "were",
 133                           "what",
 134                           "when",
 135                           "which",
 136                           "who",
 137                           "will",
 138                           "with",
 139                           "word",
 140                           "would",
 141                           "write",
 142                           "you",
 143                           "your"
 144                         };
 145
 146
 147 /*
 148  * Local functions...
 149  */
 150
 151 static help_word_t      *help_add_word(help_node_t *n, const char *text);
 152 static void             help_delete_node(help_node_t *n);
 153 static void             help_delete_word(help_word_t *w);
 154 static int              help_load_directory(help_index_t *hi,
 155                                             const char *directory,
 156                                             const char *relative);
 157 static int              help_load_file(help_index_t *hi,
 158                                        const char *filename,
 159                                        const char *relative,
 160                                        time_t     mtime);
 161 static help_node_t      *help_new_node(const char *filename, const char *anchor,
 162                                        const char *section, const char *text,
 163                                        time_t mtime, off_t offset,
 164                                        size_t length);
 165 static int              help_sort_by_name(help_node_t *p1, help_node_t *p2);
 166 static int              help_sort_by_score(help_node_t *p1, help_node_t *p2);
 167 static int              help_sort_words(help_word_t *w1, help_word_t *w2);
 168
 169
 170 /*
 171  * 'helpDeleteIndex()' - Delete an index, freeing all memory used.
 172  */
 173
 174 void
 175 helpDeleteIndex(help_index_t *hi)       /* I - Help index */
 176 {
 177   help_node_t   *node;                  /* Current node */
 178
 179
 180   DEBUG_printf(("helpDeleteIndex(hi=%p)\n", hi));
 181
 182   if (!hi)
 183     return;
 184
 185   for (node = (help_node_t *)cupsArrayFirst(hi->nodes);
 186        node;
 187        node = (help_node_t *)cupsArrayNext(hi->nodes))
 188   {
 189     if (!hi->search)
 190       help_delete_node(node);
 191   }
 192
 193   cupsArrayDelete(hi->nodes);
 194   cupsArrayDelete(hi->sorted);
 195
 196   free(hi);
 197 }
 198
 199
 200 /*
 201  * 'helpFindNode()' - Find a node in an index.
 202  */
 203
 204 help_node_t *                           /* O - Node pointer or NULL */
 205 helpFindNode(help_index_t *hi,          /* I - Index */
 206              const char   *filename,    /* I - Filename */
 207              const char   *anchor)      /* I - Anchor */
 208 {
 209   help_node_t   key;                    /* Search key */
 210
 211
 212   DEBUG_printf(("helpFindNode(hi=%p, filename=\"%s\", anchor=\"%s\")\n",
 213                 hi, filename ? filename : "(nil)", anchor ? anchor : "(nil)"));
 214
 215  /*
 216   * Range check input...
 217   */
 218
 219   if (!hi || !filename)
 220     return (NULL);
 221
 222  /*
 223   * Initialize the search key...
 224   */
 225
 226   key.filename = (char *)filename;
 227   key.anchor   = (char *)anchor;
 228
 229  /*
 230   * Return any match...
 231   */
 232
 233   return ((help_node_t *)cupsArrayFind(hi->nodes, &key));
 234 }
 235
 236
 237 /*
 238  * 'helpLoadIndex()' - Load a help index from disk.
 239  */
 240
 241 help_index_t *                          /* O - Index pointer or NULL */
 242 helpLoadIndex(const char *hifile,       /* I - Index filename */
 243               const char *directory)    /* I - Directory that is indexed */
 244 {
 245   help_index_t  *hi;                    /* Help index */
 246   cups_file_t   *fp;                    /* Current file */
 247   char          line[2048],             /* Line from file */
 248                 *ptr,                   /* Pointer into line */
 249                 *filename,              /* Filename in line */
 250                 *anchor,                /* Anchor in line */
 251                 *sectptr,               /* Section pointer in line */
 252                 section[1024],          /* Section name */
 253                 *text;                  /* Text in line */
 254   time_t        mtime;                  /* Modification time */
 255   off_t         offset;                 /* Offset into file */
 256   size_t        length;                 /* Length in bytes */
 257   int           update;                 /* Update? */
 258   help_node_t   *node;                  /* Current node */
 259   help_word_t   *word;                  /* Current word */
 260
 261
 262   DEBUG_printf(("helpLoadIndex(hifile=\"%s\", directory=\"%s\")\n",
 263                 hifile, directory));
 264
 265  /*
 266   * Create a new, empty index.
 267   */
 268
 269   if ((hi = (help_index_t *)calloc(1, sizeof(help_index_t))) == NULL)
 270     return (NULL);
 271
 272   hi->nodes  = cupsArrayNew((cups_array_func_t)help_sort_by_name, NULL);
 273   hi->sorted = cupsArrayNew((cups_array_func_t)help_sort_by_score, NULL);
 274
 275   if (!hi->nodes || !hi->sorted)
 276   {
 277     cupsArrayDelete(hi->nodes);
 278     cupsArrayDelete(hi->sorted);
 279     free(hi);
 280     return (NULL);
 281   }
 282
 283  /*
 284   * Try loading the existing index file...
 285   */
 286
 287   if ((fp = cupsFileOpen(hifile, "r")) != NULL)
 288   {
 289    /*
 290     * Lock the file and then read the first line...
 291     */
 292
 293     cupsFileLock(fp, 1);
 294
 295     if (cupsFileGets(fp, line, sizeof(line)) && !strcmp(line, "HELPV2"))
 296     {
 297      /*
 298       * Got a valid header line, now read the data lines...
 299       */
 300
 301       node = NULL;
 302
 303       while (cupsFileGets(fp, line, sizeof(line)))
 304       {
 305        /*
 306         * Each line looks like one of the following:
 307         *
 308         *     filename mtime offset length "section" "text"
 309         *     filename#anchor offset length "text"
 310         *     SP count word
 311         */
 312
 313         if (line[0] == ' ')
 314         {
 315          /*
 316           * Read a word in the current node...
 317           */
 318
 319           if (!node || (ptr = strrchr(line, ' ')) == NULL)
 320             continue;
 321
 322           if ((word = help_add_word(node, ptr + 1)) != NULL)
 323             word->count = atoi(line + 1);
 324         }
 325         else
 326         {
 327          /*
 328           * Add a node...
 329           */
 330
 331           filename = line;
 332
 333           if ((ptr = strchr(line, ' ')) == NULL)
 334             break;
 335
 336           while (isspace(*ptr & 255))
 337             *ptr++ = '\0';
 338
 339           if ((anchor = strrchr(filename, '#')) != NULL)
 340           {
 341             *anchor++ = '\0';
 342             mtime = 0;
 343           }
 344           else
 345             mtime = strtol(ptr, &ptr, 10);
 346
 347           offset = strtoll(ptr, &ptr, 10);
 348           length = strtoll(ptr, &ptr, 10);
 349
 350           while (isspace(*ptr & 255))
 351             ptr ++;
 352
 353           if (!anchor)
 354           {
 355            /*
 356             * Get section...
 357             */
 358
 359             if (*ptr != '\"')
 360               break;
 361
 362             ptr ++;
 363             sectptr = ptr;
 364
 365             while (*ptr && *ptr != '\"')
 366               ptr ++;
 367
 368             if (*ptr != '\"')
 369               break;
 370
 371             *ptr++ = '\0';
 372
 373             strlcpy(section, sectptr, sizeof(section));
 374
 375             while (isspace(*ptr & 255))
 376               ptr ++;
 377           }
 378
 379           if (*ptr != '\"')
 380             break;
 381
 382           ptr ++;
 383           text = ptr;
 384
 385           while (*ptr && *ptr != '\"')
 386             ptr ++;
 387
 388           if (*ptr != '\"')
 389             break;
 390
 391           *ptr++ = '\0';
 392
 393           if ((node = help_new_node(filename, anchor, section, text,
 394                                     mtime, offset, length)) == NULL)
 395             break;
 396
 397           node->score = -1;
 398
 399           cupsArrayAdd(hi->nodes, node);
 400         }
 401       }
 402     }
 403
 404     cupsFileClose(fp);
 405   }
 406
 407  /*
 408   * Scan for new/updated files...
 409   */
 410
 411   update = help_load_directory(hi, directory, NULL);
 412
 413  /*
 414   * Remove any files that are no longer installed...
 415   */
 416
 417   for (node = (help_node_t *)cupsArrayFirst(hi->nodes);
 418        node;
 419        node = (help_node_t *)cupsArrayNext(hi->nodes))
 420     if (node->score < 0)
 421     {
 422      /*
 423       * Delete this node...
 424       */
 425
 426       cupsArrayRemove(hi->nodes, node);
 427       help_delete_node(node);
 428     }
 429
 430  /*
 431   * Add nodes to the sorted array...
 432   */
 433
 434   for (node = (help_node_t *)cupsArrayFirst(hi->nodes);
 435        node;
 436        node = (help_node_t *)cupsArrayNext(hi->nodes))
 437     cupsArrayAdd(hi->sorted, node);
 438
 439  /*
 440   * Save the index if we updated it...
 441   */
 442
 443   if (update)
 444     helpSaveIndex(hi, hifile);
 445
 446  /*
 447   * Return the index...
 448   */
 449
 450   return (hi);
 451 }
 452
 453
 454 /*
 455  * 'helpSaveIndex()' - Save a help index to disk.
 456  */
 457
 458 int                                     /* O - 0 on success, -1 on error */
 459 helpSaveIndex(help_index_t *hi,         /* I - Index */
 460               const char   *hifile)     /* I - Index filename */
 461 {
 462   cups_file_t   *fp;                    /* Index file */
 463   help_node_t   *node;                  /* Current node */
 464   help_word_t   *word;                  /* Current word */
 465
 466
 467   DEBUG_printf(("helpSaveIndex(hi=%p, hifile=\"%s\")\n", hi, hifile));
 468
 469  /*
 470   * Try creating a new index file...
 471   */
 472
 473   if ((fp = cupsFileOpen(hifile, "w9")) == NULL)
 474     return (-1);
 475
 476  /*
 477   * Lock the file while we write it...
 478   */
 479
 480   cupsFileLock(fp, 1);
 481
 482   cupsFilePuts(fp, "HELPV2\n");
 483
 484   for (node = (help_node_t *)cupsArrayFirst(hi->nodes);
 485        node;
 486        node = (help_node_t *)cupsArrayNext(hi->nodes))
 487   {
 488    /*
 489     * Write the current node with/without the anchor...
 490     */
 491
 492     if (node->anchor)
 493     {
 494       if (cupsFilePrintf(fp, "%s#%s " CUPS_LLFMT " " CUPS_LLFMT " \"%s\"\n",
 495                          node->filename, node->anchor,
 496                          CUPS_LLCAST node->offset, CUPS_LLCAST node->length,
 497                          node->text) < 0)
 498         break;
 499     }
 500     else
 501     {
 502       if (cupsFilePrintf(fp, "%s %d " CUPS_LLFMT " " CUPS_LLFMT " \"%s\" \"%s\"\n",
 503                          node->filename, node->mtime,
 504                          CUPS_LLCAST node->offset, CUPS_LLCAST node->length,
 505                          node->section ? node->section : "", node->text) < 0)
 506         break;
 507     }
 508
 509    /*
 510     * Then write the words associated with the node...
 511     */
 512
 513     for (word = (help_word_t *)cupsArrayFirst(node->words);
 514          word;
 515          word = (help_word_t *)cupsArrayNext(node->words))
 516       if (cupsFilePrintf(fp, " %d %s\n", word->count, word->text) < 0)
 517         break;
 518   }
 519
 520   cupsFileFlush(fp);
 521
 522   if (cupsFileClose(fp) < 0)
 523     return (-1);
 524   else if (node)
 525     return (-1);
 526   else
 527     return (0);
 528 }
 529
 530
 531 /*
 532  * 'helpSearchIndex()' - Search an index.
 533  */
 534
 535 help_index_t *                          /* O - Search index */
 536 helpSearchIndex(help_index_t *hi,       /* I - Index */
 537                 const char   *query,    /* I - Query string */
 538                 const char   *section,  /* I - Limit search to this section */
 539                 const char   *filename) /* I - Limit search to this file */
 540 {
 541   help_index_t  *search;                /* Search index */
 542   help_node_t   *node;                  /* Current node */
 543   help_word_t   *word;                  /* Current word */
 544   void          *sc;                    /* Search context */
 545   int           matches;                /* Number of matches */
 546
 547
 548   DEBUG_printf(("helpSearchIndex(hi=%p, query=\"%s\", filename=\"%s\")\n",
 549                 hi, query ? query : "(nil)",
 550                 filename ? filename : "(nil)"));
 551
 552  /*
 553   * Range check...
 554   */
 555
 556   if (!hi || !query)
 557     return (NULL);
 558
 559  /*
 560   * Reset the scores of all nodes to 0...
 561   */
 562
 563   for (node = (help_node_t *)cupsArrayFirst(hi->nodes);
 564        node;
 565        node = (help_node_t *)cupsArrayNext(hi->nodes))
 566     node->score = 0;
 567
 568  /*
 569   * Find the first node to search in...
 570   */
 571
 572   if (filename)
 573   {
 574     node = helpFindNode(hi, filename, NULL);
 575     if (!node)
 576       return (NULL);
 577   }
 578   else
 579     node = (help_node_t *)cupsArrayFirst(hi->nodes);
 580
 581  /*
 582   * Convert the query into a regular expression...
 583   */
 584
 585   sc = cgiCompileSearch(query);
 586   if (!sc)
 587     return (NULL);
 588
 589  /*
 590   * Allocate a search index...
 591   */
 592
 593   search = calloc(1, sizeof(help_index_t));
 594   if (!search)
 595   {
 596     cgiFreeSearch(sc);
 597     return (NULL);
 598   }
 599
 600   search->nodes  = cupsArrayNew((cups_array_func_t)help_sort_by_name, NULL);
 601   search->sorted = cupsArrayNew((cups_array_func_t)help_sort_by_score, NULL);
 602
 603   if (!search->nodes || !search->sorted)
 604   {
 605     cupsArrayDelete(search->nodes);
 606     cupsArrayDelete(search->sorted);
 607     free(search);
 608     cgiFreeSearch(sc);
 609     return (NULL);
 610   }
 611
 612   search->search = 1;
 613
 614  /*
 615   * Check each node in the index, adding matching nodes to the
 616   * search index...
 617   */
 618
 619   for (; node; node = (help_node_t *)cupsArrayNext(hi->nodes))
 620     if (section && strcmp(node->section, section))
 621       continue;
 622     else if (filename && strcmp(node->filename, filename))
 623       continue;
 624     else
 625     {
 626       matches = cgiDoSearch(sc, node->text);
 627
 628       for (word = (help_word_t *)cupsArrayFirst(node->words);
 629            word;
 630            word = (help_word_t *)cupsArrayNext(node->words))
 631         if (cgiDoSearch(sc, word->text) > 0)
 632           matches += word->count;
 633
 634       if (matches > 0)
 635       {
 636        /*
 637         * Found a match, add the node to the search index...
 638         */
 639
 640         node->score = matches;
 641
 642         cupsArrayAdd(search->nodes, node);
 643         cupsArrayAdd(search->sorted, node);
 644       }
 645     }
 646
 647  /*
 648   * Free the search context...
 649   */
 650
 651   cgiFreeSearch(sc);
 652
 653  /*
 654   * Return the results...
 655   */
 656
 657   return (search);
 658 }
 659
 660
 661 /*
 662  * 'help_add_word()' - Add a word to a node.
 663  */
 664
 665 static help_word_t *                    /* O - New word */
 666 help_add_word(help_node_t *n,           /* I - Node */
 667               const char  *text)        /* I - Word text */
 668 {
 669   help_word_t   *w,                     /* New word */
 670                 key;                    /* Search key */
 671
 672
 673   DEBUG_printf(("help_add_word(n=%p, text=\"%s\")\n", n, text));
 674
 675  /*
 676   * Create the words array as needed...
 677   */
 678
 679   if (!n->words)
 680     n->words = cupsArrayNew((cups_array_func_t)help_sort_words, NULL);
 681
 682  /*
 683   * See if the word is already added...
 684   */
 685
 686   key.text = (char *)text;
 687
 688   if ((w = (help_word_t *)cupsArrayFind(n->words, &key)) == NULL)
 689   {
 690    /*
 691     * Create a new word...
 692     */
 693
 694     if ((w = calloc(1, sizeof(help_word_t))) == NULL)
 695       return (NULL);
 696
 697     if ((w->text = strdup(text)) == NULL)
 698     {
 699       free(w);
 700       return (NULL);
 701     }
 702
 703     cupsArrayAdd(n->words, w);
 704   }
 705
 706  /*
 707   * Bump the counter for this word and return it...
 708   */
 709
 710   w->count ++;
 711
 712   return (w);
 713 }
 714
 715
 716 /*
 717  * 'help_delete_node()' - Free all memory used by a node.
 718  */
 719
 720 static void
 721 help_delete_node(help_node_t *n)        /* I - Node */
 722 {
 723   help_word_t   *w;                     /* Current word */
 724
 725
 726   DEBUG_printf(("help_delete_node(n=%p)\n", n));
 727
 728   if (!n)
 729     return;
 730
 731   if (n->filename)
 732     free(n->filename);
 733
 734   if (n->anchor)
 735     free(n->anchor);
 736
 737   if (n->section)
 738     free(n->section);
 739
 740   if (n->text)
 741     free(n->text);
 742
 743   for (w = (help_word_t *)cupsArrayFirst(n->words);
 744        w;
 745        w = (help_word_t *)cupsArrayNext(n->words))
 746     help_delete_word(w);
 747
 748   cupsArrayDelete(n->words);
 749
 750   free(n);
 751 }
 752
 753
 754 /*
 755  * 'help_delete_word()' - Free all memory used by a word.
 756  */
 757
 758 static void
 759 help_delete_word(help_word_t *w)        /* I - Word */
 760 {
 761   DEBUG_printf(("help_delete_word(w=%p)\n", w));
 762
 763   if (!w)
 764     return;
 765
 766   if (w->text)
 767     free(w->text);
 768
 769   free(w);
 770 }
 771
 772
 773 /*
 774  * 'help_load_directory()' - Load a directory of files into an index.
 775  */
 776
 777 static int                              /* O - 0 = success, -1 = error, 1 = updated */
 778 help_load_directory(
 779     help_index_t *hi,                   /* I - Index */
 780     const char   *directory,            /* I - Directory */
 781     const char   *relative)             /* I - Relative path */
 782 {
 783   cups_dir_t    *dir;                   /* Directory file */
 784   cups_dentry_t *dent;                  /* Directory entry */
 785   char          *ext,                   /* Pointer to extension */
 786                 filename[1024],         /* Full filename */
 787                 relname[1024];          /* Relative filename */
 788   int           update;                 /* Updated? */
 789   help_node_t   *node;                  /* Current node */
 790
 791
 792   DEBUG_printf(("help_load_directory(hi=%p, directory=\"%s\", relative=\"%s\")\n",
 793                 hi, directory ? directory : "(nil)", relative ? relative : "(nil)"));
 794
 795  /*
 796   * Open the directory and scan it...
 797   */
 798
 799   if ((dir = cupsDirOpen(directory)) == NULL)
 800     return (0);
 801
 802   update = 0;
 803
 804   while ((dent = cupsDirRead(dir)) != NULL)
 805   {
 806    /*
 807     * Skip "." files...
 808     */
 809
 810     if (dent->filename[0] == '.')
 811       continue;
 812
 813    /*
 814     * Get absolute and relative filenames...
 815     */
 816
 817     snprintf(filename, sizeof(filename), "%s/%s", directory, dent->filename);
 818     if (relative)
 819       snprintf(relname, sizeof(relname), "%s/%s", relative, dent->filename);
 820     else
 821       strlcpy(relname, dent->filename, sizeof(relname));
 822
 823    /*
 824     * Check if we have a HTML file...
 825     */
 826
 827     if ((ext = strstr(dent->filename, ".html")) != NULL &&
 828         (!ext[5] || !strcmp(ext + 5, ".gz")))
 829     {
 830      /*
 831       * HTML file, see if we have already indexed the file...
 832       */
 833
 834       if ((node = helpFindNode(hi, relname, NULL)) != NULL)
 835       {
 836        /*
 837         * File already indexed - check dates to confirm that the
 838         * index is up-to-date...
 839         */
 840
 841         if (node->mtime == dent->fileinfo.st_mtime)
 842         {
 843          /*
 844           * Same modification time, so mark all of the nodes
 845           * for this file as up-to-date...
 846           */
 847
 848           for (; node; node = (help_node_t *)cupsArrayNext(hi->nodes))
 849             if (!strcmp(node->filename, relname))
 850               node->score = 0;
 851             else
 852               break;
 853
 854           continue;
 855         }
 856       }
 857
 858       update = 1;
 859
 860       help_load_file(hi, filename, relname, dent->fileinfo.st_mtime);
 861     }
 862     else if (S_ISDIR(dent->fileinfo.st_mode))
 863     {
 864      /*
 865       * Process sub-directory...
 866       */
 867
 868       if (help_load_directory(hi, filename, relname) == 1)
 869         update = 1;
 870     }
 871   }
 872
 873   cupsDirClose(dir);
 874
 875   return (update);
 876 }
 877
 878
 879 /*
 * 'help_load_file()' - Load an HTML file into an index.
 881  */
 882
 883 static int                              /* O - 0 = success, -1 = error */
 884 help_load_file(
 885     help_index_t *hi,                   /* I - Index */
 886     const char   *filename,             /* I - Filename */
 887     const char   *relative,             /* I - Relative path */
 888     time_t       mtime)                 /* I - Modification time */
 889 {
 890   cups_file_t   *fp;                    /* HTML file */
 891   help_node_t   *node;                  /* Current node */
 892   char          line[1024],             /* Line from file */
 893                 temp[1024],             /* Temporary word */
 894                 section[1024],          /* Section */
 895                 *ptr,                   /* Pointer into line */
 896                 *anchor,                /* Anchor name */
 897                 *text;                  /* Text for anchor */
 898   off_t         offset;                 /* File offset */
 899   char          quote;                  /* Quote character */
 900   help_word_t   *word;                  /* Current word */
 901   int           wordlen;                /* Length of word */
 902
 903
 904   DEBUG_printf(("help_load_file(hi=%p, filename=\"%s\", relative=\"%s\", mtime=%ld)\n",
 905                 hi, filename ? filename : "(nil)",
 906                 relative ? relative : "(nil)", mtime));
 907
 908   if ((fp = cupsFileOpen(filename, "r")) == NULL)
 909     return (-1);
 910
 911   node   = NULL;
 912   offset = 0;
 913
 914   strcpy(section, "Other");
 915
 916   while (cupsFileGets(fp, line, sizeof(line)))
 917   {
 918    /*
 919     * Look for "<TITLE>", "<A NAME", or "<!-- SECTION:" prefix...
 920     */
 921
 922     if (!strncasecmp(line, "<!-- SECTION:", 13))
 923     {
 924      /*
 925       * Got section line, copy it!
 926       */
 927
 928       for (ptr = line + 13; isspace(*ptr & 255); ptr ++);
 929
 930       strlcpy(section, ptr, sizeof(section));
 931       if ((ptr = strstr(section, "-->")) != NULL)
 932       {
 933        /*
 934         * Strip comment stuff from end of line...
 935         */
 936
 937         for (*ptr-- = '\0'; ptr > line && isspace(*ptr & 255); *ptr-- = '\0');
 938
 939         if (isspace(*ptr & 255))
 940           *ptr = '\0';
 941       }
 942       continue;
 943     }
 944
 945     for (ptr = line; (ptr = strchr(ptr, '<')) != NULL;)
 946     {
 947       ptr ++;
 948
 949       if (!strncasecmp(ptr, "TITLE>", 6))
 950       {
 951        /*
 952         * Found the title...
 953         */
 954
 955         anchor = NULL;
 956         ptr += 6;
 957       }
 958       else if (!strncasecmp(ptr, "A NAME=", 7))
 959       {
 960        /*
 961         * Found an anchor...
 962         */
 963
 964         ptr += 7;
 965
 966         if (*ptr == '\"' || *ptr == '\'')
 967         {
 968          /*
 969           * Get quoted anchor...
 970           */
 971
 972           quote  = *ptr;
 973           anchor = ptr + 1;
 974           if ((ptr = strchr(anchor, quote)) != NULL)
 975             *ptr++ = '\0';
 976           else
 977             break;
 978         }
 979         else
 980         {
 981          /*
 982           * Get unquoted anchor...
 983           */
 984
 985           anchor = ptr + 1;
 986
 987           for (ptr = anchor; *ptr && *ptr != '>' && !isspace(*ptr & 255); ptr ++);
 988
 989           if (*ptr)
 990             *ptr++ = '\0';
 991           else
 992             break;
 993         }
 994
 995        /*
 996         * Got the anchor, now lets find the end...
 997         */
 998
 999         while (*ptr && *ptr != '>')
1000           ptr ++;
1001
1002         if (*ptr != '>')
1003           break;
1004
1005         ptr ++;
1006       }
1007       else
1008         continue;
1009
1010      /*
1011       * Now collect text for the link...
1012       */
1013
1014       text = ptr;
1015       while ((ptr = strchr(text, '<')) == NULL)
1016       {
1017         ptr = text + strlen(text);
1018         if (ptr >= (line + sizeof(line) - 2))
1019           break;
1020
1021         *ptr++ = ' ';
1022
1023         if (!cupsFileGets(fp, ptr, sizeof(line) - (ptr - line) - 1))
1024           break;
1025       }
1026
1027       *ptr = '\0';
1028
1029       if (node)
1030         node->length = offset - node->offset;
1031
1032       if (!*text)
1033       {
1034         node = NULL;
1035         break;
1036       }
1037
1038       if ((node = helpFindNode(hi, relative, anchor)) != NULL)
1039       {
1040        /*
1041         * Node already in the index, so replace the text and other
1042         * data...
1043         */
1044
1045         cupsArrayRemove(hi->nodes, node);
1046
1047         if (node->section)
1048           free(node->section);
1049
1050         if (node->text)
1051           free(node->text);
1052
1053         if (node->words)
1054         {
1055           for (word = (help_word_t *)cupsArrayFirst(node->words);
1056                word;
1057                word = (help_word_t *)cupsArrayNext(node->words))
1058             help_delete_word(word);
1059
1060           cupsArrayDelete(node->words);
1061           node->words = NULL;
1062         }
1063
1064         node->section = section[0] ? strdup(section) : NULL;
1065         node->text    = strdup(text);
1066         node->mtime   = mtime;
1067         node->offset  = offset;
1068         node->score   = 0;
1069       }
1070       else
1071       {
1072        /*
1073         * New node...
1074         */
1075
1076         node = help_new_node(relative, anchor, section, text, mtime, offset, 0);
1077       }
1078
1079      /*
1080       * Go through the text value and replace tabs and newlines with
1081       * whitespace and eliminate extra whitespace...
1082       */
1083
1084       for (ptr = node->text, text = node->text; *ptr;)
1085         if (isspace(*ptr & 255))
1086         {
1087           while (isspace(*ptr & 255))
1088             ptr ++;
1089
1090           *text++ = ' ';
1091         }
1092         else if (text != ptr)
1093           *text++ = *ptr++;
1094         else
1095         {
1096           text ++;
1097           ptr ++;
1098         }
1099
1100       *text = '\0';
1101
1102      /*
1103       * (Re)add the node to the array...
1104       */
1105
1106       cupsArrayAdd(hi->nodes, node);
1107
1108       if (!anchor)
1109         node = NULL;
1110       break;
1111     }
1112
1113     if (node)
1114     {
1115      /*
1116       * Scan this line for words...
1117       */
1118
1119       for (ptr = line; *ptr; ptr ++)
1120       {
1121        /*
1122         * Skip HTML stuff...
1123         */
1124
1125         if (*ptr == '<')
1126         {
1127           if (!strncmp(ptr, "<!--", 4))
1128           {
1129            /*
1130             * Skip HTML comment...
1131             */
1132
1133             if ((text = strstr(ptr + 4, "-->")) == NULL)
1134               ptr += strlen(ptr) - 1;
1135             else
1136               ptr = text + 2;
1137           }
1138           else
1139           {
1140            /*
1141             * Skip HTML element...
1142             */
1143
1144             for (ptr ++; *ptr && *ptr != '>'; ptr ++)
1145             {
1146               if (*ptr == '\"' || *ptr == '\'')
1147               {
1148                 for (quote = *ptr++; *ptr && *ptr != quote; ptr ++);
1149
1150                 if (!*ptr)
1151                   ptr --;
1152               }
1153             }
1154
1155             if (!*ptr)
1156               ptr --;
1157           }
1158
1159           continue;
1160         }
1161         else if (*ptr == '&')
1162         {
1163          /*
1164           * Skip HTML entity...
1165           */
1166
1167           for (ptr ++; *ptr && *ptr != ';'; ptr ++);
1168
1169           if (!*ptr)
1170             ptr --;
1171
1172           continue;
1173         }
1174         else if (!isalnum(*ptr & 255))
1175           continue;
1176
1177        /*
1178         * Found the start of a word, search until we find the end...
1179         */
1180
1181         for (text = ptr, ptr ++; *ptr && isalnum(*ptr & 255); ptr ++);
1182
1183         wordlen = ptr - text;
1184
1185         memcpy(temp, text, wordlen);
1186         temp[wordlen] = '\0';
1187
1188         ptr --;
1189
1190         if (wordlen > 1 && !bsearch(temp, help_common_words,
1191                                     (sizeof(help_common_words) /
1192                                      sizeof(help_common_words[0])),
1193                                     sizeof(help_common_words[0]),
1194                                     (int (*)(const void *, const void *))
1195                                         strcasecmp))
1196           help_add_word(node, temp);
1197       }
1198     }
1199
1200    /*
1201     * Get the offset of the next line...
1202     */
1203
1204     offset = cupsFileTell(fp);
1205   }
1206
1207   cupsFileClose(fp);
1208
1209   if (node)
1210     node->length = offset - node->offset;
1211
1212   return (0);
1213 }
1214
1215
1216 /*
1217  * 'help_new_node()' - Create a new node and add it to an index.
1218  */
1219
1220 static help_node_t *                    /* O - Node pointer or NULL on error */
1221 help_new_node(const char   *filename,   /* I - Filename */
1222               const char   *anchor,     /* I - Anchor */
1223               const char   *section,    /* I - Section */
1224               const char   *text,       /* I - Text */
1225               time_t       mtime,       /* I - Modification time */
1226               off_t        offset,      /* I - Offset in file */
1227               size_t       length)      /* I - Length in bytes */
1228 {
1229   help_node_t   *n;                     /* Node */
1230
1231
1232   DEBUG_printf(("help_new_node(filename=\"%s\", anchor=\"%s\", text=\"%s\", "
1233                 "mtime=%ld, offset=%ld, length=%ld)\n",
1234                 filename ? filename : "(nil)", anchor ? anchor : "(nil)",
1235                 text ? text : "(nil)", (long)mtime, (long)offset,
1236                 (long)length));
1237
1238   n = (help_node_t *)calloc(1, sizeof(help_node_t));
1239   if (!n)
1240     return (NULL);
1241
1242   n->filename = strdup(filename);
1243   n->anchor   = anchor ? strdup(anchor) : NULL;
1244   n->section  = (section && *section) ? strdup(section) : NULL;
1245   n->text     = strdup(text);
1246   n->mtime    = mtime;
1247   n->offset   = offset;
1248   n->length   = length;
1249
1250   return (n);
1251 }
1252
1253
1254 /*
1255  * 'help_sort_nodes_by_name()' - Sort nodes by section, filename, and anchor.
1256  */
1257
1258 static int                              /* O - Difference */
1259 help_sort_by_name(help_node_t *n1,      /* I - First node */
1260                   help_node_t *n2)      /* I - Second node */
1261 {
1262   int           diff;                   /* Difference */
1263
1264
1265   DEBUG_printf(("help_sort_by_name(n1=%p(%s#%s), n2=%p(%s#%s)\n",
1266                 n1, n1->filename, n1->anchor ? n1->anchor : "",
1267                 n2, n2->filename, n2->anchor ? n2->anchor : ""));
1268
1269   if ((diff = strcmp(n1->filename, n2->filename)) != 0)
1270     return (diff);
1271
1272   if (!n1->anchor && !n2->anchor)
1273     return (0);
1274   else if (!n1->anchor)
1275     return (-1);
1276   else if (!n2->anchor)
1277     return (1);
1278   else
1279     return (strcmp(n1->anchor, n2->anchor));
1280 }
1281
1282
1283 /*
1284  * 'help_sort_nodes_by_score()' - Sort nodes by score and text.
1285  */
1286
1287 static int                              /* O - Difference */
1288 help_sort_by_score(help_node_t *n1,     /* I - First node */
1289                    help_node_t *n2)     /* I - Second node */
1290 {
1291   int           diff;                   /* Difference */
1292
1293
1294   DEBUG_printf(("help_sort_by_score(n1=%p(%d \"%s\" \"%s\"), "
1295                 "n2=%p(%d \"%s\" \"%s\")\n",
1296                 n1, n1->score, n1->section ? n1->section : "", n1->text,
1297                 n2, n2->score, n2->section ? n2->section : "", n2->text));
1298
1299   if (n1->score != n2->score)
1300     return (n1->score - n2->score);
1301
1302   if (n1->section && !n2->section)
1303     return (1);
1304   else if (!n1->section && n2->section)
1305     return (-1);
1306   else if (n1->section && n2->section &&
1307            (diff = strcmp(n1->section, n2->section)) != 0)
1308     return (diff);
1309
1310   return (strcasecmp(n1->text, n2->text));
1311 }
1312
1313
1314 /*
1315  * 'help_sort_words()' - Sort words alphabetically.
1316  */
1317
1318 static int                              /* O - Difference */
1319 help_sort_words(help_word_t *w1,        /* I - Second word */
1320                 help_word_t *w2)        /* I - Second word */
1321 {
1322   DEBUG_printf(("help_sort_words(w1=%p(\"%s\"), w2=%p(\"%s\"))\n",
1323                 w1, w1->text, w2, w2->text));
1324
1325   return (strcasecmp(w1->text, w2->text));
1326 }
1327
1328
1329 /*
1330  * End of "$Id: help-index.c 6649 2007-07-11 21:46:42Z mike $".
1331  */