Fossil

Check-in [9f67861a]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improvements to the ranking function. Add the undocumented "debug" query parameter to /search.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | search-enhancements
Files: files | file ages | folders
SHA1:9f67861aed7d59fbfd53836d140f22799010dbec
User & Date: drh 2015-02-14 02:12:12
Context
2015-02-14
15:17
Enhance /search to distinguish between the title and the body of a document and provide support for the Porter stemmer for indexed search. Improved scoring and snippet presentation. NB: Run "fossil fts-config reindex" when upgrading through this change. check-in: 0f96ffb9 user: drh tags: trunk
02:12
Improvements to the ranking function. Add the undocumented "debug" query parameter to /search. Closed-Leaf check-in: 9f67861a user: drh tags: search-enhancements
00:37
Enabled indexed search with separate title and body and with the option to use the Porter stemmer. check-in: 71295a98 user: drh tags: search-enhancements
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/search.c.

649
650
651
652
653
654
655
656
657
658
659

660
661
662
663
664
665
666
...
675
676
677
678
679
680
681
682
683
684
685

686
687
688
689
690
691
692
...
693
694
695
696
697
698
699
700
701
702
703

704
705
706
707
708
709
710
711
712
713
714
715
716

717
718
719
720
721
722
723









724
725
726
727
728
729
730
...
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748


749





750

751
752

753
754

755
756





757
758
759
760
761
762
763
...
788
789
790
791
792
793
794
795
796
797
798

799
800
801
802
803
804
805
...
880
881
882
883
884
885
886
887

888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916



917
918
919
920
921
922
923
924
...
942
943
944
945
946
947
948

949
950
951
952
953
954
955
...
989
990
991
992
993
994
995



996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
....
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473

1474
1475
1476
1477
1478
1479
1480
    char *zDocGlob = db_get("doc-glob","");
    char *zDocBr = db_get("doc-branch","trunk");
    if( zDocGlob && zDocGlob[0] && zDocBr && zDocBr[0] ){
      db_multi_exec(
        "CREATE VIRTUAL TABLE IF NOT EXISTS temp.foci USING files_of_checkin;"
      );
      db_multi_exec(
        "INSERT INTO x(label,url,score,date,snip)"
        "  SELECT printf('Document: %%s',title('d',blob.rid,foci.filename)),"
        "         printf('/doc/%T/%%s',foci.filename),"
        "         search_score(),"

        "         (SELECT datetime(event.mtime) FROM event"
        "            WHERE objid=symbolic_name_to_rid('trunk')),"
        "         search_snippet()"
        "    FROM foci CROSS JOIN blob"
        "   WHERE checkinID=symbolic_name_to_rid('trunk')"
        "     AND blob.uuid=foci.uuid"
        "     AND search_match(title('d',blob.rid,foci.filename),"
................................................................................
      "WITH wiki(name,rid,mtime) AS ("
      "  SELECT substr(tagname,6), tagxref.rid, max(tagxref.mtime)"
      "    FROM tag, tagxref"
      "   WHERE tag.tagname GLOB 'wiki-*'"
      "     AND tagxref.tagid=tag.tagid"
      "   GROUP BY 1"
      ")"
      "INSERT INTO x(label,url,score,date,snip)"
      "  SELECT printf('Wiki: %%s',name),"
      "         printf('/wiki?name=%%s',urlencode(name)),"
      "         search_score(),"

      "         datetime(mtime),"
      "         search_snippet()"
      "    FROM wiki"
      "   WHERE search_match(title('w',rid,name),body('w',rid,name));"
    );
  }
  if( (srchFlags & SRCH_CKIN)!=0 ){
................................................................................
    db_multi_exec(
      "WITH ckin(uuid,rid,mtime) AS ("
      "  SELECT blob.uuid, event.objid, event.mtime"
      "    FROM event, blob"
      "   WHERE event.type='ci'"
      "     AND blob.rid=event.objid"
      ")"
      "INSERT INTO x(label,url,score,date,snip)"
      "  SELECT printf('Check-in [%%.10s] on %%s',uuid,datetime(mtime)),"
      "         printf('/timeline?c=%%s&n=8&y=ci',uuid),"
      "         search_score(),"

      "         datetime(mtime),"
      "         search_snippet()"
      "    FROM ckin"
      "   WHERE search_match('',body('c',rid,NULL));"
    );
  }
  if( (srchFlags & SRCH_TKT)!=0 ){
    db_multi_exec(
      "INSERT INTO x(label,url,score, date,snip)"
      "  SELECT printf('Ticket: %%s (%%s)',title('t',tkt_id,NULL),"
                      "datetime(tkt_mtime)),"
      "         printf('/tktview/%%.20s',tkt_uuid),"
      "         search_score(),"

      "         datetime(tkt_mtime),"
      "         search_snippet()"
      "    FROM ticket"
      "   WHERE search_match(title('t',tkt_id,NULL),body('t',tkt_id,NULL));"
    );
  }
}










/*
** Implementation of the rank() function used with rank(matchinfo(*,'pcsx')).
*/
static void search_rank_sqlfunc(
  sqlite3_context *context,
  int argc,
................................................................................
  sqlite3_value **argv
){
  const unsigned *aVal = (unsigned int*)sqlite3_value_blob(argv[0]);
  int nVal = sqlite3_value_bytes(argv[0])/4;
  int nCol;           /* Number of columns in the index */
  int nTerm;          /* Number of search terms in the query */
  int i, j;           /* Loop counter */
  double r = 1.0;     /* Score */
  const unsigned *aX, *aS;

  if( nVal<2 ) return;
  nTerm = aVal[0];
  nCol = aVal[1];
  if( nVal<2+3*nCol*nTerm+4*nCol ) return;
  aS = aVal+2;
  aX = aS+nCol;
  for(j=0; j<nCol; j++){
    r *= 1<<((30*(aS[j]-1))/nTerm);


    for(i=0; i<nTerm; i++){





      int hits_this_row = aX[j + i*nCol];

      int hits_all_rows = aX[j + i*nCol + 1];
      int rows_with_hit = aX[j + i*nCol + 2];

      double avg_hits_per_row = (double)hits_all_rows/(double)rows_with_hit;
      r *= hits_this_row/avg_hits_per_row;

    }
    r *= 2.0;





  }
#define SEARCH_DEBUG_RANK 0
#if SEARCH_DEBUG_RANK
  {
    Blob x;
    blob_init(&x,0,0);
    blob_appendf(&x,"%08x", (int)r);
................................................................................
){
  Blob sql;
  if( srchFlags==0 ) return;
  sqlite3_create_function(g.db, "rank", 1, SQLITE_UTF8, 0,
     search_rank_sqlfunc, 0, 0);
  blob_init(&sql, 0, 0);
  blob_appendf(&sql,
    "INSERT INTO x(label,url,score,date,snip) "
    " SELECT ftsdocs.label,"
    "        ftsdocs.url,"
    "        rank(matchinfo(ftsidx,'pcsx')),"

    "        datetime(ftsdocs.mtime),"
    "        snippet(ftsidx,'<mark>','</mark>',' ... ',-1,35)"
    "   FROM ftsidx CROSS JOIN ftsdocs"
    "  WHERE ftsidx MATCH %Q"
    "    AND ftsdocs.rowid=ftsidx.docid",
    zPattern
  );
................................................................................
** Other web-pages can invoke this routine to add search results
** in the middle of the page.
**
** Return the number of rows.
*/
int search_run_and_output(
  const char *zPattern,       /* The query pattern */
  unsigned int srchFlags      /* What to search over */
){
  Stmt q;                     /* Walks the rows collected in temp table x */
  int nRow;                   /* Count of result rows written so far */

  /* Narrow the requested search classes; nothing left means no output. */
  srchFlags = search_restrict(srchFlags);
  if( srchFlags==0 ) return 0;

  search_sql_setup(g.db);
  add_content_sql_commands(g.db);

  /* All matches, whichever search strategy is used, land in table x. */
  db_multi_exec(
    "CREATE TEMP TABLE x(label,url,score,date,snip);"
  );
  if( search_index_exists() ){
    search_update_index(srchFlags);
    search_indexed(zPattern, srchFlags);
  }else{
    search_fullscan(zPattern, srchFlags);
  }

  /* Emit the matches as an ordered list, best score first and then
  ** most recent first.  The <ol> wrapper is only generated when at
  ** least one row exists. */
  db_prepare(&q, "SELECT url, snip, label"
                 "  FROM x"
                 " ORDER BY score DESC, date DESC;");
  for(nRow=0; db_step(&q)==SQLITE_ROW; nRow++){
    const char *zUrl = db_column_text(&q, 0);
    const char *zSnippet = db_column_text(&q, 1);
    const char *zLabel = db_column_text(&q, 2);
    if( nRow==0 ){
      @ <ol>
    }
    @ <li><p><a href='%R%s(zUrl)'>%h(zLabel)</a><br>
    @ <span class='snippet'>%z(cleanSnippet(zSnippet))</span></li>
  }
  db_finalize(&q);
  if( nRow>0 ){
    @ </ol>
  }
  return nRow;
}
................................................................................
*/
void search_screen(unsigned srchFlags, int useYparam){
  const char *zType = 0;
  const char *zClass = 0;
  const char *zDisable1;
  const char *zDisable2;
  const char *zPattern;

  srchFlags = search_restrict(srchFlags);
  switch( srchFlags ){
    case SRCH_CKIN:  zType = " Check-ins";  zClass = "Ckin";  break;
    case SRCH_DOC:   zType = " Docs";       zClass = "Doc";   break;
    case SRCH_TKT:   zType = " Tickets";    zClass = "Tkt";   break;
    case SRCH_WIKI:  zType = " Wiki";       zClass = "Wiki";  break;
  }
................................................................................
        cgi_printf(" selected");
      }
      cgi_printf(">%s</option>\n", aY[i].zNm);
    }
    @ </select>
    srchFlags = newFlags;
  }



  @ <input type="submit" value="Search%s(zType)"%s(zDisable2)>
  if( srchFlags==0 ){
    @ <p class="generalError">Search is disabled</p>
  }
  @ </div></form>
  while( fossil_isspace(zPattern[0]) ) zPattern++;
  if( zPattern[0] ){
    if( zClass ){
      @ <div class='searchResult searchResult%s(zClass)'>
    }else{
      @ <div class='searchResult'>
    }
    if( search_run_and_output(zPattern, srchFlags)==0 ){
      @ <p class='searchEmpty'>No matches for: <span>%h(zPattern)</span></p>
    }
    @ </div>
  }
}

/*
................................................................................
  db_multi_exec(
    "DELETE FROM ftsdocs WHERE type='d'"
    "      AND rid NOT IN (SELECT rid FROM current_docs)"
  );
  db_multi_exec(
    "INSERT OR IGNORE INTO ftsdocs(type,rid,name,idxed,label,bx,url,mtime)"
    "  SELECT 'd', rid, name, 0,"
    "         'Document: '||title('d',rid,name),"
    "         body('d',rid,name),"
    "         printf('/doc/%q/%%s',urlencode(name)),"
    "         %.17g"
    " FROM current_docs",
    zBrUuid, rTime
  );
  db_multi_exec(
    "INSERT INTO ftsidx(docid,title,body)"
    "  SELECT rowid, name, bx FROM ftsdocs WHERE type='d' AND NOT idxed"
  );
  db_multi_exec(
    "UPDATE ftsdocs SET"
    "  idxed=1,"
    "  bx=NULL"

    " WHERE type='d' AND NOT idxed"
  );
}

/*
** Deal with all of the unindexed 'c' terms in FTSDOCS
*/







|



>







 







|



>







 







|



>








|




>







>
>
>
>
>
>
>
>
>







 







|





|



|
>
>
|
>
>
>
>
>
|
>
|
|
>
|
<
>
|
<
>
>
>
>
>







 







|



>







 







|
>









|







|










|
>
>
>
|







 







>







 







>
>
>












|







 







|








|




|
>







649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
...
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
...
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
...
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775

776
777

778
779
780
781
782
783
784
785
786
787
788
789
...
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
...
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
...
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
....
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
....
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
    char *zDocGlob = db_get("doc-glob","");
    char *zDocBr = db_get("doc-branch","trunk");
    if( zDocGlob && zDocGlob[0] && zDocBr && zDocBr[0] ){
      db_multi_exec(
        "CREATE VIRTUAL TABLE IF NOT EXISTS temp.foci USING files_of_checkin;"
      );
      db_multi_exec(
        "INSERT INTO x(label,url,score,id,date,snip)"
        "  SELECT printf('Document: %%s',title('d',blob.rid,foci.filename)),"
        "         printf('/doc/%T/%%s',foci.filename),"
        "         search_score(),"
        "         'd'||blob.rid,"
        "         (SELECT datetime(event.mtime) FROM event"
        "            WHERE objid=symbolic_name_to_rid('trunk')),"
        "         search_snippet()"
        "    FROM foci CROSS JOIN blob"
        "   WHERE checkinID=symbolic_name_to_rid('trunk')"
        "     AND blob.uuid=foci.uuid"
        "     AND search_match(title('d',blob.rid,foci.filename),"
................................................................................
      "WITH wiki(name,rid,mtime) AS ("
      "  SELECT substr(tagname,6), tagxref.rid, max(tagxref.mtime)"
      "    FROM tag, tagxref"
      "   WHERE tag.tagname GLOB 'wiki-*'"
      "     AND tagxref.tagid=tag.tagid"
      "   GROUP BY 1"
      ")"
      "INSERT INTO x(label,url,score,id,date,snip)"
      "  SELECT printf('Wiki: %%s',name),"
      "         printf('/wiki?name=%%s',urlencode(name)),"
      "         search_score(),"
      "         'w'||rid,"
      "         datetime(mtime),"
      "         search_snippet()"
      "    FROM wiki"
      "   WHERE search_match(title('w',rid,name),body('w',rid,name));"
    );
  }
  if( (srchFlags & SRCH_CKIN)!=0 ){
................................................................................
    db_multi_exec(
      "WITH ckin(uuid,rid,mtime) AS ("
      "  SELECT blob.uuid, event.objid, event.mtime"
      "    FROM event, blob"
      "   WHERE event.type='ci'"
      "     AND blob.rid=event.objid"
      ")"
      "INSERT INTO x(label,url,score,id,date,snip)"
      "  SELECT printf('Check-in [%%.10s] on %%s',uuid,datetime(mtime)),"
      "         printf('/timeline?c=%%s&n=8&y=ci',uuid),"
      "         search_score(),"
      "         'c'||rid,"
      "         datetime(mtime),"
      "         search_snippet()"
      "    FROM ckin"
      "   WHERE search_match('',body('c',rid,NULL));"
    );
  }
  if( (srchFlags & SRCH_TKT)!=0 ){
    db_multi_exec(
      "INSERT INTO x(label,url,score,id,date,snip)"
      "  SELECT printf('Ticket: %%s (%%s)',title('t',tkt_id,NULL),"
                      "datetime(tkt_mtime)),"
      "         printf('/tktview/%%.20s',tkt_uuid),"
      "         search_score(),"
      "         't'||tkt_id,"
      "         datetime(tkt_mtime),"
      "         search_snippet()"
      "    FROM ticket"
      "   WHERE search_match(title('t',tkt_id,NULL),body('t',tkt_id,NULL));"
    );
  }
}

/*
** Return the number of significant bits in x: the count of right
** shifts needed to reduce x to zero.  The result is 0 when x is 0.
*/
static int nbits(u32 x){
  int nBit;
  for(nBit=0; x!=0; x >>= 1){
    nBit++;
  }
  return nBit;
}

/*
** Implementation of the rank() function used with rank(matchinfo(*,'pcsx')).
*/
static void search_rank_sqlfunc(
  sqlite3_context *context,
  int argc,
................................................................................
  sqlite3_value **argv
){
  const unsigned *aVal = (unsigned int*)sqlite3_value_blob(argv[0]);
  int nVal = sqlite3_value_bytes(argv[0])/4;
  int nCol;           /* Number of columns in the index */
  int nTerm;          /* Number of search terms in the query */
  int i, j;           /* Loop counter */
  double r = 0.0;     /* Score */
  const unsigned *aX, *aS;

  if( nVal<2 ) return;
  nTerm = aVal[0];
  nCol = aVal[1];
  if( nVal<2+3*nCol*nTerm+nCol ) return;
  aS = aVal+2;
  aX = aS+nCol;
  for(j=0; j<nCol; j++){
    double x;
    if( aS[j]>0 ){
      x = 0.0;
      for(i=0; i<nTerm; i++){
        int hits_this_row;
        int hits_all_rows;
        int rows_with_hit;
        double avg_hits_per_row;

        hits_this_row = aX[j + i*nCol*3];
        if( hits_this_row==0 )continue;
        hits_all_rows = aX[j + i*nCol*3 + 1];
        rows_with_hit = aX[j + i*nCol*3 + 2];
        if( rows_with_hit==0 ) continue;
        avg_hits_per_row = hits_all_rows/(double)rows_with_hit;

        x += hits_this_row/(avg_hits_per_row*nbits(rows_with_hit));
      }

      x *= (1<<((30*(aS[j]-1))/nTerm));
    }else{
      x = 0.0;
    }
    r = r*10.0 + x;
  }
#define SEARCH_DEBUG_RANK 0
#if SEARCH_DEBUG_RANK
  {
    Blob x;
    blob_init(&x,0,0);
    blob_appendf(&x,"%08x", (int)r);
................................................................................
){
  Blob sql;
  if( srchFlags==0 ) return;
  sqlite3_create_function(g.db, "rank", 1, SQLITE_UTF8, 0,
     search_rank_sqlfunc, 0, 0);
  blob_init(&sql, 0, 0);
  blob_appendf(&sql,
    "INSERT INTO x(label,url,score,id,date,snip) "
    " SELECT ftsdocs.label,"
    "        ftsdocs.url,"
    "        rank(matchinfo(ftsidx,'pcsx')),"
    "        ftsdocs.type || ftsdocs.rid,"
    "        datetime(ftsdocs.mtime),"
    "        snippet(ftsidx,'<mark>','</mark>',' ... ',-1,35)"
    "   FROM ftsidx CROSS JOIN ftsdocs"
    "  WHERE ftsidx MATCH %Q"
    "    AND ftsdocs.rowid=ftsidx.docid",
    zPattern
  );
................................................................................
** Other web-pages can invoke this routine to add search results
** in the middle of the page.
**
** Return the number of rows.
*/
int search_run_and_output(
  const char *zPattern,       /* The query pattern */
  unsigned int srchFlags,     /* What to search over */
  int fDebug                  /* Extra debugging output */
){
  Stmt q;                     /* Walks the rows collected in temp table x */
  int nRow;                   /* Count of result rows written so far */

  /* Narrow the requested search classes; nothing left means no output. */
  srchFlags = search_restrict(srchFlags);
  if( srchFlags==0 ) return 0;

  search_sql_setup(g.db);
  add_content_sql_commands(g.db);

  /* All matches, whichever search strategy is used, land in table x. */
  db_multi_exec(
    "CREATE TEMP TABLE x(label,url,score,id,date,snip);"
  );
  if( search_index_exists() ){
    search_update_index(srchFlags);
    search_indexed(zPattern, srchFlags);
  }else{
    search_fullscan(zPattern, srchFlags);
  }

  /* Emit the matches as an ordered list, best score first and then
  ** most recent first.  The <ol> wrapper is only generated when at
  ** least one row exists. */
  db_prepare(&q, "SELECT url, snip, label, score, id"
                 "  FROM x"
                 " ORDER BY score DESC, date DESC;");
  for(nRow=0; db_step(&q)==SQLITE_ROW; nRow++){
    const char *zUrl = db_column_text(&q, 0);
    const char *zSnippet = db_column_text(&q, 1);
    const char *zLabel = db_column_text(&q, 2);
    if( nRow==0 ){
      @ <ol>
    }
    @ <li><p><a href='%R%s(zUrl)'>%h(zLabel)</a>
    if( fDebug ){
      /* Debug mode appends the score and id columns after the link. */
      @ (%e(db_column_double(&q,3)), %s(db_column_text(&q,4)))
    }
    @ <br><span class='snippet'>%z(cleanSnippet(zSnippet))</span></li>
  }
  db_finalize(&q);
  if( nRow>0 ){
    @ </ol>
  }
  return nRow;
}
................................................................................
*/
void search_screen(unsigned srchFlags, int useYparam){
  const char *zType = 0;
  const char *zClass = 0;
  const char *zDisable1;
  const char *zDisable2;
  const char *zPattern;
  int fDebug = PB("debug");
  srchFlags = search_restrict(srchFlags);
  switch( srchFlags ){
    case SRCH_CKIN:  zType = " Check-ins";  zClass = "Ckin";  break;
    case SRCH_DOC:   zType = " Docs";       zClass = "Doc";   break;
    case SRCH_TKT:   zType = " Tickets";    zClass = "Tkt";   break;
    case SRCH_WIKI:  zType = " Wiki";       zClass = "Wiki";  break;
  }
................................................................................
        cgi_printf(" selected");
      }
      cgi_printf(">%s</option>\n", aY[i].zNm);
    }
    @ </select>
    srchFlags = newFlags;
  }
  if( fDebug ){
    @ <input type="hidden" name="debug" value="1">
  }
  @ <input type="submit" value="Search%s(zType)"%s(zDisable2)>
  if( srchFlags==0 ){
    @ <p class="generalError">Search is disabled</p>
  }
  @ </div></form>
  while( fossil_isspace(zPattern[0]) ) zPattern++;
  if( zPattern[0] ){
    if( zClass ){
      @ <div class='searchResult searchResult%s(zClass)'>
    }else{
      @ <div class='searchResult'>
    }
    if( search_run_and_output(zPattern, srchFlags, fDebug)==0 ){
      @ <p class='searchEmpty'>No matches for: <span>%h(zPattern)</span></p>
    }
    @ </div>
  }
}

/*
................................................................................
  db_multi_exec(
    "DELETE FROM ftsdocs WHERE type='d'"
    "      AND rid NOT IN (SELECT rid FROM current_docs)"
  );
  db_multi_exec(
    "INSERT OR IGNORE INTO ftsdocs(type,rid,name,idxed,label,bx,url,mtime)"
    "  SELECT 'd', rid, name, 0,"
    "         title('d',rid,name),"
    "         body('d',rid,name),"
    "         printf('/doc/%q/%%s',urlencode(name)),"
    "         %.17g"
    " FROM current_docs",
    zBrUuid, rTime
  );
  db_multi_exec(
    "INSERT INTO ftsidx(docid,title,body)"
    "  SELECT rowid, label, bx FROM ftsdocs WHERE type='d' AND NOT idxed"
  );
  db_multi_exec(
    "UPDATE ftsdocs SET"
    "  idxed=1,"
    "  bx=NULL,"
    "  label='Document: '||label"
    " WHERE type='d' AND NOT idxed"
  );
}

/*
** Deal with all of the unindexed 'c' terms in FTSDOCS
*/