Repositories » TXT0
Clone URL:  
Pushed to one repository · View In Graph Contained in tip

The match_handler algorithm takes advantage of the fact that document
IDs arrive in increasing order and are never repeated. So all new

Posts are added to the front of the list and repeating Posts
(determined by checking the front Post only) update the front
Post. This eliminates the need to call a O(n) lookup() on the postings
list for every term that arrives from the text. The postings list, if,
later, walked over from head to tail, document-ID-integers will arrive
in descending order. This has no significance as long as the list is
'ordered'.

Changeset 8b895f063105

Parent aaf4d785f819

by Rup Palchowdhury

Changes to 3 files · Browse files at 8b895f063105 Showing diff from parent aaf4d785f819 Diff from another changeset...

Change 1 of 3 Show Entire File ii.c Stacked
 
1
2
3
4
5
6
7
 
12
13
14
15
 
16
17
18
19
 
20
21
22
23
 
24
25
26
 
36
37
38
 
 
 
 
39
40
41
 
42
43
44
45
46
 
 
 
 
 
 
 
 
 
 
 
 
 
47
48
49
50
51
52
53
54
55
56
57
58
59
 
 
 
60
61
62
 
 
63
64
 
65
66
67
68
69
 
 
 
 
70
71
72
73
 
74
75
76
77
 
78
79
80
 
1
2
3
 
4
5
6
 
11
12
13
 
14
15
16
17
 
18
19
20
21
 
22
23
24
25
 
35
36
37
38
39
40
41
42
 
 
43
44
45
46
47
 
48
49
50
51
52
53
54
55
56
57
58
59
60
61
 
 
 
 
 
 
 
 
 
 
 
 
62
63
64
65
 
 
66
67
68
 
69
70
71
 
 
 
72
73
74
75
76
77
78
79
80
81
82
83
 
84
85
86
87
@@ -1,7 +1,6 @@
 #include <stdio.h>  #include <stdlib.h>  #include <string.h> -#include <kak/kbuffer.h>  #include <kak/klog.h>  #include <kak/klist.h>  #include <kak/ktree.h> @@ -12,15 +11,15 @@
   int main(void)  { - int i, n_term, n_txt, n_id; + int len, n_txt, n_id, i, i_;   TFile *tfile;   TDoc *tdoc;   TNode *t; - Node *p; + Node *p, *next;   Post *post;   Term *term;   unsigned row, col, n_doc; - char *docid; + char *docid, *p_txt;     t = NULL;   @@ -36,45 +35,53 @@
  docid = (char *)realloc(docid, (row <<= 1) * col);   memset(docid + n_doc * col, '\0', (row - n_doc) * col);   } + + for (p = tfile->list; p != NULL; p = next) { + + fprintf(stderr, "%d/%d\r", n_doc, tfile->h->n);   - for (p = tfile->list; p != NULL; p = p->next) { - + next = p->next;   tdoc = p->data;   n_txt = tdoc->h->n_txt;   n_id = tdoc->h->n_id;   - memcpy(docid + n_doc * col, tdoc->id->b, n_id); + memcpy(docid + n_doc * col, tdoc->id, n_id); + + len = 0; + i_ = 0; + p_txt = tdoc->txt; + + /* pick terms one by one and add to BST */ + for (i = 0; i < n_txt; i++) { + if (tdoc->txt[i] == ' ') { + len = i - i_; + post = newpost(n_doc, 1); + term = newterm(p_txt, len, post); + /* printf("%s %u\n", term->s, len); */   - /* pick terms one by one and add to BST */ - - for (i = 0; i < n_txt; i++) { - if (((char *)tdoc->txt->b)[i] == ' ') { - n_term = i - tdoc->txt->i; - post = newpost(n_doc, 1); - term = newterm((char *)tdoc->txt->p, n_term, post); - printf("%s %u\n", term->s, n_term); - - /* if term strings match - walk the list till docid matches - list->data->tf++ + /* if term exists + if doc matches front of list + post->tf++   else - add new post to end of list - t->data->df++ + add new post to front of list + term->df++   else - attach term to tree as a new node + attach term to tree   */   - /* t = insert(t, newtnode((void *)term), termcmp, */ - /* term_match_handler); */ - shiftpointer(tdoc->txt, n_term + 1); + t = nrinsert(t, newtnode((void *)term), termcmp, + term_match_handler); + i_ += len + 1; + p_txt += len + 1;   }   }     n_doc++; + freenode(p, freeTDoc);   }   }   - /* applyinorder(t, printtree, "%s %d\n"); */ + applyinorder(t, printtree, "%s:%d ");     /* for (i = 0; i < n_doc; i++) */   /* printf("%s\n", docid + i * col); */
Change 1 of 1 Show Entire File term.c Stacked
 
26
27
28
29
 
30
31
32
 
33
34
35
36
37
38
 
 
 
 
 
 
 
39
40
41
42
43
 
 
44
45
46
 
26
27
28
 
29
30
 
 
31
32
 
 
 
 
 
33
34
35
36
37
38
39
40
 
 
 
 
41
42
43
44
45
@@ -26,21 +26,20 @@
  return strcmp(((Term *)t)->s, ((Term *)t1)->s);  }   -int term_match_handler(void *t_, void *t1_) +int term_match_handler(void *term, void *newterm)  { - Term *t, *t1; - Node *p; + Term *t, *nt;   - t = (Term *)t_; - t1 = (Term *)t1_; - - if((p = lookup(t->list, t1->list, postcmp)) == NULL) { - t->list = addend(t->list, t1->list); + t = (Term *)term; + nt = (Term *)newterm; + + if (postcmp(nt->list->data, t->list->data) == 0) + ((Post *)t->list->data)->tf++; + else { + t->list = addfront(t->list, nt->list);   t->df++; - } - else { - ((Post *)p->data)->tf++; - } + } +   return 0;  }  
Change 1 of 2 Show Entire File tfile.c Stacked
 
163
164
165
 
166
167
168
 
280
281
282
283
 
284
285
286
287
 
288
289
 
290
291
292
293
 
294
295
 
296
297
298
299
300
 
301
302
303
 
163
164
165
166
167
168
169
 
281
282
283
 
284
285
286
287
 
288
289
 
290
291
292
293
 
294
295
 
296
297
298
299
300
301
302
303
304
305
@@ -163,6 +163,7 @@
   TFile* readTFile(FILE *fp)  { + /* TODO: checks to validate a T file. */   uint16_t v, check;   int n;   TFile *tfile; @@ -280,24 +281,25 @@
  _printTSubHeader(tdoc->h);     /* print the doc ID */ - + /*   tdoc->id[tdoc->h->n_id] = '\0';   printf("tdoc->id: ");   printf("%s\n", tdoc->id); - + */   /* print the tokenized doc text */ - + /*   tdoc->txt[tdoc->h->n_txt] = '\0';   printf("tdoc->txt: ");   printf("%s\n", tdoc->txt); - + */   /* print the resource blocks */ - + /*   for (int i = 0; i < tdoc->h->r; i++) {   tdoc->rsrc[i][tdoc->h->n_rsrc[i]] = '\0';   printf("tdoc->rsrc[%d]: ", i);   printf("%s\n", tdoc->rsrc[i]);   } + */  }    void printTFile(TFile *tfile)