Repositories » TXT0
Clone URL:  
Pushed to one repository · View In Graph Contained in tip

New libk hash table interface and dropped term-hash-table.

Since the inverted index hash table is the place for storing the
vocabulary, it is a wastage of space and time to use a separate hash
for the terms. This is because, looking up a incoming term would
always call strcmp() (since the incoming term's address is never going
to be found in the hash), and then, another lookup was needed (by
comparing pointers this time) to get it placed in the inverted index
hash. For the term, making this trip through the term-hash was
useless, one might as well point it at the inverted index hash since
strcmp is unavoidable here. However, the doc hash is useful because
the postings of each term is going to have repeating doc IDs and to
have them point to the doc hash saves time, during insertion, and
memory, by not instantiating a new string.

Changeset d6a0854ef449

Parent e9a56fa025a5

by Rup Palchowdhury

Changes to 8 files · Browse files at d6a0854ef449 Showing diff from parent e9a56fa025a5 Diff from another changeset...

Change 1 of 1 Show Entire File doc.c Stacked
 
1
2
3
 
4
5
6
7
8
9
10
11
12
 
 
 
 
 
 
 
 
 
 
 
 
 
13
14
15
16
17
18
19
20
21
22
 
23
24
25
 
1
2
 
3
4
5
6
 
 
 
 
 
 
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
 
 
24
25
26
 
27
28
29
30
@@ -1,25 +1,30 @@
 #include <stdlib.h>  #include <string.h> -#include "Doc.h" +#include "doc.h"    Doc *newdoc(char *id, unsigned n_byte, unsigned n_term)  { - Doc *doc; - doc = (Doc *)malloc(sizeof(Doc)); - doc->id = id; - doc->n_byte = n_byte; - doc->n_term = n_term; - return doc; + Doc *d; + d = (Doc *)malloc(sizeof(Doc)); + d->id = id; + d->n_byte = n_byte; + d->n_term = n_term; + return d; +} + +int p_doccmp(void *d1, void *d2) +{ + if (((Doc *)d1)->id == ((Doc *)d2)->id) + return 0; + return 1;  }    int doccmp(void *d1, void *d2)  { - if (((Doc *)d1)->id == ((Doc *)d2)->id) - return 0;   return strcmp(((Doc *)d1)->id, ((Doc *)d2)->id);  }   -unsigned hashdoc(void *data, unsigned hsize) +unsigned dochash(void *data, unsigned hsize)  {   static unsigned MULTIPLIER = 31;   unsigned h;
Change 1 of 1 Show Entire File doc.h Stacked
 
8
9
10
11
 
 
8
9
10
 
11
@@ -8,4 +8,4 @@
   Doc *newdoc(char*, unsigned, unsigned);  int doccmp(void*, void*); -unsigned hashdoc(void*, unsigned); +unsigned dochash(void*, unsigned);
Change 1 of 1 Show Entire File ii.c Stacked
 
 
 
1
2
3
 
1
2
3
4
5
@@ -1,3 +1,5 @@
+/*TODO: ii.c is broken, khash interfaces have changed. Won't compile. */ +  #include <stdio.h>  #include <stdlib.h>  #include <string.h>
Change 1 of 9 Show Entire File ii1.c Stacked
 
2
3
4
 
5
6
7
 
11
12
13
14
15
 
 
16
17
 
18
19
20
 
23
24
25
26
 
27
28
29
 
30
31
32
 
86
87
88
89
 
90
91
 
92
93
94
 
96
97
98
99
 
100
101
102
103
104
105
106
107
108
 
109
110
111
 
116
117
118
119
120
 
121
122
123
 
 
124
125
126
127
128
129
 
 
130
131
 
 
 
 
132
133
134
135
136
137
138
139
140
141
142
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
 
 
 
187
188
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
190
191
192
193
194
195
196
197
 
198
199
200
201
 
202
203
204
205
 
206
207
 
 
 
208
209
210
211
 
212
213
214
215
 
216
217
218
219
220
221
222
223
 
 
 
 
 
224
 
225
226
227
228
229
230
231
232
233
234
 
 
 
235
236
 
237
238
239
 
 
240
241
 
242
243
244
 
264
265
266
267
 
268
269
270
 
274
275
276
277
 
278
279
 
280
281
282
 
284
285
286
287
288
289
290
291
292
293
 
 
 
 
 
 
 
 
 
294
295
 
 
 
 
 
296
297
298
299
300
301
302
303
304
305
306
307
308
 
 
309
310
311
312
313
 
 
314
315
316
 
2
3
4
5
6
7
8
 
12
13
14
 
 
15
16
17
 
18
19
20
21
 
24
25
26
 
27
28
29
 
30
31
32
33
 
87
88
89
 
90
91
 
92
93
94
95
 
97
98
99
 
100
101
102
103
104
105
106
 
 
 
107
108
109
110
 
115
116
117
 
 
118
119
 
 
120
121
122
123
124
125
126
127
128
129
130
 
131
132
133
134
135
 
 
 
 
 
 
 
 
 
 
 
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
 
171
172
173
174
175
176
 
177
178
 
179
 
180
181
182
183
 
184
185
 
186
187
188
189
190
191
 
192
193
194
195
 
196
197
198
199
200
201
 
 
 
202
203
204
205
206
207
208
209
210
 
 
 
 
 
 
 
 
211
212
213
214
 
215
216
 
 
217
218
219
 
220
221
222
223
 
243
244
245
 
246
247
248
249
 
253
254
255
 
256
257
 
258
259
260
261
 
263
264
265
 
 
266
267
268
269
 
270
271
272
273
274
275
276
277
278
279
 
280
281
282
283
284
285
 
 
 
 
 
 
 
 
 
 
 
 
286
287
288
289
290
 
 
291
292
293
294
295
@@ -2,6 +2,7 @@
 #include <stdlib.h>  #include <string.h>  #include <assert.h> +#include <kak/kcommon.h>  #include <kak/klog.h>  #include <kak/klist.h>  #include <kak/khash.h> @@ -11,10 +12,10 @@
 #include "tfile.h"  #include "doc.h"   -/* enum {NTERMS = 100003, NDOCS = 100003}; */ -enum {NTERMS = 10007, NDOCS = 4093}; +enum {NTERMS = 100003, NDOCS = 100003}; +/* enum {NTERMS = 10007, NDOCS = 4093}; */   -void hprint(Node *htab[], unsigned size) +void hprint(Hash *h, unsigned size)  {   float fill, avglen;   int c, maxlen, sumlen, chainlen; @@ -23,10 +24,10 @@
  Post *p;   c = maxlen = sumlen = 0;   for (int i = 0; i < size; i++) { - if (htab[i] != NULL) { + if (h->tab[i] != NULL) {   c++;   chainlen = 0; - for (np = htab[i]; np != NULL; np = np->next) { + for (np = h->tab[i]; np != NULL; np = np->next) {   t = (Term *)(np->data);   chainlen++;   printf("%s:%d ", t->s, t->df); @@ -86,9 +87,9 @@
  }  }   -void buildii(unsigned *n_doc, Node *ii[], Node *doctab[], Node *termtab[], FILE *fp) +void buildii(Hash *hdoc, Hash *hterm, FILE *fp)  { - int j_; + int j_, pflag_, tflag_;   unsigned n_term;   char *p_txt, *ts;   THeader *h; @@ -96,16 +97,14 @@
  Doc *doc;   Post *post, *post_;   Term *term, *term_; - Node *npdoc, *npdoc_, *npterm, *npterm_, *npii, *npii_; + Node *npdoc, *npdoc_, *npterm, *npterm_;     h = _newTHeader(TREC);     if ((h = readTHeader(h, fp)) == NULL)   exit(0);   - *n_doc = h->n; - - for(int i = 1; i <= h->n; i++) { + for(int i = 1; i <= h->n; i++) { /* for each doc */     fprintf(stderr, "\r%d/%d", i, h->n);   @@ -116,129 +115,109 @@
    tdoc->id[tdoc->h->n_id] = '\0';   sscanf(tdoc->rsrc[0], "%u", &n_term); - doc = newdoc(strdup(tdoc->id), tdoc->h->n_txt, n_term); - + doc = newdoc(strdup(tdoc->id), tdoc->h->n_txt, n_term);   npdoc_ = newnode(doc); - npdoc = hlookup(doctab, npdoc_, 1, hashdoc, NDOCS, doccmp); - if (npdoc != npdoc_) { + npdoc = hlookup(hdoc, npdoc_, 1); + if (npdoc != npdoc_) { /* a repeating doc id */   free(doc);   free(npdoc_);   }     j_ = 0;   p_txt = tdoc->txt; + + for (int j = 0; j < tdoc->h->n_txt; j++) { /* for each char */   - for (int j = 0; j < tdoc->h->n_txt; j++) { + if (tdoc->txt[j] != ' ') + continue; + + /* at a term-separator */   - if (tdoc->txt[j] == ' ') { - - ts = (char *)malloc(j - j_ + 1); - memcpy(ts, p_txt, j - j_); - ts[j - j_] = '\0'; - - npterm_ = newnode(ts); - npterm = hlookup(termtab, npterm_, 1, hash, NTERMS, strcmp); - if (npterm != npterm_) { - free(ts); - free(npterm_); + ts = strndup(p_txt, j - j_); + post_ = newpost((char *)(((Doc *)(npdoc->data))->id), 1); + term_ = newterm(ts, 1, post_); + pflag_ = tflag_ = 0; + npterm_ = newnode(term_); + npterm = hlookup(hterm, npterm_, 1); /* lookup or create new */ + + if (npterm != npterm_) { /* term exists */ + tflag_ = 1; /* mark to free */ + term = (Term *)(npterm->data); + post = (Post *)(term->plist->data); + if (p_postcmp(post_, post) == 0) { /* post exists */ + post->tf++; + pflag_ = 1; /* mark to free */   } - - post_ = newpost((char *)(((Doc *)(npdoc->data))->id), 1); - term_ = newterm((char *)(npterm->data), 1, post_); - - /* if term exists - if post exists at front of list - post->tf++ - else - add new post at front of list - term->df++ - else - add term to hash table - */ - - npii_ = newnode(term_); - npii = hlookup(ii, npii_, 1, hashterm, NTERMS, termcmp1); - - if (npii != npii_) { /* a new Term wasn't added to the table */ - term = (Term *)(npii->data); - post = (Post *)(term->plist->data); - - if (postcmp(post_, post) == 0) { - post->tf++; - post_->ref = 0; - } - else { - term->plist = addfront(term->plist, term_->plist); - term->df++; - } - term_->ref = 0; - } - - j_ = j + 1; - p_txt = &(tdoc->txt[j + 1]); - - - if (post_->ref == 0) { - free(post_); - free(term_->plist); - } - if (term_->ref == 0) { - free(term_); + else { /* attach a new post */ + term->plist = addfront(term->plist, term_->plist); + term->df++;   }   } + + /* update */ + + j_ = j + 1; + p_txt = &(tdoc->txt[j + 1]); + + /* cleanup */ + + if (pflag_) { + free(post_); + free(term_->plist); + } + if (tflag_) + free(term_);   } -   freeTDoc(tdoc);   }   _freeTHeader(h);   fprintf(stderr, "\n");  }   -void retrieve(unsigned n_doc, Node *ii[], Node *doctab[], Node *termtab[]) +void retrieve(Hash *hdoc, Hash *hterm, FILE *qfp)  { - FILE *qfp = fopen("q.txt", "r");   char q[100][100]; - int n_qterm, n_q, n_d; + int n_qt, n_q, n_d;   float idf;   Post *post, *post_;   Term *term, *term_; - Node *npterm, *npterm_, *npii, *npii_, *nppost, *nppost_; + Node *nppost, *nppost_, *npterm, *npterm_;   Post *res[NDOCS]; - Node *posttab[NDOCS]; + Hash *hpost; + + hpost = newhash(NDOCS, p_postcmp, posthash);     n_q = 0;   - while ((n_qterm = getquery(q, qfp)) > 0) { + while ((n_qt = getquery(q, qfp)) > 0) { /* for a query */     for (int i = 0; i < NDOCS; i++) {   res[i] = NULL; - posttab[i] = NULL; + hpost->tab[i] = NULL;   }     n_q++;   n_d = 0;   - for (int i = 0; i < n_qterm; i++) { - npterm_ = newnode(q[i]); - npterm = hlookup(termtab, npterm_, 0, hash, NTERMS, strcmp); + for (int i = 0; i < n_qt; i++) { /* for a query-term */ + + term_ = newterm(q[i], 0, NULL); + npterm_ = newnode(term_); + npterm = hlookup(hterm, npterm_, 0);   free(npterm_); + free(term_);   if (npterm == NULL)   continue; - term_ = newterm((char *)(npterm->data), 0, NULL); - npii_ = newnode(term_); - npii = hlookup(ii, npii_, 0, hashterm, NTERMS, termcmp1); - free(term_); - free(npii_); - if (npii == NULL) - continue; - term = (Term *)(npii->data); + + term = (Term *)(npterm->data); +   for (Node *np = term->plist; np != NULL; np = np->next) { - post_ = (Post *)(np->data); + post_ = (Post *)(np->data);   nppost_ = newnode(post_); - nppost = hlookup(posttab, nppost_, 1, hashpost, NDOCS, postcmp); - if (nppost != nppost_) { + nppost = hlookup(hpost, nppost_, 1); + if (nppost != nppost_) { /* merge */   post = (Post *)(nppost->data); - post->tf += post_->tf; + post->tf += post_->tf;   free(nppost_);   }   else @@ -264,7 +243,7 @@
  /*   for (Node *np = reslist; np != NULL; np = np->next) {   post = (Post *)(np->data); - printf("%d %s %d\n", n_q, post->id, post->tf); + printf("%d %s %d\n", n_q, post->id, post->tf);   }   */   @@ -274,9 +253,9 @@
    /* results in a hash table */   /* for (int i = 0; i < NDOCS; i++) { */ - /* if (posttab[i] == NULL) */ + /* if (hpost[i] == NULL) */   /* continue; */ - /* for (Node *np = posttab[i]; np != NULL; np = np->next) { */ + /* for (Node *np = hpost[i]; np != NULL; np = np->next) { */   /* post = (Post *)(np->data); */   /* /\* post->tf /= idf; *\/ /\*FIX: post->tf is int *\/ */   /* printf("%d %s %d\n", n_q, post->id, post->tf); */ @@ -284,33 +263,33 @@
  /* } */     } - - fclose(qfp);  }    int main(void)  { - Node *doctab[NDOCS], *termtab[NTERMS], *ii[NTERMS]; + Hash *hdoc, *hterm; + FILE *qfp; + + hdoc = newhash(NDOCS, doccmp, dochash); + hterm = newhash(NTERMS, termcmp, termhash); + + buildii(hdoc, hterm, stdin); + /* hprint(hterm, NTERMS); */ + /* hstats(hdoc, NDOCS); */   - unsigned n_doc; + /* exit(0); */ + + qfp = fopen("test/qtiny.txt", "r"); + + retrieve(hdoc, hterm, qfp);   - for (int i = 0; i < NDOCS; i++) - doctab[i] = NULL; - for (int i = 0; i < NTERMS; i++) { - termtab[i] = NULL; - ii[i] = NULL; - } - - buildii(&n_doc, ii, doctab, termtab, stdin); - /* hprint(ii, NTERMS); */ - /* exit(0); */ - retrieve(n_doc, ii, doctab, termtab); - + fclose(qfp); +   /* fprintf(stderr, "\n"); */   /* fprintf(stderr, "doctab[]\n"); */   /* hstats(doctab, NDOCS); */ - /* fprintf(stderr, "termtab[]\n"); */ - /* hstats(termtab, NTERMS); */ + /* fprintf(stderr, "hterm[]\n"); */ + /* hstats(hterm, NTERMS); */     return 0;  }
Change 1 of 2 Show Entire File post.c Stacked
 
9
10
11
12
13
14
15
16
 
17
18
19
 
27
28
29
30
 
31
32
33
 
9
10
11
 
12
13
14
 
15
16
17
18
 
26
27
28
 
29
30
31
32
@@ -9,11 +9,10 @@
  p = (Post *)malloc(sizeof(Post));   p->id = id;   p->tf = tf; - p->ref = 1;   return p;  }   -int postcmp(void *d1, void *d2) +int p_postcmp(void *d1, void *d2)  {   if (((Post *)d1)->id == ((Post *)d2)->id)   return 0; @@ -27,7 +26,7 @@
  printf(fmt, ((Post *)d)->id, ((Post *)d)->tf);  }   -unsigned hashpost(void *data, unsigned hsize) +unsigned posthash(void *data, unsigned hsize)  {   static unsigned MULTIPLIER = 31;   unsigned h;
Change 1 of 1 Show Entire File post.h Stacked
 
3
4
5
6
7
8
9
10
 
11
12
 
 
3
4
5
 
6
7
8
 
9
10
 
11
@@ -3,10 +3,9 @@
 struct Post {   char* id;   int tf; - int ref;  };    Post *newpost(char*, int); -int postcmp(void*, void*); +int p_postcmp(void*, void*);  void printpost(void*, void*); -unsigned hashpost(void*, unsigned); +unsigned posthash(void*, unsigned);
Change 1 of 4 Show Entire File term.c Stacked
 
12
13
14
15
16
17
18
19
 
20
21
22
 
27
28
29
30
 
31
32
33
 
36
37
38
39
40
41
42
43
 
49
50
51
52
53
54
55
56
57
 
58
59
60
61
62
 
12
13
14
 
15
16
17
 
18
19
20
21
 
26
27
28
 
29
30
31
32
 
35
36
37
 
 
38
39
40
 
46
47
48
 
 
49
50
51
 
52
53
 
54
55
56
@@ -12,11 +12,10 @@
  t->s = s;   t->df = df;   t->plist = newnode(post); - t->ref = 1;   return t;  }   -unsigned hashterm(void *data, unsigned hsize) +unsigned termhash(void *data, unsigned hsize)  {   static unsigned MULTIPLIER = 31;   unsigned h; @@ -27,7 +26,7 @@
  return h % hsize;  }   -int termcmp1(void *d1, void *d2) +int p_termcmp(void *d1, void *d2)  {   if (((Term *)d1)->s == ((Term *)d2)->s)   return 0; @@ -36,8 +35,6 @@
   int termcmp(void *d1, void *d2)  { - if (((Term *)d1)->s == ((Term *)d2)->s) - return 0;   return strcmp(((Term *)d1)->s, ((Term *)d2)->s);  }   @@ -49,14 +46,11 @@
  newt = (Term *)d1;   t = (Term *)d2;   - newt->ref = 0; -   newp = (Post *)(newt->plist->data);   p = (Post *)(t->plist->data);   - if (postcmp(newp, p) == 0) { + if (p_postcmp(newp, p) == 0) {   p->tf++; - newp->ref = 0;   }   else {   t->plist = addfront(t->plist, newt->plist);
Change 1 of 1 Show Entire File term.h Stacked
 
4
5
6
7
8
9
10
 
11
12
13
 
14
15
16
 
4
5
6
 
7
8
 
9
10
11
 
12
13
14
15
@@ -4,13 +4,12 @@
  char *s;   unsigned df;   Node *plist; - int ref;  };   -unsigned hashterm(void*, unsigned); +unsigned termhash(void*, unsigned);  Term *newterm(char*, unsigned, Post*);  int termcmp(void*, void*); -int termcmp1(void*, void*); +int p_termcmp(void*, void*);  int term_match_handler(void*, void*);  void freeterm(void*);  void printterm(void*, void*);