Repositories » TXT0 Read More
Clone URL:  
Pushed to one repository · View In Graph Contained in tip

Changed THeader and introduced a Model abstraction for term-weighting.

THeader now carries some of the global corpus statistics which are
needed for the weighting of terms. buildii() returns the THeader it
has read in. In this way, having another data-structure to pass around
global corpus stats is avoided.

A Model structure packs three pointers to functions. The TF-IDF weighting
is abstracted as f(tf) * g(df) * h(qtf) (term-frequency, document
frequency and query-term-frequency). The programmer has to supply a
form of f, g and h which the search() routine calls on tf, df and
qtf. Along with these three parameters, the rest of the corpus stats
and document stats are also passed on.

A set of test f-g-h routines is available to reproduce the results on
the Alice test collection.

Changeset 3c76be16e685

Parent 66dbe4eea2e5

by Rup Palchowdhury

Changes to 6 files · Browse files at 3c76be16e685 Showing diff from parent 66dbe4eea2e5 Diff from another changeset...

Change 1 of 11 Show Entire File ii.c Stacked
 
20
21
22
23
 
24
25
26
 
35
36
37
38
39
 
 
40
41
42
43
 
 
44
45
46
 
49
50
51
52
 
53
54
55
 
86
87
88
89
 
90
91
92
 
93
94
95
 
110
111
112
113
114
115
116
 
 
 
 
 
117
118
119
 
121
122
123
124
 
125
126
127
 
165
166
167
168
169
 
170
171
172
 
173
174
175
176
177
178
179
 
 
 
 
 
 
 
180
181
182
 
 
183
184
185
 
193
194
195
196
 
 
197
198
199
200
201
202
203
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
206
207
208
209
210
211
212
213
 
 
 
 
214
215
216
217
 
218
219
220
 
264
265
266
 
 
267
268
269
 
273
274
275
276
 
277
278
279
 
284
285
286
 
 
 
 
 
 
 
 
287
288
289
290
291
292
 
293
294
295
 
20
21
22
 
23
24
25
26
 
35
36
37
 
 
38
39
40
41
 
 
42
43
44
45
46
 
49
50
51
 
52
53
54
55
 
86
87
88
 
89
90
91
 
92
93
94
95
 
110
111
112
 
 
 
 
113
114
115
116
117
118
119
120
 
122
123
124
 
125
126
127
128
 
166
167
168
 
169
170
171
172
 
173
174
 
 
 
 
 
 
175
176
177
178
179
180
181
182
 
 
183
184
185
186
187
 
195
196
197
 
198
199
200
 
 
 
 
 
 
 
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
 
 
 
 
245
246
247
248
249
250
251
 
252
253
254
255
 
299
300
301
302
303
304
305
306
 
310
311
312
 
313
314
315
316
 
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
 
337
338
339
340
@@ -20,7 +20,7 @@
  Post *post_;   Node *np, *npq_, *npq, *nppost_;   Query *q_, *q; - uint32_t k, k_, n_term, n_uterm, tf; + uint32_t k, k_, sumtf, uterm, tf;   char ts[KB];     /* read in the query file */ @@ -35,12 +35,12 @@
    tdoc = np->data;   - sscanf(tdoc->rsrc[0], "%u", &n_term); - sscanf(tdoc->rsrc[1], "%u", &n_uterm); + sscanf(tdoc->rsrc[0], "%u", &sumtf); + sscanf(tdoc->rsrc[1], "%u", &uterm);     q_ = newquery(strdup(tdoc->id)); - q_->n_term = n_term; - q_->n_uterm = n_uterm; + q_->sumtf = sumtf; + q_->uterm = uterm;     npq_ = newnode(q_);   npq = hlookup(hq, npq_, 1); @@ -49,7 +49,7 @@
    k = k_ = 0;   - for (int j = 0; j < n_uterm; j++) { /* for each term in query vector */ + for (int j = 0; j < uterm; j++) { /* for each term in query vector */     /* pick a term */   for(; tdoc->txt[k] != ' '; k++); @@ -86,10 +86,10 @@
  }  }   -void buildii(Hash *hdoc, Hash *hterm, FILE *fp) +THeader *buildii(Hash *hdoc, Hash *hterm, FILE *fp)  {   int pflag_, tflag_; - uint32_t k, k_, n_term, n_uterm, sumsq, tf; + uint32_t k, k_, sumtf, uterm, sumsqtf, maxtf, tf;   char ts[KB];   THeader *h;   TDoc *tdoc; @@ -110,10 +110,11 @@
  if ((tdoc = readTDoc(tdoc, fp, TREC)) == NULL)   exit(0);   - sscanf(tdoc->rsrc[0], "%u", &n_term); - sscanf(tdoc->rsrc[1], "%u", &n_uterm); - sscanf(tdoc->rsrc[2], "%u", &sumsq); - doc = newdoc(tdoc->id, n_term, n_uterm, tdoc->h->n_txt, sumsq); + sscanf(tdoc->rsrc[0], "%u", &sumtf); + sscanf(tdoc->rsrc[1], "%u", &uterm); + sscanf(tdoc->rsrc[2], "%u", &sumsqtf); + sscanf(tdoc->rsrc[3], "%u", &maxtf); + doc = newdoc(tdoc->id, tdoc->h->n_txt, sumtf, uterm, sumsqtf, maxtf);   npdoc_ = newnode(doc);   npdoc = hlookup(hdoc, npdoc_, 1);   if (npdoc != npdoc_) /* a repeating doc id */ @@ -121,7 +122,7 @@
    k = k_ = 0;   - for (int j = 0; j < n_uterm; j++) { /* for each term in doc vector */ + for (int j = 0; j < uterm; j++) { /* for each term */     /* pick a term */   for(; tdoc->txt[k] != ' '; k++); @@ -165,21 +166,22 @@
  freeTDoc(tdoc, TREC);   fprintf(stderr, "\rindexed: %d/%d", i, h->n);   } - _freeTHeader(h);   fprintf(stderr, "\n"); + return h;  }   -void search(Query *q, Hash *hdoc, Hash *hterm) +void search(Query *q, Hash *hdoc, Hash *hterm, THeader *h, Model m)  { - uint32_t n_d; - float idf; - Post *post, *post_, *post_t, *post_d; - Term *term, term__; - Node *nppost, *nppost_, *npterm, node__; - Hash *hpost; + double tf_, idf_, qtf_, w_; + Post *post, *post_, *post_t, *post_d; + Term *term, term__; + Doc *doc, doc__; + Node *nppost, *nppost_, *npterm, node__, *npdoc, *npscore, *npscore_; + Score *score, *score_; + Hash *hresult;   - hpost = newhash(NDOCS, cmppost, hashpost); - n_d = 0; + /* hpost = newhash(NDOCS, cmppost, hashpost); */ + hresult = newhash(NDOCS, cmpscore, hashscore);     for (Node *np = q->tlist; np != NULL; np = np->next) { /* for a term */   @@ -193,28 +195,61 @@
    term = (Term *)(npterm->data);   - for (Node *np1 = term->plist; np1 != NULL; np1 = np1->next) { /* for a doc */ + for (Node *np1 = term->plist; np1 != NULL; np1 = np1->next) { /* for a post */ +   post_d = (Post *)(np1->data); - post_ = newpost(post_d->id, post_d->tf * post_t->tf); - nppost_ = newnode(post_); - nppost = hlookup(hpost, nppost_, 1); - if (nppost != nppost_) { /* merge */ - post = (Post *)(nppost->data); - post->tf += post_->tf; - freenode(nppost_, freepost); + + /* post_ = newpost(post_d->id, post_d->tf * post_t->tf); */ + /* nppost_ = newnode(post_); */ + /* nppost = hlookup(hpost, nppost_, 1); */ + + doc__.id = post_d->id; + node__.data = &doc__; + hdoc->cmp = cmpdoc_p; /* dismantle cmp_fn */ + npdoc = hlookup(hdoc, &node__, 0); + hdoc->cmp = cmpdoc; /* put it back */ + + if (npdoc == NULL) + continue; + + doc = (Doc *)(npdoc->data); + + /* compute and combine f(tf)), f(df) and f(qtf) */ + tf_ = (*m.tf)(post_d->tf, doc->sumtf, doc->uterm, + doc->sumsqtf, doc->maxtf, doc->nb, + h->sumnb, h->sumnu, h->sumnt, + h->n); + idf_ = (*m.df)(term->df, h->n); + qtf_ = (*m.qtf)(post_t->tf); + w_ = tf_ * idf_ * qtf_; + + score_ = newscore_s(post_d->id, w_); + npscore_ = newnode(score_); + npscore = hlookup(hresult, npscore_, 1); + + /* if (nppost != nppost_) { /\* merge *\/ */ + /* post = (Post *)(nppost->data); */ + /* post->tf += post_->tf; */ + /* freenode(nppost_, freepost); */ + /* } */ + + if (npscore != npscore_) { /* merge */ + score = (Score *)(npscore->data); + score->n += score_->n; + freenode(npscore_, freescore_s);   }   }   }     /* print result */ - for (int i = 0; i < hpost->n; i++) { - for (Node *np = hpost->tab[i]; np != NULL; np = np->next) { - post = (Post *)(np->data); - fprintf(stdout, "%s %s %d\n", q->id, post->id, post->tf); + for (int i = 0; i < hresult->n; i++) { + for (Node *np = hresult->tab[i]; np != NULL; np = np->next) { + score = (Score *)(np->data); + fprintf(stdout, "%s %s %f\n", q->id, score->id, score->n);   }   }   - 
freehash(hpost, freepost); + freehash(hresult, freescore_s);     /* results in a list that points to nodes in the hash table */   /* @@ -264,6 +299,8 @@
    Hash *hdoc, *hterm, *hq;   FILE *fpq, *fplog; + Model m; + THeader *h;     esetprogname(estrdup(argv[0]));   fplog = fopen(strcat(estrdup(argv[0]), "-error.log"), "w"); @@ -273,7 +310,7 @@
  hterm = newhash(NTERMS, cmpterm, hashterm);   hq = newhash(NHASH, cmpquery, hashquery);   - buildii(hdoc, hterm, stdin); + h = buildii(hdoc, hterm, stdin);   /* fprinthash(stdout, hterm, fprintterm); */   /* hstats(hdoc, NDOCS); */   @@ -284,12 +321,20 @@
  /* fprinthash(stdout, hq, fprintquery); */   }   + /* m.tf = &_tf; */ + /* m.df = &_df; */ + /* m.qtf = &_qtf; */ + + m.tf = &SMART_d_b; + m.df = &SMART__t_; + m.qtf = &_qtf; +   for (int i = 0; i < hq->n; i++) {   if (hq->tab[i] == NULL)   continue;   /* fprintquery(stdout, hq->tab[i]->data); */   /* fprintf(stdout, "\n"); */ - search(hq->tab[i]->data, hdoc, hterm); + search(hq->tab[i]->data, hdoc, hterm, h, m);   }     /* fprintf(stderr, "\n"); */
Change 1 of 4 Show Entire File parser.c Stacked
 
138
139
140
141
 
142
143
144
 
181
182
183
184
 
185
186
187
 
226
227
228
 
 
 
229
230
231
 
292
293
294
 
 
 
 
 
 
295
296
297
 
138
139
140
 
141
142
143
144
 
181
182
183
 
184
185
186
187
 
226
227
228
229
230
231
232
233
234
 
295
296
297
298
299
300
301
302
303
304
305
306
@@ -138,7 +138,7 @@
  hpost = newhash(NHASH, cmppost, hashpost);   token = newtoken(NULL, KB);   lowmem = KB; - n_txt = sumtf = uterm = sumsqtf = maxtf = b_read = 0; + n_txt = sumtf = uterm = sumsqtf = maxtf = b_read = 0;   tfile = newTFile(TREC);   tdoc = newTDoc(TREC);   n_txtbuf = TXTBUFSIZE; @@ -181,7 +181,7 @@
  }   freehash(hpost, freepost);   hpost = newhash(NHASH, cmppost, hashpost); - +   /* fill tdoc->h->n_txt */   tdoc->h->n_txt = n_txt;   @@ -226,6 +226,9 @@
    /* update TFile->THeader */   tfile->h->n++; + tfile->h->sumnb += n_txt; + tfile->h->sumnu += uterm; + tfile->h->sumnt += sumtf;   tfile->h->b += n_subheader + tdoc->h->n_txt + tdoc->h->n_id;   for (int i = 0; i < tfile->h->r; i++)   tfile->h->b += tdoc->h->n_rsrc[i]; @@ -292,6 +295,12 @@
  sizeof(uint32_t));   crc = crc_wordwise(&crc_m, crc, (unsigned char *)&tfile->h->n,   sizeof(uint32_t)); + crc = crc_wordwise(&crc_m, crc, (unsigned char *)&tfile->h->sumnb, + sizeof(uint32_t)); + crc = crc_wordwise(&crc_m, crc, (unsigned char *)&tfile->h->sumnu, + sizeof(uint32_t)); + crc = crc_wordwise(&crc_m, crc, (unsigned char *)&tfile->h->sumnt, + sizeof(uint32_t));   crc = crc_wordwise(&crc_m, crc, (unsigned char *)&tfile->h->r,   sizeof(uint32_t));   crc = crc_wordwise(&crc_m, crc, (unsigned char *)&tfile->h->bo,
Change 1 of 5 Show Entire File tfile.c Stacked
 
10
11
12
13
14
15
16
17
18
19
20
 
 
 
 
 
 
 
 
 
 
 
21
22
23
 
108
109
110
111
112
113
114
115
116
117
118
119
 
 
 
 
 
 
 
 
 
 
 
 
120
121
122
 
193
194
195
196
197
198
199
200
201
202
203
204
 
 
 
 
 
 
 
 
 
 
 
 
205
206
207
 
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
 
 
301
302
303
 
368
369
370
371
372
373
374
375
 
 
 
 
 
 
376
377
378
 
10
11
12
 
 
 
 
 
 
 
 
13
14
15
16
17
18
19
20
21
22
23
24
25
26
 
111
112
113
 
 
 
 
 
 
 
 
 
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
 
199
200
201
 
 
 
 
 
 
 
 
 
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
 
292
293
294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
296
297
298
299
 
364
365
366
 
 
 
 
 
367
368
369
370
371
372
373
374
375
@@ -10,14 +10,17 @@
 {   THeader *h;   h = (THeader *)emalloc(sizeof(THeader)); - h->mg = 41; - h->crc = 0; - h->ver = 1; - h->type = type; - h->b = 0; - h->n = 0; - h->r = 0; - h->bo = 32; + h->mg = 41; + h->crc = 0; + h->ver = 1; + h->type = type; + h->b = 0; + h->n = 0; + h->sumnb = 0; + h->sumnu = 0; + h->sumnt = 0; + h->r = 0; + h->bo = sizeof(THeader);   if (type == TREC)   h->r = TREC_RSRC;   return h; @@ -108,15 +111,18 @@
    /* write the THeader structure */   v = 0; - n = fwrite(&tfile->h->mg, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fwrite(&tfile->h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fwrite(&tfile->h->ver, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fwrite(&tfile->h->type, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fwrite(&tfile->h->b, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fwrite(&tfile->h->n, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fwrite(&tfile->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fwrite(&tfile->h->bo, sizeof(uint32_t), 1, fp); v |= n; - if (v != 0x00FF) { /* 0000 0000 1111 1111 */ + n = fwrite(&tfile->h->mg, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->ver, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->type, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->b, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->n, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->sumnb, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->sumnu, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->sumnt, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->bo, sizeof(uint32_t), 1, fp); v |= n; + if (v != 0x07FF) { /* 0000 0111 1111 1111 */   fprintf(stderr, "ERROR: failed to write THeader\n");   return 0;   } @@ -193,15 +199,18 @@
    /* fill the THeader */   v = 0; - n = fread(&h->mg, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&h->ver, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&h->type, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&h->b, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&h->n, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&h->bo, sizeof(uint32_t), 1, fp); v |= n; - if (v != 0x00FF) { /* 0000 0000 1111 1111 */ + n = fread(&h->mg, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&h->ver, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&h->type, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&h->b, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&h->n, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&h->sumnb, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&h->sumnu, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&h->sumnt, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&h->bo, sizeof(uint32_t), 1, fp); v |= n; + if (v != 0x07FF) { /* 0000 0111 1111 1111 */   fprintf(stderr, "ERROR: failed to read THeader\n");   return NULL;   } @@ -283,21 +292,8 @@
    tfile = newTFile(TREC);   - /* fill the THeader */ - v = 0; - n = fread(&tfile->h->mg, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&tfile->h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&tfile->h->ver, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&tfile->h->type, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&tfile->h->b, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&tfile->h->n, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&tfile->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&tfile->h->bo, sizeof(uint32_t), 1, fp); v |= n; - if (v != 0x00FF) { /* 0000 0000 1111 1111 */ - fprintf(stderr, "ERROR: failed to read THeader\n"); - return NULL; - } - + readTHeader(tfile->h, fp); +   /* fill the TDoc structures */   for (int i = 0; i < tfile->h->n; i++) {   @@ -368,11 +364,12 @@
   void _printTHeader(THeader *h)  { - printf("%-2s %-10s %-1s %-1s %-10s %-10s %-2s %-2s\n", - "MG", "CRC", "V", "T", "BYTES", "N", "R", "BO"); - printf("%-2u %-10u %-1u %-1u %-10u %-10u %-2u %-2u\n", - h->mg, h->crc, h->ver, h->type, - h->b, h->n, h->r, h->bo); + printf("%-2s %-10s %-1s %-1s %-10s %-10s %-10s %-10s %-10s %-2s %-2s\n", + "MG", "CRC", "V", "T", "BYTES", "N", "SUMNB", "SUMNU", "SUMNT", "R", "BO"); + printf("%-2u %-10u %-1u %-1u %-10u %-10u %-10u %-10u %-10u %-2u %-2u\n", + h->mg, h->crc, h->ver, h->type, + h->b, h->n, h->sumnb, h->sumnu, h->sumnt, + h->r, h->bo);  }    void _printTSubHeader(TSubHeader *h, E_TDocType type)
Change 1 of 3 Show Entire File tfile.h Stacked
 
1
 
2
3
 
4
5
6
 
18
19
20
 
 
 
21
22
23
 
42
43
44
45
46
 
 
47
48
49
50
51
52
53
54
55
56
57
58
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
 
3
4
5
6
 
18
19
20
21
22
23
24
25
26
 
45
46
47
 
 
48
49
50
 
 
 
 
 
 
 
 
 
 
 
 
51
52
53
54
55
56
57
58
59
60
61
62
@@ -1,6 +1,6 @@
-#define TREC_RSRC 4 /* number of resources for TDoc of type TREC */ +#define TREC_RSRC 4 /* number of resources for a TDoc of type TREC */  #define MAX_RSRC 10 /* maximum number of resources */ -#define TXTBUFSIZE 10240 /* 10KB */ +#define TXTBUFSIZE 10240 /* TDoc->txt buffer size, 10KB */  #define IDBUFSIZE 64 /* A UUID (if used) would be max 36 chars */  #define RSRCBUFSIZE 32   @@ -18,6 +18,9 @@
  uint32_t type;   uint32_t b;   uint32_t n; + uint32_t sumnb; + uint32_t sumnu; + uint32_t sumnt;   uint32_t r;   uint32_t bo;  }; @@ -42,18 +45,18 @@
 };    E_TDocType getTDocType(char*); -THeader *_newTHeader(E_TDocType); -void _freeTHeader(void*); +THeader *_newTHeader(E_TDocType); +void _freeTHeader(void*);  TSubHeader *_newTSubHeader(E_TDocType); -TDoc *newTDoc(E_TDocType); -TFile *newTFile(E_TDocType); -void freeTDoc(void*, E_TDocType); -void freeTFile(TFile*, E_TDocType); -int writeTFile(TFile*, FILE*); -THeader *readTHeader(THeader*, FILE*); -TDoc *readTDoc(TDoc*, FILE*, E_TDocType); -TFile *readTFile(FILE*); -void _printTHeader(THeader*); -void _printTSubHeader(TSubHeader*, E_TDocType); -void _printTDoc(TDoc*, E_TDocType); -void printTFile(TFile*); +TDoc *newTDoc(E_TDocType); +TFile *newTFile(E_TDocType); +void freeTDoc(void*, E_TDocType); +void freeTFile(TFile*, E_TDocType); +int writeTFile(TFile*, FILE*); +THeader *readTHeader(THeader*, FILE*); +TDoc *readTDoc(TDoc*, FILE*, E_TDocType); +TFile *readTFile(FILE*); +void _printTHeader(THeader*); +void _printTSubHeader(TSubHeader*, E_TDocType); +void _printTDoc(TDoc*, E_TDocType); +void printTFile(TFile*);
Change 1 of 6 Show Entire File txt.c Stacked
 
1
2
3
 
4
 
5
 
6
7
8
9
10
 
 
11
12
13
14
15
16
17
18
 
 
 
 
 
19
20
21
 
27
28
29
30
 
 
 
 
 
 
31
32
33
34
35
36
37
38
39
40
41
42
43
 
53
54
55
56
57
 
 
58
59
60
 
64
65
66
67
68
 
 
69
70
71
 
97
98
99
100
 
101
102
103
 
252
253
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
 
13
14
15
16
17
18
 
 
 
 
19
20
21
22
23
24
25
26
 
32
33
34
 
35
36
37
38
39
40
41
42
43
44
45
46
 
 
 
 
47
48
49
 
59
60
61
 
 
62
63
64
65
66
 
70
71
72
 
 
73
74
75
76
77
 
103
104
105
 
106
107
108
109
 
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
@@ -1,21 +1,26 @@
 #include <stdio.h>  #include <stdlib.h>  #include <string.h> +#include <math.h>  #include <kak/eprintf.h> +#include <kak/kcommon.h>  #include "kak/klist.h" +#include "kak/khash.h"  #include "txt.h"    /* Doc */   -Doc *newdoc(char *id, uint32_t n_term, uint32_t n_uterm, uint32_t n_byte, uint32_t sumsq) +Doc *newdoc(char *id, uint32_t nb, uint32_t sumtf, + uint32_t uterm, uint32_t sumsqtf, uint32_t maxtf)  {   Doc *d;   d = (Doc *)emalloc(sizeof(Doc));   d->id = estrdup(id); - d->n_term = n_term; - d->n_uterm = n_uterm; - d->n_byte = n_byte; - d->sumsq = sumsq; + d->nb = nb; + d->sumtf = sumtf; + d->uterm = uterm; + d->sumsqtf = sumsqtf; + d->maxtf = maxtf;   return d;  }   @@ -27,17 +32,18 @@
  free(d);  }   -int p_cmpdoc(void *d1, void *d2) +int cmpdoc(void *d1, void *d2) +{ + return strcmp(((Doc *)d1)->id, ((Doc *)d2)->id); +} + +int cmpdoc_p(void *d1, void *d2)  {   if (((Doc *)d1)->id == ((Doc *)d2)->id)   return 0;   return 1;  }   -int cmpdoc(void *d1, void *d2) -{ - return strcmp(((Doc *)d1)->id, ((Doc *)d2)->id); -}    unsigned hashdoc(void *data, unsigned hsize)  { @@ -53,8 +59,8 @@
 {   Doc *d;   d = (Doc *)data; - fprintf(stream, " %s:%u:%u:%u:%u", d->id, - d->n_term, d->n_uterm, d->n_byte, d->sumsq); + fprintf(stream, " %s:%u:%u:%u:%u:%u", d->id, + d->nb, d->sumtf, d->uterm, d->sumsqtf, d->maxtf);  }    /* Query */ @@ -64,8 +70,8 @@
  Query *q;   q = (Query *)emalloc(sizeof(Query));   q->id = estrdup(id); - q->n_term = 0; - q->n_uterm = 0; + q->sumtf = 0; + q->uterm = 0;   q->tlist = NULL;   return q;  } @@ -97,7 +103,7 @@
 {   Query *q;   q = (Query *)data; - fprintf(stream, " %s:%u:%u", q->id, q->n_term, q->n_uterm); + fprintf(stream, " %s:%u:%u", q->id, q->sumtf, q->uterm);   for (Node *np = q->tlist; np != NULL; np = np->next)   fprintpost(stream, np->data);  } @@ -252,3 +258,110 @@
  h = MULTIPLIER * h + *p;   return h % hsize;  } + +/* Score */ + +Score *newscore(char *id, double n) +{ + Score *s; + s = (Score *)malloc(sizeof(Score)); + s->id = estrdup(id); + s->n = n; + return s; +} + +void freescore(void *s) +{ + if (s == NULL) + return; + free(((Score *)s)->id); + free(s); +} + +Score *newscore_s(char *id, double n) /* shallow */ +{ + Score *s; + s = (Score *)emalloc(sizeof(Score)); + s->id = id; + s->n = n; + return s; +} + +void freescore_s(void *s) +{ + free(s); +} + +int cmpscore(void *d1, void *d2) +{ + return strcmp(((Post *)d1)->id, ((Post *)d2)->id); +} + +int cmpscore_n(void *d1, void *d2) +{ + /* TODO: do floating point comparison correctly */ + int ret = 0; + if (((Score *)d1)->n < ((Score *)d2)->n) + ret = -1; + else + ret = 1; + return ret; +} + +unsigned hashscore(void *data, unsigned hsize) +{ + return _strhash(((Score *)data)->id, hsize); +} + +/* models */ + +/** test **/ + +double _tf(uint32_t tf, uint32_t sumtf, uint32_t uterm, + uint32_t sumsqtf, uint32_t maxtf, uint32_t nb, + uint32_t sumnb, uint32_t sumnu, uint32_t sumnt, + uint32_t n) +{ + return (double)tf; +} + +double _df(uint32_t df, uint32_t n) +{ + return 1.0; +} + +double _qtf(uint32_t qtf) +{ + return (double)qtf; +} + +/** SMART dtb **/ + +double SMART_d_b(uint32_t tf, uint32_t sumtf, uint32_t uterm, + uint32_t sumsqtf, uint32_t maxtf, uint32_t nb, + uint32_t sumnb, uint32_t sumnu, uint32_t sumnt, + uint32_t n) +{ + static const double ln2 = 0.69314718; + double tf_; + tf_ = (1.0 + log(1.0 + log(tf))) / (0.8 * (sumnb / n) + 0.2 * nb); + return tf_; +} + +double SMART__t_(uint32_t df, uint32_t n) +{ + static const double ln2 = 0.69314718; + double df_; + df_ = log((n + 1) / df); + return df_; +} + +/** OKAPI BM25 **/ + +double OKAPI_BM25_qtf(uint32_t qtf) +{ + static const double K3 = 1000.0; + double qtf_; + qtf_ = ((K3 + 1.0) * qtf) / (K3 + qtf); + return qtf_; +}
Change 1 of 3 Show Entire File txt.h Stacked
 
2
3
4
 
 
 
 
 
 
 
 
 
5
6
7
8
9
10
11
 
 
 
 
 
12
13
14
15
16
17
 
 
18
19
20
 
29
30
31
32
 
 
 
 
 
 
 
 
 
 
 
 
33
34
 
35
36
37
 
59
60
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
 
 
 
 
17
18
19
20
21
22
23
24
25
 
 
26
27
28
29
30
 
39
40
41
 
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
 
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
@@ -2,19 +2,29 @@
 typedef struct Query Query;  typedef struct Term Term;  typedef struct Post Post; +typedef struct Score Score; +typedef struct Model Model; + +typedef double (tf_fn)(uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t, + uint32_t); +typedef double (df_fn)(uint32_t, uint32_t); +typedef double (qtf_fn)(uint32_t);    struct Doc {   char *id; - uint32_t n_term; - uint32_t n_uterm; - uint32_t n_byte; - uint32_t sumsq; + uint32_t nb; + uint32_t sumtf; + uint32_t uterm; + uint32_t sumsqtf; + uint32_t maxtf;  };    struct Query {   char *id; - uint32_t n_term; - uint32_t n_uterm; + uint32_t sumtf; + uint32_t uterm;   Node *tlist;  };   @@ -29,9 +39,21 @@
  int tf;  };   -Doc *newdoc(char*, uint32_t, uint32_t, uint32_t, uint32_t); +struct Score { + char* id; + double n; +}; + +struct Model { + tf_fn *tf; + df_fn *df; + qtf_fn *qtf; +}; + +Doc *newdoc(char*, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);  void freedoc(void*);  int cmpdoc(void*, void*); +int cmpdoc_p(void*, void*);  unsigned hashdoc(void*, unsigned);  void fprintdoc(FILE*, void*);   @@ -59,3 +81,29 @@
 int cmppost_tf(const void*, const void*);  void fprintpost(FILE*, void*);  unsigned hashpost(void*, unsigned); + +/* Score */ +Score *newscore(char*, double); +void freescore(void*); +Score *newscore_s(char*, double); +void freescore_s(void*); +int cmpscore(void*, void*); +int cmpscore_n(void*, void*); +unsigned hashscore(void*, unsigned); + +/* Models */ + +/* /\** test **\/ */ + +tf_fn _tf; +df_fn _df; +qtf_fn _qtf; + +/* /\** SMART dtb **\/ */ + +tf_fn SMART_d_b; +df_fn SMART__t_; + +/* /\** OKAPI BM25 **\/ */ + +qtf_fn OKAPI_BM25_qtf;