Repositories » TXT0 Read More
Clone URL:  
Pushed to one repository · View In Graph Contained in tip

Reworked model interface, added models, Alice and AP output redone.

Changeset dfb300e6e68a

Parent 3c76be16e685

by Rup Palchowdhury

Changes to 11 files · Browse files at dfb300e6e68a Showing diff from parent 3c76be16e685 Diff from another changeset...

Change 1 of 1 Show Entire File Makefile Stacked
 
67
68
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
@@ -67,3 +67,25 @@
 .PHONY: iitest  iitest:   ./ii -s test/alice_query.t <test/alice.t | sort -k1,1 -k3,3nr | diff -q - test/alice.rank + +.PHONY: alice +alice: + ./raw2t -x -n -c TREC <test/alice.txt >/tmp/alice.t + ./raw2t -x -n -c TRECQUERY <test/alice_query.txt >/tmp/alice_query.t + ./t2mem </tmp/alice.t >/tmp/alice.mem + ./t2mem </tmp/alice_query.t >/tmp/alice_query.mem + ./ii -s /tmp/alice_query.t </tmp/alice.t >/tmp/alice.res + sort -k1,1 -k3,3nr /tmp/alice.res >/tmp/alice.rank + awk -f txt2trecrun.awk </tmp/alice.rank >/tmp/alice.run + ~/ir/trec_eval.9.0/trec_eval -q test/alice.qrel /tmp/alice.run >/tmp/alice.eval + +.PHONY: ap +ap: + ./raw2t -x -n -c TREC <test/ap.txt >/tmp/ap.t + ./raw2t -x -n -c TRECQUERY <test/ap_query.txt >/tmp/ap_query.t + ./t2mem </tmp/ap.t >/tmp/ap.mem + ./t2mem </tmp/ap_query.t >/tmp/ap_query.mem + ./ii -s /tmp/ap_query.t </tmp/ap.t >/tmp/ap.res + sort -k1,1 -k3,3nr /tmp/ap.res >/tmp/ap.rank + awk -f txt2trecrun.awk </tmp/ap.rank >/tmp/ap.run + ~/ir/trec_eval.9.0/trec_eval -q test/ap.qrel /tmp/ap.run >/tmp/ap.eval
Change 1 of 6 Show Entire File ii.c Stacked
 
173
174
175
176
 
177
178
179
 
180
181
182
 
199
200
201
202
203
204
205
206
207
208
 
227
228
229
230
231
232
233
234
235
236
237
238
 
251
252
253
254
255
256
257
258
259
260
261
262
263
264
 
283
284
285
286
287
288
 
 
 
 
289
290
291
292
293
 
 
 
 
 
 
 
 
 
294
295
296
 
314
315
316
317
 
318
319
320
321
322
323
324
325
326
 
327
328
329
 
 
330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
332
333
 
173
174
175
 
176
177
178
 
179
180
181
182
 
199
200
201
 
 
 
 
202
203
204
 
223
224
225
 
 
 
 
 
 
226
227
228
 
241
242
243
 
 
 
 
 
 
 
 
244
245
246
 
265
266
267
 
 
 
268
269
270
271
272
273
274
275
 
276
277
278
279
280
281
282
283
284
285
286
287
 
305
306
307
 
308
309
310
311
312
313
314
 
 
 
315
316
 
 
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
@@ -173,10 +173,10 @@
 void search(Query *q, Hash *hdoc, Hash *hterm, THeader *h, Model m)  {   double tf_, idf_, qtf_, w_; - Post *post, *post_, *post_t, *post_d; + Post *post_t, *post_d;   Term *term, term__;   Doc *doc, doc__; - Node *nppost, *nppost_, *npterm, node__, *npdoc, *npscore, *npscore_; + Node *npterm, node__, *npdoc, *npscore, *npscore_;   Score *score, *score_;   Hash *hresult;   @@ -199,10 +199,6 @@
    post_d = (Post *)(np1->data);   - /* post_ = newpost(post_d->id, post_d->tf * post_t->tf); */ - /* nppost_ = newnode(post_); */ - /* nppost = hlookup(hpost, nppost_, 1); */ -   doc__.id = post_d->id;   node__.data = &doc__;   hdoc->cmp = cmpdoc_p; /* dismantle cmp_fn */ @@ -227,12 +223,6 @@
  npscore_ = newnode(score_);   npscore = hlookup(hresult, npscore_, 1);   - /* if (nppost != nppost_) { /\* merge *\/ */ - /* post = (Post *)(nppost->data); */ - /* post->tf += post_->tf; */ - /* freenode(nppost_, freepost); */ - /* } */ -   if (npscore != npscore_) { /* merge */   score = (Score *)(npscore->data);   score->n += score_->n; @@ -251,14 +241,6 @@
    freehash(hresult, freescore_s);   - /* results in a list that points to nodes in the hash table */ - /* - for (Node *np = reslist; np != NULL; np = np->next) { - post = (Post *)(np->data); - printf("%d %s %d\n", n_q, post->id, post->tf); - } - */ -   /*TODO: figure out a way to free memory neatly*/   /* freelist(reslist, NULL); */   /* reslist = NULL; */ @@ -283,14 +265,23 @@
   int main(int argc, char *argv[])  { - int opt = 0; - char qfile[KB]; - + int opt_s, opt_m; + char qfile[KB], mstr[KB]; + + opt_s = opt_m = 0;   for (int i = 1; i < argc; i++) {   if (strcmp(argv[i], "-s") == 0) {   if (i + 1 <= argc - 1) {   strcpy(qfile, argv[++i]); - opt = 1; + opt_s = 1; + } + else + usage(argv[0]); + } + else if (strcmp(argv[i], "-m") == 0) { + if (i + 1 <= argc - 1) { + strcpy(mstr, argv[++i]); + opt_m = 1;   }   else   usage(argv[0]); @@ -314,20 +305,36 @@
  /* fprinthash(stdout, hterm, fprintterm); */   /* hstats(hdoc, NDOCS); */   - if (opt) { + if (opt_s) {   fpq = fopen(qfile, "r");   buildq(hq, fpq);   fclose(fpq);   /* fprinthash(stdout, hq, fprintquery); */   }   - /* m.tf = &_tf; */ - /* m.df = &_df; */ - /* m.qtf = &_qtf; */ + /* test model, default */   - m.tf = &SMART_d_b; - m.df = &SMART__t_; + m.tf = &_tf; + m.df = &_df;   m.qtf = &_qtf; + + if (opt_m) { + if (strcmp(mstr, "SMART_dtb") == 0) { + m.tf = &SMART_d_b_tf; + m.df = &SMART__t__df; + m.qtf = &_qtf; + } + else if (strcmp(mstr, "OKAPI_BM25") == 0) { + m.tf = &OKAPI_BM25_tf; + m.df = &OKAPI_BM25_df; + m.qtf = &OKAPI_BM25_qtf; + } + else if (strcmp(mstr, "SMART_bxx") == 0) { + m.tf = &SMART_b___tf; + m.df = &_df; + m.qtf = &_qtf; + } + }     for (int i = 0; i < hq->n; i++) {   if (hq->tab[i] == NULL)
Change 1 of 1 Show Entire File test/​alice.mem Stacked
 
1
2
3
4
 
 
 
 
5
6
7
8
9
10
11
 
 
 
12
13
14
15
16
17
18
 
 
 
19
20
21
22
23
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
 
 
10
11
12
13
14
15
16
17
 
 
18
19
20
21
22
23
24
25
26
@@ -1,23 +1,26 @@
-MG CRC V T BYTES N R BO -41 2797950090 1 0 622 3 3 32 -CRC TXT ID RSRC0 RSRC1 RSRC2 -2702988819 181 2 2 2 2 +MG CRC V T BYTES N SUMNB SUMNU SUMNT R BO +41 2930072002 1 0 649 3 494 63 70 4 44 +CRC TXT ID RSRC0 RSRC1 RSRC2 RSRC3 +1210775004 181 2 2 2 2 1  tdoc->id: A.  tdoc->txt: rude 1 open 1 curios 1 look 1 cut 1 alic 2 learn 1 hear 1 desk 1 veri 2 remark 1 great 1 hair 1 sever 1 wide 1 first 1 hatter 2 speech 1 write 1 ey 1 person 1 time 1 raven 1 make 1 .  tdoc->rsrc[0]: 27.  tdoc->rsrc[1]: 24.  tdoc->rsrc[2]: 33. -CRC TXT ID RSRC0 RSRC1 RSRC2 -1863705286 187 2 2 2 2 +tdoc->rsrc[3]: 2. +CRC TXT ID RSRC0 RSRC1 RSRC2 RSRC3 +3853360751 187 2 2 2 2 1  tdoc->id: B.  tdoc->txt: adventur 1 issu 1 wrong 2 alic 1 1897 1 nevar 1 front 2 never 2 carrol 1 veri 1 propos 1 final 1 flat 1 note 1 himself 1 earli 1 spell 1 lewi 1 answer 1 produc 1 raven 1 though 1 revis 2 .  tdoc->rsrc[0]: 27.  tdoc->rsrc[1]: 23.  tdoc->rsrc[2]: 35. -CRC TXT ID RSRC0 RSRC1 RSRC2 -711335375 126 2 2 2 2 +tdoc->rsrc[3]: 2. +CRC TXT ID RSRC0 RSRC1 RSRC2 RSRC3 +3471635844 126 2 2 2 2 1  tdoc->id: C.  tdoc->txt: alic 1 both 1 dip 1 more 1 gave 1 1990 1 martin 1 gardner 1 flap 1 sent 1 answer 1 quill 1 reader 1 annot 1 possibl 1 slope 1 .  tdoc->rsrc[0]: 16.  tdoc->rsrc[1]: 16.  tdoc->rsrc[2]: 16. +tdoc->rsrc[3]: 1.
Change 1 of 1 Show Entire File test/​alice.rank Stacked
 
1
2
3
4
5
6
7
8
9
10
11
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
@@ -1,12 +1,12 @@
-1 A 7 -1 B 3 -1 C 1 -2 B 3 -2 A 2 -2 C 1 -3 A 6 -3 B 2 -3 C 2 -4 C 5 -4 A 2 -4 B 1 +1 A 7.000000 +1 B 3.000000 +1 C 1.000000 +2 B 3.000000 +2 A 2.000000 +2 C 1.000000 +3 A 6.000000 +3 B 2.000000 +3 C 2.000000 +4 C 5.000000 +4 A 2.000000 +4 B 1.000000
Change 1 of 1 Show Entire File test/​alice.res Stacked
 
1
2
3
4
5
6
7
8
9
10
11
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
@@ -1,12 +1,12 @@
-1 A 7 -1 B 3 -1 C 1 -2 A 2 -2 B 3 -2 C 1 -3 A 6 -3 B 2 -3 C 2 -4 C 5 -4 A 2 -4 B 1 +1 A 7.000000 +1 B 3.000000 +1 C 1.000000 +2 A 2.000000 +2 B 3.000000 +2 C 1.000000 +3 A 6.000000 +3 B 2.000000 +3 C 2.000000 +4 A 2.000000 +4 B 1.000000 +4 C 5.000000
Change 1 of 1 Show Entire File test/​alice.run Stacked
 
1
2
3
4
5
6
7
8
9
10
11
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
@@ -1,12 +1,12 @@
-1 Q0 A 0 7 . -1 Q0 B 1 3 . -1 Q0 C 2 1 . -2 Q0 B 0 3 . -2 Q0 A 1 2 . -2 Q0 C 2 1 . -3 Q0 A 0 6 . -3 Q0 B 1 2 . -3 Q0 C 2 2 . -4 Q0 C 0 5 . -4 Q0 A 1 2 . -4 Q0 B 2 1 . +1 Q0 A 0 7.000000 . +1 Q0 B 1 3.000000 . +1 Q0 C 2 1.000000 . +2 Q0 B 0 3.000000 . +2 Q0 A 1 2.000000 . +2 Q0 C 2 1.000000 . +3 Q0 A 0 6.000000 . +3 Q0 B 1 2.000000 . +3 Q0 C 2 2.000000 . +4 Q0 C 0 5.000000 . +4 Q0 A 1 2.000000 . +4 Q0 B 2 1.000000 .
 
 
 
1
2
3
4
 
 
 
 
5
6
7
8
9
10
11
 
 
 
12
13
14
15
16
17
18
 
 
 
19
20
21
22
23
24
25
 
 
 
26
27
28
29
30
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
 
 
10
11
12
13
14
15
16
17
 
 
18
19
20
21
22
23
24
25
 
 
26
27
28
29
30
31
32
33
34
@@ -1,30 +1,34 @@
-MG CRC V T BYTES N R BO -41 3101715852 1 0 368 4 3 32 -CRC TXT ID RSRC0 RSRC1 RSRC2 -944899761 63 2 1 1 1 +MG CRC V T BYTES N SUMNB SUMNU SUMNT R BO +41 194642396 1 0 400 4 220 26 26 4 44 +CRC TXT ID RSRC0 RSRC1 RSRC2 RSRC3 +3472369368 63 2 1 1 1 1  tdoc->id: 1.  tdoc->txt: adventur 1 alic 1 desk 1 hatter 1 wonderland 1 write 1 raven 1 .  tdoc->rsrc[0]: 7.  tdoc->rsrc[1]: 7.  tdoc->rsrc[2]: 7. -CRC TXT ID RSRC0 RSRC1 RSRC2 -497613859 48 2 1 1 1 +tdoc->rsrc[3]: 1. +CRC TXT ID RSRC0 RSRC1 RSRC2 RSRC3 +3441830555 48 2 1 1 1 1  tdoc->id: 2.  tdoc->txt: puzzl 1 carrol 1 desk 1 lewi 1 answer 1 write 1 .  tdoc->rsrc[0]: 6.  tdoc->rsrc[1]: 6.  tdoc->rsrc[2]: 6. -CRC TXT ID RSRC0 RSRC1 RSRC2 -85283453 68 2 1 1 1 +tdoc->rsrc[3]: 1. +CRC TXT ID RSRC0 RSRC1 RSRC2 RSRC3 +2269894019 68 2 1 1 1 1  tdoc->id: 3.  tdoc->txt: pose 1 alic 1 puzzl 1 desk 1 hatter 1 wonderland 1 answer 1 write 1 .  tdoc->rsrc[0]: 8.  tdoc->rsrc[1]: 8.  tdoc->rsrc[2]: 8. -CRC TXT ID RSRC0 RSRC1 RSRC2 -32326634 41 2 1 1 1 +tdoc->rsrc[3]: 1. +CRC TXT ID RSRC0 RSRC1 RSRC2 RSRC3 +864232047 41 2 1 1 1 1  tdoc->id: 4.  tdoc->txt: alic 1 more 1 martin 1 gardner 1 annot 1 .  tdoc->rsrc[0]: 5.  tdoc->rsrc[1]: 5.  tdoc->rsrc[2]: 5. +tdoc->rsrc[3]: 1.
 
 
Change 1 of 1 Show Entire File txt.c Stacked
 
335
336
337
338
 
339
340
341
342
343
 
 
 
 
344
345
346
347
 
 
 
 
 
 
 
 
348
349
350
351
 
 
 
 
 
 
352
353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
355
 
356
357
358
359
360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
362
363
 
 
 
364
365
 
366
367
 
335
336
337
 
338
339
 
 
 
 
340
341
342
343
344
 
 
 
345
346
347
348
349
350
351
352
353
354
355
 
356
357
358
359
360
361
362
 
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
 
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
 
414
415
416
417
 
418
419
420
@@ -335,33 +335,86 @@
  return (double)qtf;  }   -/** SMART dtb **/ +/** SMART bxx **/   -double SMART_d_b(uint32_t tf, uint32_t sumtf, uint32_t uterm, - uint32_t sumsqtf, uint32_t maxtf, uint32_t nb, - uint32_t sumnb, uint32_t sumnu, uint32_t sumnt, - uint32_t n) +double SMART_b___tf(uint32_t tf, uint32_t sumtf, uint32_t uterm, + uint32_t sumsqtf, uint32_t maxtf, uint32_t nb, + uint32_t sumnb, uint32_t sumnu, uint32_t sumnt, + uint32_t n)  { - static const double ln2 = 0.69314718; - double tf_; - tf_ = (1.0 + log(1.0 + log(tf))) / (0.8 * (sumnb / n) + 0.2 * nb); + /* + s = 0.2 + tf_ = (1 + log(1 + log(tf))) / ((1 - s) * avg(dl) + s * dl) + */ + + double tf_ = 0.0; + if (tf > 0) + tf_ = 1.0;   return tf_;  }   -double SMART__t_(uint32_t df, uint32_t n) +/** SMART dtb **/ + +double SMART_d_b_tf(uint32_t tf, uint32_t sumtf, uint32_t uterm, + uint32_t sumsqtf, uint32_t maxtf, uint32_t nb, + uint32_t sumnb, uint32_t sumnu, uint32_t sumnt, + uint32_t n)  { - static const double ln2 = 0.69314718; + /* + s = 0.2 + tf_ = (1 + log(1 + log(tf))) / ((1 - s) * avg(dl) + s * dl) + */ + + double tf_; + tf_ = (1.0 + log(1.0 + log((double)tf))) / (0.8 * ((double)sumnb / (double)n) + 0.2 * (double)nb); + return tf_; +} + +double SMART__t__df(uint32_t df, uint32_t n) +{ + /* + df_ = log((n + 1) / df) + */ +   double df_; - df_ = log((n + 1) / df); + df_ = log(((double)n + 1.0) / (double)df);   return df_;  }    /** OKAPI BM25 **/   +double OKAPI_BM25_tf(uint32_t tf, uint32_t sumtf, uint32_t uterm, + uint32_t sumsqtf, uint32_t maxtf, uint32_t nb, + uint32_t sumnb, uint32_t sumnu, uint32_t sumnt, + uint32_t n) +{ + /* + K1 = 2 + b = 0.8 + tf_ = ((K1 + 1) * tf) / (K1 * (1 - b + b * (dl / avg(dl))) + tf); + */ + + double tf_; + tf_ = 3.0 * (double)tf / (0.4 + 1.6 * (double)nb * (double)n / (double)sumnb + (double)tf); + return tf_; +} + +double OKAPI_BM25_df(uint32_t df, uint32_t n) +{ + /* + df_ = log((n - df + 0.5) / (df + 0.5)) + */ + double df_; + df_ = log(((double)n - (double)df + 0.5) / ((double)df + 0.5)); + return df_; +} +  double OKAPI_BM25_qtf(uint32_t qtf)  { - static const double K3 = 1000.0; + /* K3 = 1000 */ + /* qtf_ = ((K3 + 1) * qtf) / (K3 + qtf) */ +   double qtf_; - qtf_ = ((K3 + 1.0) * qtf) / (K3 + qtf); + qtf_ = 1001.0 * (double)qtf / (1000.0 + (double)qtf);   return qtf_;  }
Change 1 of 1 Show Entire File txt.h Stacked
 
93
94
95
96
 
97
98
99
100
101
102
 
103
104
105
 
106
107
 
108
 
 
 
 
 
 
 
109
 
93
94
95
 
96
97
98
99
100
101
 
102
103
 
 
104
105
 
106
107
108
109
110
111
112
113
114
115
@@ -93,17 +93,23 @@
   /* Models */   -/* /\** test **\/ */ +/** test **/    tf_fn _tf;  df_fn _df;  qtf_fn _qtf;   -/* /\** SMART dtb **\/ */ +/** SMART bxx **/   -tf_fn SMART_d_b; -df_fn SMART__t_; +tf_fn SMART_b___tf;   -/* /\** OKAPI BM25 **\/ */ +/** SMART dtb **/   +tf_fn SMART_d_b_tf; +df_fn SMART__t__df; + +/** OKAPI BM25 **/ + +tf_fn OKAPI_BM25_tf; +df_fn OKAPI_BM25_df;  qtf_fn OKAPI_BM25_qtf;