Repositories » TXT0
Clone URL:  
Pushed to one repository · View In Graph Contained in tip

Parse-index-search tool-chain.

Almost everything has changed since the last commit. As the hash table
interface in libk developed, ii.c, which used the BST, was retired
(ii-bst.c) and modified to use a hash table. The parser changed, as
the tokenized (.t) format now stores 'term frequency' pairs (a
document vector). This was possible using a hash table from
libk. Changes to the .t file format led to changes to t2mem and ii.c
where .t files are read into memory.

The parser now tokenizes a query to a .t format too. ii.c uses a
tokenized query to perform the lookup (search) and print out
results. The ranking of the search results has been left to the UNIX
'sort' program (to keep things simple in ii.c), and another awk script,
txt2trecrun.awk, converts the output to a format used for TREC runs.

Changeset f651a75b0efa

Parent d6a0854ef449

by Rup Palchowdhury

Changes to 25 files · Browse files at f651a75b0efa Showing diff from parent d6a0854ef449 Diff from another changeset...

Change 1 of 2 Show Entire File Makefile Stacked
 
19
20
21
 
22
23
24
 
 
25
26
27
28
 
29
30
31
 
68
69
70
71
72
 
 
 
73
74
75
76
77
78
 
 
19
20
21
22
23
24
 
25
26
27
28
29
 
30
31
32
33
 
70
71
72
 
 
73
74
75
76
77
 
 
 
 
78
@@ -19,13 +19,15 @@
  tfile.c \   parser.c \   crc.c \ + porter.c \   post.c \   term.c \ - doc.c + doc.c \ + query.c    OBJ = $(patsubst %.c,$(O)/%.o,$(SRC))   -all: raw2t t2mem ii ii1 +all: raw2t t2mem ii1    qparse: $(O)/qparse.o $(OBJ)   $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) -o $@ $^ $(LIBS) @@ -68,11 +70,9 @@
   .PHONY: parsetest  parsetest: - ./raw2t <test/tiny.txt | ./t2mem | diff -q - test/tiny.mem - ./raw2t <test/big.txt | ./t2mem | diff -q - test/big.mem + ./raw2t -x -n -c TREC <test/alice.txt | ./t2mem | diff -q - test/alice.mem + ./raw2t -x -n -c TRECQUERY <test/alice_query.txt | ./t2mem | diff -q - test/alice_query.mem +  .PHONY: iitest  iitest: - ./ii <test/tiny.t | diff -q - test/tiny.ii - ./ii <test/big.t | diff -q - test/big.ii - ./ii1 <test/tiny.t | sort -t":" -k1,1 | diff -q - test/tiny.ii - ./ii1 <test/big.t | sort -t":" -k1,1 | diff -q - test/big.ii + ./ii1 -s test/alice_query.t <test/alice.t | diff -q - test/alice.res
Change 1 of 1 Show Entire File doc.c Stacked
 
 
1
2
3
4
5
 
6
7
8
9
10
11
 
 
 
 
12
13
14
15
 
 
 
 
 
 
 
 
 
16
17
18
19
20
21
22
 
23
24
25
26
27
 
28
29
30
31
32
33
 
34
35
36
 
 
 
 
 
 
 
 
1
2
3
4
5
 
6
7
8
9
 
 
 
10
11
12
13
14
15
16
 
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
 
32
33
34
35
36
 
37
38
39
40
 
41
 
42
43
44
45
46
47
48
49
50
51
52
@@ -1,36 +1,52 @@
+#include <stdio.h>  #include <stdlib.h>  #include <string.h>  #include "doc.h"   -Doc *newdoc(char *id, unsigned n_byte, unsigned n_term) +Doc *newdoc(char *id, uint32_t n_term, uint32_t n_uterm, uint32_t n_byte)  {   Doc *d;   d = (Doc *)malloc(sizeof(Doc)); - d->id = id; - d->n_byte = n_byte; - d->n_term = n_term; + d->id = strdup(id); + d->n_term = n_term; + d->n_uterm = n_uterm; + d->n_byte = n_byte;   return d;  }   -int p_doccmp(void *d1, void *d2) +void freedoc(void *d) +{ + if (d == NULL) + return; + free(((Doc *)d)->id); + free(d); +} + +int p_cmpdoc(void *d1, void *d2)  {   if (((Doc *)d1)->id == ((Doc *)d2)->id)   return 0;   return 1;  }   -int doccmp(void *d1, void *d2) +int cmpdoc(void *d1, void *d2)  {   return strcmp(((Doc *)d1)->id, ((Doc *)d2)->id);  }   -unsigned dochash(void *data, unsigned hsize) +unsigned hashdoc(void *data, unsigned hsize)  {   static unsigned MULTIPLIER = 31;   unsigned h; - char *p;   h = 0; - for (p = (char *)(((Doc *)data)->id); *p != '\0'; p++) + for (char *p = ((Doc *)data)->id; *p != '\0'; p++)   h = MULTIPLIER * h + *p;   return h % hsize;  } + +void fprintfdoc(FILE *stream, void *data) +{ + Doc *d; + d = (Doc *)data; + fprintf(stream, " %s:%u:%u:%u", d->id, d->n_term, d->n_uterm, d->n_byte); +}
Change 1 of 1 Show Entire File doc.h Stacked
 
2
3
4
5
6
 
 
 
7
8
9
10
11
 
 
 
 
 
 
 
2
3
4
 
 
5
6
7
8
9
 
 
 
10
11
12
13
14
15
@@ -2,10 +2,14 @@
   struct Doc {   char *id; - unsigned n_term; - unsigned n_byte; + uint32_t n_term; + uint32_t n_uterm; + uint32_t n_byte;  };   -Doc *newdoc(char*, unsigned, unsigned); -int doccmp(void*, void*); -unsigned dochash(void*, unsigned); +Doc *newdoc(char*, uint32_t, uint32_t, uint32_t); +void freedoc(void*); +int cmpdoc(void*, void*); +unsigned hashdoc(void*, unsigned); +void fprintdoc(FILE*, void*); +
Change 1 of 1 Show Entire File ii.c Stacked
 
1
2
3
4
5
6
 
7
8
9
10
11
12
13
14
15
 
16
17
18
 
 
19
20
 
21
22
23
24
25
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
28
29
30
31
 
 
32
33
34
35
36
37
38
39
40
41
 
42
43
44
 
45
46
47
 
 
48
49
 
50
51
52
53
 
54
55
56
57
58
 
 
 
 
 
 
59
60
61
62
63
64
65
 
66
67
68
69
70
71
72
 
73
74
75
76
 
 
 
 
 
77
78
79
80
81
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
 
 
 
110
111
 
 
 
 
 
 
112
 
 
 
 
 
113
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
116
117
118
 
 
 
 
 
 
119
120
 
 
 
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
123
124
125
126
 
 
127
128
129
 
 
 
1
2
3
4
5
6
7
 
8
9
10
11
12
13
14
15
 
 
16
17
18
 
19
20
 
 
 
 
 
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
 
 
 
 
100
101
102
 
 
 
 
 
 
 
 
 
103
104
105
 
106
107
 
 
108
109
110
 
111
112
113
114
 
115
116
117
 
118
 
119
120
121
122
123
124
125
 
 
 
 
 
 
126
127
 
 
 
 
 
 
128
129
 
 
 
130
131
132
133
134
135
 
 
 
 
 
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
 
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
 
 
264
265
266
267
268
269
270
 
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
 
 
293
294
295
296
297
@@ -1,129 +1,297 @@
-/*TODO: ii.c is broken, khash interfaces have changed. Won't compile. */ -  #include <stdio.h>  #include <stdlib.h>  #include <string.h>  #include <assert.h> +#include <kak/kcommon.h>  #include <kak/klog.h>  #include <kak/klist.h> -#include <kak/ktree.h>  #include <kak/khash.h>  #include "post.h"  #include "term.h"  #include "tokenizer.h"  #include "tfile.h"  #include "doc.h" +#include "query.h"   -enum {NTERMS = 100003, NDOCS = 100003}; -/* enum {NTERMS = 10007, NDOCS = 4093}; */ +enum {NTERMS = 100003, NDOCS = 100003, NHASH = 4093}; +/* enum {NTERMS = 10007, NDOCS = 4093, NHASH = 4093}; */   -int main(void) +void buildq(Hash *hq, FILE *fp)  { - int i_; - TDoc *tdoc; - TNode *tree; - Post *post; - Term *term; + TFile *tfile; + TDoc *tdoc; + Post *post_; + Node *np, *npq_, *npq, *nppost_; + Query *q_, *q; + uint32_t k, k_, n_term, n_uterm, tf; + char ts[KB]; + + /* read in the query file */ + + tfile = readTFile(fp); + if (tfile == NULL) { + fprintf(stderr, "readTFile() failed\n"); + exit(1); + } + + for (np = tfile->list; np != NULL; np = np->next) { /* for each query */ + + tdoc = np->data; + + sscanf(tdoc->rsrc[0], "%u", &n_term); + sscanf(tdoc->rsrc[1], "%u", &n_uterm); + + q_ = newquery(strdup(tdoc->id)); + q_->n_term = n_term; + q_->n_uterm = n_uterm; + + npq_ = newnode(q_); + npq = hlookup(hq, npq_, 1); + q = (Query *)npq->data; + /* needn't check for duplicates, query IDs are unique */ + + k = k_ = 0; + + for (int j = 0; j < n_uterm; j++) { /* for each term in query vector */ + + /* pick a term */ + for(; tdoc->txt[k] != ' '; k++); + memcpy(ts, &(tdoc->txt[k_]), k - k_); + ts[k - k_] = '\0'; + k++; k_ = k; + + /* and then pick its frequency */ + for(; tdoc->txt[k] != ' '; k++); + sscanf(&(tdoc->txt[k_]), "%u", &tf); + k++; k_ = k; + + /* pack them in a Post, attach to a list */ + post_ = newpost(ts, tf); + nppost_ = newnode(post_); + q->tlist = addfront(q->tlist, nppost_); + /* needn't check for duplicates, terms are unique */ + } + } + freeTFile(tfile, 
TRECQUERY); +} + +void bubblesort(Post *p[NDOCS], int nel, int (*cmp)(const void*, const void*)) +{ + Post *tmp; + for (int i = 0; i < nel - 1; i++) { + for (int j = i + 1; j < nel; j++) { + if ((*cmp)((const void *)(p[i]), (const void *)(p[j])) > 0) { + tmp = p[i]; + p[i] = p[j]; + p[j] = tmp; + } + } + } +} + +void buildii(Hash *hdoc, Hash *hterm, FILE *fp) +{ + int pflag_, tflag_; + uint32_t k, k_, n_term, n_uterm, tf; + char ts[KB]; + THeader *h; + TDoc *tdoc;   Doc *doc; - unsigned n_doc, n_term; - char *p_txt, *ts; - THeader *h; - Node *doctab[NDOCS], *termtab[NTERMS]; + Post *post, *post_; + Term *term, *term_;   Node *npdoc, *npdoc_, *npterm, *npterm_; - - - for (int i = 0; i < NDOCS; i++) - doctab[i] = NULL; - for (int i = 0; i < NTERMS; i++) - termtab[i] = NULL; - - tree = NULL; - +   h = _newTHeader(TREC);   - if ((h = readTHeader(h, stdin)) == NULL) + if ((h = readTHeader(h, fp)) == NULL)   exit(0); - - for(n_doc = 1; n_doc <= h->n; n_doc++) { + + for(int i = 1; i <= h->n; i++) { /* for each doc */   - fprintf(stderr, "%d/%d\r", n_doc, h->n); + fprintf(stderr, "\r%d/%d", i, h->n);     tdoc = newTDoc(TREC);   - if ((tdoc = readTDoc(tdoc, stdin)) == NULL) + if ((tdoc = readTDoc(tdoc, fp, TREC)) == NULL)   exit(0);   - tdoc->id[tdoc->h->n_id] = '\0';   sscanf(tdoc->rsrc[0], "%u", &n_term); - doc = newdoc(strdup(tdoc->id), tdoc->h->n_txt, n_term); + sscanf(tdoc->rsrc[1], "%u", &n_uterm); + doc = newdoc(tdoc->id, n_term, n_uterm, tdoc->h->n_txt); + npdoc_ = newnode(doc); + npdoc = hlookup(hdoc, npdoc_, 1); + if (npdoc != npdoc_) /* a repeating doc id */ + freenode(npdoc_, freedoc);   - npdoc_ = newnode(doc); - npdoc = hlookup(doctab, npdoc_, 1, hashdoc, NDOCS, doccmp); - if (npdoc != npdoc_) { - free(doc); - free(npdoc_); - } + k = k_ = 0;   - i_ = 0; - p_txt = tdoc->txt; - - for (int i = 0; i < tdoc->h->n_txt; i++) { - - if (tdoc->txt[i] == ' ') { + for (int j = 0; j < n_uterm; j++) { /* for each term in doc vector */   - ts = (char *)malloc(i - i_ + 1); - 
memcpy(ts, p_txt, i - i_); - ts[i - i_] = '\0'; + /* pick a term */ + for(; tdoc->txt[k] != ' '; k++); + memcpy(ts, &(tdoc->txt[k_]), k - k_); + ts[k - k_] = '\0'; + k++; k_ = k;   - npterm_ = newnode(ts); - npterm = hlookup(termtab, npterm_, 1, hash, NTERMS, strcmp); - if (npterm != npterm_) { - free(ts); - free(npterm_); + /* and then pick its frequency */ + for(; tdoc->txt[k] != ' '; k++); + sscanf(&(tdoc->txt[k_]), "%u", &tf); + k++; k_ = k; + + /* amass a payload */ + pflag_ = tflag_ = 0; + post_ = newpost_s(((Doc *)(npdoc->data))->id, tf); + term_ = newterm(ts, 1, post_); + npterm_ = newnode(term_); + npterm = hlookup(hterm, npterm_, 1); + + /* drop it on the inverted index */ + if (npterm != npterm_) { /* term exists */ + tflag_ = 1; /* mark to free */ + term = (Term *)(npterm->data); + post = (Post *)(term->plist->data); + if (cmppost_p(post_, post) == 0) { /* post exists */ + post->tf += post_->tf; + pflag_ = 1; /* mark to free */   } - - post = newpost((char *)(((Doc *)(npdoc->data))->id), 1); - term = newterm((char *)(npterm->data), 1, post); - - /* if term exists - if post exists at front of list - post->tf++ - else - add new post at front of list - term->df++ - else - add term to tree - */ - - tree = nrinsert(tree, newtnode((void *)term), termcmp, - term_match_handler); - - i_ = i + 1; - p_txt = &(tdoc->txt[i + 1]); - - if (post->ref == 0) { - free(post); - free(term->plist); - } - if (term->ref == 0) { - free(term); + else { /* attach a new post */ + term->plist = addfront(term->plist, term_->plist); + term->df++;   }   } + + /* cleanup */ + if (pflag_) + freenode(term_->plist, freepost_s); + if (tflag_) + free(term_);   } + freeTDoc(tdoc, TREC); + } + _freeTHeader(h); + fprintf(stderr, "\n"); +}   - freeTDoc(tdoc); +void search(Query *q, Hash *hdoc, Hash *hterm) +{ + uint32_t n_d; + float idf; + Post *post, *post_, *post_t, *post_d; + Term *term, term__; + Node *nppost, *nppost_, *npterm, node__; + Post *res[NDOCS]; /*TODO: use a list in stead? 
*/ + Hash *hpost; + + hpost = newhash(NDOCS, cmppost, hashpost); + n_d = 0; + + for (Node *np = q->tlist; np != NULL; np = np->next) { /* for a term */ + + post_t = (Post *)(np->data); + term__.s = post_t->id; + node__.data = &term__; + npterm = hlookup(hterm, &node__, 0); + + if (npterm == NULL) + continue; + + term = (Term *)(npterm->data); + + for (Node *np1 = term->plist; np1 != NULL; np1 = np1->next) { + post_d = (Post *)(np1->data); + post_ = newpost(post_d->id, post_d->tf * post_t->tf); + nppost_ = newnode(post_); + nppost = hlookup(hpost, nppost_, 1); + if (nppost != nppost_) { /* merge */ + post = (Post *)(nppost->data); + post->tf += post_->tf; + free(nppost_); + } + else + res[n_d++] = post_; + } + } + + for (int i = 0; i < n_d; i++) + printf("%s %s %d\n", q->id, res[i]->id, res[i]->tf); + + /* results in a list that points to nodes in the hash table */ + /* + for (Node *np = reslist; np != NULL; np = np->next) { + post = (Post *)(np->data); + printf("%d %s %d\n", n_q, post->id, post->tf); + } + */ + + /*TODO: figure out a way to free memory neatly*/ + /* freelist(reslist, NULL); */ + /* reslist = NULL; */ + + /* results in a hash table */ + /* for (int i = 0; i < NDOCS; i++) { */ + /* if (hpost[i] == NULL) */ + /* continue; */ + /* for (Node *np = hpost[i]; np != NULL; np = np->next) { */ + /* post = (Post *)(np->data); */ + /* /\* post->tf /= idf; *\/ /\*FIX: post->tf is int *\/ */ + /* printf("%d %s %d\n", n_q, post->id, post->tf); */ + /* } */ + /* } */ +} + +void usage(char *progname) +{ + fprintf(stderr, "usage: %s [-s query]\n", progname); + exit(1); +} + +int main(int argc, char *argv[]) +{ + int opt = 0; + char qfile[KB]; + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-s") == 0) { + if (i + 1 <= argc - 1) { + strcpy(qfile, argv[++i]); + opt = 1; + } + else + usage(argv[0]); + }   }   - fprintf(stderr, "%d/%d\n", n_doc - 1, h->n); - _freeTHeader(h); + Hash *hdoc, *hterm, *hq; + FILE *fpq; + + hdoc = newhash(NDOCS, cmpdoc, hashdoc); + 
hterm = newhash(NTERMS, cmpterm, hashterm); + hq = newhash(NHASH, cmpquery, hashquery);   - /* applyinorder(tree, printtree, "%s:%d "); */ + buildii(hdoc, hterm, stdin); + /* fprinthash(stdout, hterm, fprintterm); */ + /* hstats(hdoc, NDOCS); */   + if (opt) { + fpq = fopen(qfile, "r"); + buildq(hq, fpq); + fclose(fpq); + /* fprinthash(stdout, hq, fprintquery); */ + } + + for (int i = 0; i < hq->n; i++) { + if (hq->tab[i] == NULL) + continue; + /* fprintquery(stdout, hq->tab[i]->data); */ + /* fprintf(stdout, "\n"); */ + search(hq->tab[i]->data, hdoc, hterm); + } +   /* fprintf(stderr, "\n"); */   /* fprintf(stderr, "doctab[]\n"); */   /* hstats(doctab, NDOCS); */ - /* fprintf(stderr, "termtab[]\n"); */ - /* hstats(termtab, NTERMS); */ + /* fprintf(stderr, "hterm[]\n"); */ + /* hstats(hterm, NTERMS); */     return 0;  }
Change 1 of 4 Show Entire File parser.c Stacked
 
3
4
5
 
6
 
7
 
8
9
 
10
11
12
 
37
38
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
41
 
42
43
44
 
45
46
47
 
48
49
50
51
52
53
 
 
 
54
55
 
 
 
56
57
58
59
60
61
62
63
64
65
66
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
69
70
71
 
 
72
73
 
 
 
 
 
74
75
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
78
79
80
 
81
82
 
 
 
 
 
 
83
84
85
 
90
91
92
93
94
95
 
 
96
97
98
99
100
 
 
101
 
 
 
 
 
 
 
 
102
103
104
105
 
 
 
 
 
 
106
107
108
109
110
111
112
113
 
 
 
 
 
 
 
114
115
116
117
118
119
120
121
122
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
125
126
127
128
 
 
 
129
130
131
 
146
147
148
149
 
150
151
152
 
3
4
5
6
7
8
9
10
11
12
13
14
15
16
 
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
 
115
116
117
 
118
119
120
 
121
122
123
124
 
 
 
125
126
127
128
129
130
131
132
133
134
135
 
 
 
 
 
 
 
 
 
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
 
 
 
151
152
153
 
154
155
156
157
158
159
 
 
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
 
204
205
206
 
 
 
207
208
209
 
 
 
 
210
211
212
213
214
215
216
217
218
219
220
221
 
 
 
222
223
224
225
226
227
228
229
 
 
 
 
 
 
230
231
232
233
234
235
236
237
238
 
 
 
 
 
 
 
 
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
 
265
266
267
268
269
270
 
285
286
287
 
288
289
290
291
@@ -3,10 +3,14 @@
 #include <string.h>  #include <stdint.h>  #include <malloc/malloc.h> +#include <kak/kcommon.h>  #include <kak/klist.h> +#include <kak/khash.h>  #include "crc.h" +#include "stemmer.h"  #include "tokenizer.h"  #include "tfile.h" +#include "post.h"  #include "parser.h"    static char *str = "width=32 poly=0x04c11db7 init=0x00000000 refin=false refout=false xorout=0xffffffff check=0x765e7680 name=\"CRC-32/POSIX\""; @@ -37,49 +41,159 @@
  crc_m.name = NULL;  }   +Parser *newparser(E_TDocType type, int c, int n, int x) +{ + /*TODO: path hard-wired, set it elsewhere */ + + Node *npterm_; + Parser *p; + FILE *fp; + char s[KB]; + char *cfile = "test/common_words.txt"; + + p = (Parser *)malloc(sizeof(Parser)); + p->type = type; + + switch (type) { + case TREC: + p->n_tag = 0; + strcpy(p->endtag, "doc"); + strcpy(p->idtag, "docno"); + break; + case WARC: + fprintf(stderr, "A parser for WARC isn't available.\n"); + exit(1); + break; + case TRECQUERY: + p->n_tag = 0; + strcpy(p->endtag, "top"); + strcpy(p->idtag, "num"); + break; + default: + fprintf(stderr, "Unknown parser type\n"); + exit(1); + } + + p->c = 0; p->hterm = NULL; + p->n = 0; p->stemmer = NULL; + p->x = 0; + + if (c) { + p->c = c; + p->hterm = newhash(NHASH, _strcmp, _strhash); + fp = fopen(cfile, "r"); + while (fgets(s, KB, fp)) { + if (s[strlen(s)-1] == '\n') + s[strlen(s)-1] = '\0'; + npterm_ = newnode(strdup(s)); + hlookup(p->hterm, npterm_, 1); + /* assuming incoming terms are unique */ + } + fclose(fp); + } + if (n) { + p->n = n; + p->stemmer = create_stemmer(); + } + if (x) { + p->x = x; + } + + return p; +} + +void freeparser(Parser *p) +{ + if (p == NULL) + return; + freehash(p->hterm, free); + free_stemmer(p->stemmer); + free(p); +} +  /* TODO: - parser() assumes a TREC-type TFile + parse() assumes a TREC-type TFile   use sizeof() var in stead of type  */ -TFile* parse(FILE *fp) +TFile* parse(Parser *parser, FILE *fp)  {   Tokenizer *toktxt, *tokid; - Token token; + Token *token;   Stack mem;   TFile *tfile;   TDoc *tdoc; - unsigned lowmem, bytesread; - uint32_t crc, n_term; - int n, ntxtbuf; + unsigned lowmem, b_read; + uint32_t crc, n_term, n_txt, n_uterm; + int n, txtsize, m, intact;   char *septxt = " ;,.:`'\"?!(){}[]<>~^&*_-+=#$%@|\\/";   char *sepid = " <>"; + Hash *hpost; + Node *nppost, *nppost_, *npterm, node__; + Post *post, *post_;     __initcrc();   reset(&mem, 3); - toktxt = newtokenizer(septxt, LOWERCASE, &mem); - 
tokid = newtokenizer(sepid, KEEPCASE, &mem); - lowmem = 1024; - n_term = 0; - bytesread = 0; - tfile = newTFile(TREC); - tdoc = newTDoc(TREC); - token.str = tdoc->txt; - ntxtbuf = BUFSIZE; + toktxt = newtokenizer(septxt, LOWERCASE, &mem); + tokid = newtokenizer(sepid, KEEPCASE, &mem); + hpost = newhash(NHASH, cmppost, hashpost); + token = newtoken(NULL, KB); + lowmem = KB; + n_term = n_uterm = b_read = 0; + tfile = newTFile(TREC); + tdoc = newTDoc(TREC); + txtsize = TXTBUFSIZE; + intact = 1; + + while((n = gettoken(token, fp, toktxt)) > 0) { + + b_read += n;   - while((n = gettoken(&token, fp, toktxt)) > 0) { - - bytesread += n; + /* at end of a unit of document */ + if ((token->type == CTAG) && (strcmp(token->str, parser->endtag) == 0)) {   - if ((token.type == CTAG) && (strcmp(token.str, "doc") == 0)) { + /* at this point, tdoc->id & tdoc->h->n_id has + * been filled already */ + + if (tdoc->h->n_id <= 0) + intact = 0;   - /* fill TDoc->rsrc[i], TDoc->txt and TDoc->id - * are complete */ + /* fill tdoc->txt and empty the hash */ + n_txt = 0; + for (int i = 0; i < hpost->n; i++) { + for (Node *np = hpost->tab[i]; np != NULL; np = np->next) { + post = (Post *)(np->data); + m = sprintf(&(tdoc->txt[n_txt]), "%s %u ", + post->id, post->tf); + n_txt += m; + if (txtsize - n_txt < lowmem) + tdoc->txt = realloc(tdoc->txt, txtsize <<= 1); + n_uterm++; + n_term += post->tf; + } + } + freehash(hpost, freepost); + hpost = newhash(NHASH, cmppost, hashpost); + + /* fill tdoc->h->n_txt */ + tdoc->h->n_txt = n_txt; + + if (tdoc->h->n_txt <= 0) + intact = 0; + + /* fill TDoc->rsrc[i] */     tdoc->h->n_rsrc[0] = sprintf(tdoc->rsrc[0], "%u", n_term);   if (tdoc->h->n_rsrc[0] == -1)   tdoc->h->n_rsrc[0] = 0; +   n_term = 0;   + tdoc->h->n_rsrc[1] = sprintf(tdoc->rsrc[1], "%u", n_uterm); + if (tdoc->h->n_rsrc[1] == -1) + tdoc->h->n_rsrc[1] = 0; + + n_uterm = 0; +   /* fill remaining parts of TDoc->TSubHeader */   crc = crc_wordwise(&crc_m, 0, NULL, 0);   crc = crc_wordwise(&crc_m, 
crc, (unsigned char *)tdoc->txt, @@ -90,42 +204,67 @@
  tdoc->h->n_rsrc[0]);   tdoc->h->crc = crc;   - /* add a new node to TFile->list with a TDoc - * payload */ - tfile->list = addfront(tfile->list, newnode((void *)tdoc)); + /* if the incoming document is intact attach + * the TDoc to TFile->list */   - /* update TFile->THeader */ - tfile->h->n++; - tfile->h->b += TSUBHEADER_SIZE + tdoc->h->n_txt - + tdoc->h->n_id + tdoc->h->n_rsrc[0]; + if (intact) { + tfile->list = addfront(tfile->list, newnode((void *)tdoc));   + /* update TFile->THeader */ + tfile->h->n++; + tfile->h->b += TSUBHEADER_SIZE + tdoc->h->n_txt + + tdoc->h->n_id + tdoc->h->n_rsrc[0]; + } + else + freeTDoc(tdoc, parser->type); +   /* reset */ - tdoc = newTDoc(TREC); - token.str = tdoc->txt; - ntxtbuf = BUFSIZE; + intact = 1; + tdoc = newTDoc(TREC); + txtsize = TXTBUFSIZE; + + fprintf(stderr, "\rparsed: %d", tfile->h->n); +   continue;   } - if ((token.type == OTAG) && (strcmp(token.str, "docno") == 0)) { - n = gettoken(&token, fp, tokid); - bytesread += n; - /* fill TDoc->id and update TDoc->TSubHeader */ - memcpy(tdoc->id, token.str, token.len); - tdoc->h->n_id = token.len; + if ((token->type == OTAG) && (strcmp(token->str, parser->idtag) == 0)) { + /* fill TDoc->id, update TDoc->TSubHeader */ + /* assuming that the next token is the id */ + n = gettoken(token, fp, tokid); + b_read += n; + strcpy(tdoc->id, token->str); + tdoc->h->n_id = token->l + 1;   continue;   } - if (token.type == TERM) { - n_term++; - token.str[token.len] = ' '; - token.str += token.len + 1; - tdoc->h->n_txt += token.len + 1; - if ((ntxtbuf - tdoc->h->n_txt) < lowmem) { - tdoc->txt = realloc(tdoc->txt, ntxtbuf <<= 1); - token.str = tdoc->txt + tdoc->h->n_txt; + if (token->type == TERM) { /* place the term in a hash */ + + if (token->l <= parser->x) /* drop a short token */ + continue; + if (parser->c == 1) { /* drop a too-common token */ + node__.data = token->str; + npterm = hlookup(parser->hterm, &node__, 0); + if (npterm != NULL) + continue; + } + if (parser->n == 1) { /* 
normalize a token */ + int res = stem(parser->stemmer, token->str, token->l - 1); + token->str[res + 1] = '\0'; + token->l = res + 1; + } + + post_ = newpost(token->str, 1); + nppost_ = newnode(post_); + nppost = hlookup(hpost, nppost_, 1); + if (nppost != nppost_) { /* repeating token */ + ((Post *)(nppost->data))->tf++; + freenode(nppost_, freepost);   }   }   }   - bytesread++; + fprintf(stderr, "\n"); + + b_read++;     tfile->h->b += THEADER_SIZE;   @@ -146,7 +285,7 @@
  tfile->h->crc = crc;   __deinitcrc();   /* - fprintf(stderr, "%16s: %u\n", "bytes read", bytesread); + fprintf(stderr, "%16s: %u\n", "bytes read", b_read);   fprintf(stderr, "%16s: %u\n", "TFile size", tfile->h->b);   fprintf(stderr, "%16s: %u\n", "#docs", tfile->h->n);   */
Change 1 of 1 Show Entire File parser.h Stacked
 
 
 
 
 
1
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
 
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
@@ -1,2 +1,27 @@
+/* #include <kak/kcommon.h> */ +/* #include <kak/khash.h> */ +/* #include "stemmer.h" */ +  #define BUFSIZE 10240 -TFile* parse(FILE*); +#define NHASH 4093 + +typedef struct Parser Parser; +typedef struct Hash Hash; +typedef struct stemmer Stemmer; + +struct Parser { + E_TDocType type; + int c; + int n; + int x; + Hash *hterm; + Stemmer *stemmer; + int n_tag; + char endtag[64]; + char idtag[64]; + char *tag[64]; +}; + +Parser *newparser(E_TDocType, int, int, int); +void freeparser(Parser*); +TFile* parse(Parser*, FILE*);
Change 1 of 1 Show Entire File post.c Stacked
 
7
8
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
11
12
13
14
15
 
 
 
 
 
 
 
 
 
 
 
16
17
18
19
20
21
22
 
23
24
25
26
 
27
28
29
 
 
 
 
 
 
 
 
30
31
32
33
34
35
 
36
37
38
 
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
 
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
 
49
50
 
 
 
51
52
53
 
54
55
56
57
58
59
60
61
62
63
64
 
65
 
66
67
68
69
@@ -7,32 +7,63 @@
 {   Post *p;   p = (Post *)malloc(sizeof(Post)); + p->id = strdup(id); + p->tf = tf; + return p; +} + +void freepost(void *p) +{ + if (p == NULL) + return; + free(((Post *)p)->id); + free(p); +} + +Post *newpost_s(char *id, int tf) /* shallow */ +{ + Post *p; + p = (Post *)malloc(sizeof(Post));   p->id = id;   p->tf = tf;   return p;  }   -int p_postcmp(void *d1, void *d2) +void freepost_s(void *p) /* shallow */ +{ + free(p); +} + +int cmppost(void *d1, void *d2) +{ + return strcmp(((Post *)d1)->id, ((Post *)d2)->id); +} + +int cmppost_p(void *d1, void *d2)  {   if (((Post *)d1)->id == ((Post *)d2)->id)   return 0;   return 1;  }   -void printpost(void *d, void *arg) +int cmppost_tf(const void *p1, const void *p2)  { - char *fmt; - fmt = (char *)arg; - printf(fmt, ((Post *)d)->id, ((Post *)d)->tf); + return ((Post *)p1)->tf - ((Post *)p2)->tf;  }   -unsigned posthash(void *data, unsigned hsize) +void fprintpost(FILE *stream, void *data) +{ + Post *p; + p = (Post *)data; + fprintf(stream, " %s:%u", p->id, p->tf); +} + +unsigned hashpost(void *data, unsigned hsize)  {   static unsigned MULTIPLIER = 31;   unsigned h; - char *p;   h = 0; - for (p = (char *)(((Post *)data)->id); *p != '\0'; p++) + for (char *p = ((Post *)data)->id; *p != '\0'; p++)   h = MULTIPLIER * h + *p;   return h % hsize;  }
Change 1 of 1 Show Entire File post.h Stacked
 
6
7
8
9
10
11
 
 
 
 
 
 
 
 
 
6
7
8
 
 
 
9
10
11
12
13
14
15
16
@@ -6,6 +6,11 @@
 };    Post *newpost(char*, int); -int p_postcmp(void*, void*); -void printpost(void*, void*); -unsigned posthash(void*, unsigned); +void freepost(void*); +Post *newpost_s(char*, int); +void freepost_s(void*); +int cmppost(void*, void*); +int cmppost_p(void*, void*); +int cmppost_tf(const void*, const void*); +void fprintpost(FILE*, void*); +unsigned hashpost(void*, unsigned);
Change 1 of 1 Show Entire File raw2t.c Stacked
 
14
15
16
17
18
19
20
21
22
23
 
 
 
 
 
 
 
24
25
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
28
 
29
30
 
31
32
33
34
 
35
36
37
 
14
15
16
 
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
 
58
59
 
60
61
62
 
 
63
64
65
66
@@ -14,24 +14,53 @@
 #include <stdlib.h>  #include <string.h>  #include <malloc/malloc.h> -#include <kak/klog.h>  #include <kak/klist.h>  #include "crc.h"  #include "tokenizer.h"  #include "tfile.h"  #include "parser.h"   +void usage(char *progname) +{ + fprintf(stderr, "usage: %s [-c] [-n] [-x] type\n", progname); + fprintf(stderr, "type = TREC | WARC | TRECQUERY\n"); + exit(1); +} +  int main(int argc, char *argv[])  {   TFile *tfile; + Parser *parser; + E_TDocType type; + int c, n, x, t; + + c = n = x = t = 0; + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-c") == 0) { + c = 1; + } + else if (strcmp(argv[i], "-n") == 0) { + n = 1; + } + else if (strcmp(argv[i], "-x") == 0) { + x = 3; + } + else { + t = 1; + type = getTDocType(argv[i]); + } + } + + if (!t) + usage(argv[0]);   - /* read list of common words */ + parser = newparser(type, c, n, x);   - tfile = parse(stdin); + tfile = parse(parser, stdin);   writeTFile(tfile, stdout);   - /* FIX: call segfaults */ - /* freeTFile(tfile); */ + freeTFile(tfile, type);     return 0;  }
Change 1 of 4 Show Entire File term.c Stacked
 
9
10
11
12
 
13
14
15
16
17
18
 
 
 
 
 
 
 
 
 
19
20
21
22
23
24
 
25
26
27
28
29
 
30
31
32
33
34
35
36
 
37
38
39
 
49
50
51
52
 
53
54
55
56
57
 
60
61
62
63
 
64
65
66
67
68
 
 
 
 
 
69
70
71
72
73
74
75
76
77
 
 
78
79
80
 
83
84
85
 
 
9
10
11
 
12
13
14
15
16
17
 
18
19
20
21
22
23
24
25
26
27
28
29
 
30
 
31
32
33
34
35
 
36
37
38
39
40
41
42
 
43
44
45
46
 
56
57
58
 
59
60
 
61
62
63
 
66
67
68
 
69
70
 
 
 
 
71
72
73
74
75
76
77
 
 
 
 
 
 
 
78
79
80
81
82
 
85
86
87
88
@@ -9,31 +9,38 @@
 {   Term *t;   t = (Term *)malloc(sizeof(Term)); - t->s = s; + t->s = strdup(s);   t->df = df;   t->plist = newnode(post);   return t;  }   -unsigned termhash(void *data, unsigned hsize) +void freeterm(void *term) +{ + if (term == NULL) + return; + free(((Term *)term)->s); + free(term); +} + +unsigned hashterm(void *data, unsigned hsize)  {   static unsigned MULTIPLIER = 31;   unsigned h; - char *p;   h = 0; - for (p = (char *)(((Term *)data)->s); *p != '\0'; p++) + for (char *p = ((Term *)data)->s; *p != '\0'; p++)   h = MULTIPLIER * h + *p;   return h % hsize;  }   -int p_termcmp(void *d1, void *d2) +int cmptermp(void *d1, void *d2)  {   if (((Term *)d1)->s == ((Term *)d2)->s)   return 0;   return 1;  }   -int termcmp(void *d1, void *d2) +int cmpterm(void *d1, void *d2)  {   return strcmp(((Term *)d1)->s, ((Term *)d2)->s);  } @@ -49,9 +56,8 @@
  newp = (Post *)(newt->plist->data);   p = (Post *)(t->plist->data);   - if (p_postcmp(newp, p) == 0) { + if (cmppost_p(newp, p) == 0)   p->tf++; - }   else {   t->plist = addfront(t->plist, newt->plist);   t->df++; @@ -60,21 +66,17 @@
  return 0;  }   -void freeterm(void *term) +void fprintterm(FILE *stream, void *data)  { - if (term == NULL) - return; - free(((Term *)term)->s); - free(term); + Term *t; + t = (Term *)data; + fprintf(stream, " %s:%u", t->s, t->df); + for (Node *np = t->plist; np != NULL; np = np->next) + fprintpost(stream, np->data);  }   -void printterm(void *d, void *arg) -{ - char *fmt; - fmt = (char *)arg; - printf(fmt, ((Term *)d)->s, strlen(((Term *)d)->s)); -} - +/* TODO: printpost doesn't need the format string, so modify apply */ +/*  void printtree(void *d, void *arg)  {   char *fmt; @@ -83,3 +85,4 @@
  apply(((Term *)d)->plist, printpost, "%s:%d ");   printf("\n");  } +*/
Change 1 of 1 Show Entire File term.h Stacked
 
6
7
8
9
 
10
11
12
 
 
13
14
15
16
 
 
 
6
7
8
 
9
10
 
 
11
12
13
14
 
 
15
16
@@ -6,11 +6,11 @@
  Node *plist;  };   -unsigned termhash(void*, unsigned); +unsigned hashterm(void*, unsigned);  Term *newterm(char*, unsigned, Post*); -int termcmp(void*, void*); -int p_termcmp(void*, void*); +int cmpterm(void*, void*); +int cmptermp(void*, void*);  int term_match_handler(void*, void*);  void freeterm(void*); -void printterm(void*, void*); -void printtree(void*, void*); +void fprintterm(FILE*, void*); +/* void printtree(void*, void*); */
Change 1 of 1 Show Entire File test/​alice.mem Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
@@ -0,0 +1,20 @@
+MG CRC V T BYTES N R BO +41 3375973743 1 0 574 3 2 32 +CRC TXT ID RSRC0 RSRC1 +2702988819 181 2 2 2 +tdoc->id: A. +tdoc->txt: rude 1 open 1 curios 1 look 1 cut 1 alic 2 learn 1 hear 1 desk 1 veri 2 remark 1 great 1 hair 1 sever 1 wide 1 first 1 hatter 2 speech 1 write 1 ey 1 person 1 time 1 raven 1 make 1 . +tdoc->rsrc[0]: 27. +tdoc->rsrc[1]: 24. +CRC TXT ID RSRC0 RSRC1 +1863705286 187 2 2 2 +tdoc->id: B. +tdoc->txt: adventur 1 issu 1 wrong 2 alic 1 1897 1 nevar 1 front 2 never 2 carrol 1 veri 1 propos 1 final 1 flat 1 note 1 himself 1 earli 1 spell 1 lewi 1 answer 1 produc 1 raven 1 though 1 revis 2 . +tdoc->rsrc[0]: 27. +tdoc->rsrc[1]: 23. +CRC TXT ID RSRC0 RSRC1 +711335375 126 2 2 2 +tdoc->id: C. +tdoc->txt: alic 1 both 1 dip 1 more 1 gave 1 1990 1 martin 1 gardner 1 flap 1 sent 1 answer 1 quill 1 reader 1 annot 1 possibl 1 slope 1 . +tdoc->rsrc[0]: 16. +tdoc->rsrc[1]: 16.
Change 1 of 1 Show Entire File test/​alice.qrel Stacked
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
@@ -0,0 +1,9 @@
+1 0 A 1 +1 0 B 1 +1 0 C 1 +2 0 A 0 +2 0 B 1 +2 0 C 0 +3 0 A 0 +3 0 B 0 +3 0 C 1
Change 1 of 1 Show Entire File test/​alice.rank Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
@@ -0,0 +1,12 @@
+1 A 7 +1 B 3 +1 C 1 +2 B 3 +2 A 2 +2 C 1 +3 A 6 +3 B 2 +3 C 2 +4 C 5 +4 A 2 +4 B 1
Change 1 of 1 Show Entire File test/​alice.res Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
@@ -0,0 +1,12 @@
+1 A 7 +1 B 3 +1 C 1 +2 A 2 +2 B 3 +2 C 1 +3 A 6 +3 B 2 +3 C 2 +4 C 5 +4 A 2 +4 B 1
Change 1 of 1 Show Entire File test/​alice.run Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
@@ -0,0 +1,12 @@
+1 Q0 A 0 7 . +1 Q0 B 1 3 . +1 Q0 C 2 1 . +2 Q0 B 0 3 . +2 Q0 A 1 2 . +2 Q0 C 2 1 . +3 Q0 A 0 6 . +3 Q0 B 1 2 . +3 Q0 C 2 2 . +4 Q0 C 0 5 . +4 Q0 A 1 2 . +4 Q0 B 2 1 .
 
 
Change 1 of 1 Show Entire File test/​alice.txt Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
@@ -0,0 +1,25 @@
+<DOC> + <DOCNO>A</DOCNO> + 'Your hair wants cutting,' said the Hatter. He had been looking at + Alice for some time with great curiosity, and this was his first + speech. 'You should learn not to make personal remarks,' Alice said + with some severity; 'it's very rude.' The Hatter opened his eyes + very wide on hearing this; but all he said was, 'Why is a raven like + a writing-desk?' +</DOC> + +<DOC> + <DOCNO>B</DOCNO> + LEWIS CARROLL himself proposed an answer in the 1897 final revision + of Alice's Adventures. "Because it can produce a few notes, though + they are very flat; and it is never put with the wrong end in front!" + The early issues of the revision spell "never" as "nevar", ie "raven" + with the wrong end in front. +</DOC> + +<DOC> + <DOCNO>C</DOCNO> + Martin Gardner, in More Annotated Alice (1990) gave two possible + answers, sent in by readers: "both have quills dipped in ink" and + "because it slopes with a flap". +</DOC>
Change 1 of 1 Show Entire File test/​alice_query.mem Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
@@ -0,0 +1,26 @@
+MG CRC V T BYTES N R BO +41 1553551552 1 0 312 4 2 32 +CRC TXT ID RSRC0 RSRC1 +944899761 63 2 1 1 +tdoc->id: 1. +tdoc->txt: adventur 1 alic 1 desk 1 hatter 1 wonderland 1 write 1 raven 1 . +tdoc->rsrc[0]: 7. +tdoc->rsrc[1]: 7. +CRC TXT ID RSRC0 RSRC1 +497613859 48 2 1 1 +tdoc->id: 2. +tdoc->txt: puzzl 1 carrol 1 desk 1 lewi 1 answer 1 write 1 . +tdoc->rsrc[0]: 6. +tdoc->rsrc[1]: 6. +CRC TXT ID RSRC0 RSRC1 +85283453 68 2 1 1 +tdoc->id: 3. +tdoc->txt: pose 1 alic 1 puzzl 1 desk 1 hatter 1 wonderland 1 answer 1 write 1 . +tdoc->rsrc[0]: 8. +tdoc->rsrc[1]: 8. +CRC TXT ID RSRC0 RSRC1 +32326634 41 2 1 1 +tdoc->id: 4. +tdoc->txt: alic 1 more 1 martin 1 gardner 1 annot 1 . +tdoc->rsrc[0]: 5. +tdoc->rsrc[1]: 5.
 
 
Change 1 of 1 Show Entire File test/​alice_query.txt Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
@@ -0,0 +1,28 @@
+<TOP> + <NUM>1</NUM> + <TEXT> + Alice's Adventures in Wonderland, Mad Hatter, raven and writing-desk. + </TEXT> +</TOP> + +<TOP> + <NUM>2</NUM> + <TEXT> + Lewis Carroll's answer to the writing-desk puzzle. + </TEXT> +</TOP> + +<TOP> + <NUM>3</NUM> + <TEXT> + Answers to the writing-desk puzzle posed by the Mad Hatter in Alice + in Wonderland? + </TEXT> +</TOP> + +<TOP> + <NUM>4</NUM> + <TEXT> + 'More Annotated Alice' by Martin Gardner. + </TEXT> +</TOP>
Change 1 of 15 Show Entire File tfile.c Stacked
 
34
35
36
37
38
39
40
41
42
43
44
45
46
47
 
 
 
 
48
49
50
51
52
53
54
 
 
55
56
57
 
65
66
67
68
 
69
70
 
 
 
 
71
72
73
74
75
76
77
 
78
79
80
81
82
83
84
 
85
86
87
 
 
 
 
 
 
 
88
89
 
90
91
92
93
94
95
 
96
97
98
 
111
112
113
114
115
 
 
 
116
117
118
119
120
 
 
121
122
123
124
125
126
 
127
128
129
130
131
 
132
133
134
 
136
137
138
 
 
 
 
139
140
141
 
148
149
150
151
 
152
153
154
 
155
156
157
158
159
 
160
161
162
163
 
 
 
164
165
 
 
 
166
167
168
 
189
190
191
192
 
193
194
195
 
 
 
 
 
196
197
198
 
199
200
201
202
203
204
 
205
206
207
 
208
209
 
210
211
212
 
230
231
232
233
 
234
235
236
 
237
238
239
 
241
242
243
244
 
245
246
247
 
281
282
283
284
 
285
286
287
288
289
290
 
291
292
293
 
316
317
318
319
 
320
321
322
 
323
324
325
 
327
328
329
330
 
331
332
333
 
335
336
337
 
 
338
339
 
 
340
341
342
 
349
350
351
352
 
353
354
355
356
357
358
359
360
361
362
363
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
365
366
367
368
 
369
370
 
 
 
 
 
 
371
372
373
 
383
384
385
386
 
387
388
389
 
396
397
398
399
400
401
402
 
403
404
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
35
36
 
 
 
 
 
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
 
 
52
53
54
55
56
 
64
65
66
 
67
68
69
70
71
72
73
74
 
75
76
77
78
 
79
80
81
82
83
84
85
 
86
87
88
89
90
91
92
93
94
95
96
97
 
98
99
100
101
102
103
 
104
105
106
107
 
120
121
122
 
 
123
124
125
126
127
128
 
 
129
130
131
132
 
133
134
 
135
136
137
138
139
 
140
141
142
143
 
145
146
147
148
149
150
151
152
153
154
 
161
162
163
 
164
165
166
 
167
168
169
170
171
 
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
 
208
209
210
 
211
212
213
 
214
215
216
217
218
219
220
 
221
222
223
 
224
225
 
226
227
228
 
229
230
 
231
232
233
234
 
252
253
254
 
255
256
257
 
258
259
260
261
 
263
264
265
 
266
267
268
269
 
303
304
305
 
306
307
308
 
309
310
 
311
312
313
314
 
337
338
339
 
340
341
342
 
343
344
345
346
 
348
349
350
 
351
352
353
354
 
356
357
358
359
360
361
362
363
364
365
366
367
 
374
375
376
 
377
378
 
 
 
 
 
 
 
 
 
 
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
 
397
398
 
399
400
401
402
403
404
405
406
407
 
417
418
419
 
420
421
422
423
 
430
431
432
 
433
434
 
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
@@ -34,24 +34,23 @@
  h->crc = 0;   h->n_txt = 0;   h->n_id = 0; - h->r = 0; - if (type == TREC) - h->r = TREC_R; - if (h->r > 0) - memset(h->n_rsrc, 0, sizeof(uint32_t) * h->r);   return h;  }    TDoc *newTDoc(E_TDocType type)  {   TDoc *tdoc; + int r; + r = 0; + if (type == TREC) + r = TREC_R;   tdoc = (TDoc *)malloc(sizeof(TDoc));   tdoc->h = _newTSubHeader(type);   tdoc->txt = (char *)malloc(sizeof(char) * TXTBUFSIZE);   tdoc->id = (char *)malloc(sizeof(char) * IDBUFSIZE);   tdoc->rsrc = NULL; - tdoc->rsrc = (char **)malloc(sizeof(char *) * tdoc->h->r); - for (int i = 0; i < tdoc->h->r; i++) + tdoc->rsrc = (char **)malloc(sizeof(char *) * r); + for (int i = 0; i < r; i++)   tdoc->rsrc[i] = (char *)malloc(sizeof(char) * RSRCBUFSIZE);   return tdoc;  } @@ -65,34 +64,44 @@
  return t;  }   -void freeTDoc(void *d) +void freeTDoc(void *d, E_TDocType type)  {   TDoc* tdoc; + int r; + r = 0; + if (type == TREC) + r = TREC_R;   tdoc = (TDoc *)d; - int i;   if (tdoc == NULL)   return;   free(tdoc->txt);   free(tdoc->id); - for (i = 0; i < tdoc->h->r; i++) + for (int i = 0; i < r; i++)   free(tdoc->rsrc[i]);   free(tdoc->rsrc);   free(tdoc->h);   free(tdoc);  }   -void freeTFile(TFile *tfile) +void freeTFile(TFile *tfile, E_TDocType type)  {   if (tfile == NULL)   return; + Node *n, *next; + n = tfile->list; + for (; n != NULL; n = next) { + next = n->next; + freeTDoc(n->data, type); + free(n); + }   free(tfile->h); - freelist(tfile->list, freeTDoc); + free(tfile);  }    int writeTFile(TFile *tfile, FILE *fp)  {   uint16_t v, check; - int n; + int n, c;   Node *p;   TDoc *tdoc;   @@ -111,24 +120,24 @@
  return 0;   }   - /* write the TDoc structures */ - for (p = tfile->list; p != NULL; p = p->next) { + c = 0; + + for (p = tfile->list; p != NULL; p = p->next) {/* for a TDoc */     tdoc = p->data;   - /* write the TDoc->TSubHeader */ - check = 0x000F; /* 0000 0000 0000 1111 */ + /* write TDoc->TSubHeader */ + check = 0x0007; /* 0000 0000 0000 0111 */   v = 0;   n = fwrite(&tdoc->h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fwrite(&tdoc->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1;   n = fwrite(&tdoc->h->n_txt, sizeof(uint32_t), 1, fp); v |= n; v <<= 1;   n = fwrite(&tdoc->h->n_id, sizeof(uint32_t), 1, fp); v |= n; - for (int i = 0; i < tdoc->h->r; i++) { + for (int i = 0; i < tfile->h->r; i++) {   n = fwrite(&tdoc->h->n_rsrc[i], sizeof(uint32_t), 1, fp);   v <<= 1; v |= n;   check <<=1; check |= 1;   } - if (v != check) { /* the lower tdoc->h->r bits of 'check' were set */ + if (v != check) { /* the lower tfile->h->r bits of 'check' were set */   fprintf(stderr, "ERROR: failed to write TSubHeader\n");   return 0;   } @@ -136,6 +145,10 @@
  /* write the TDoc->txt */   n = fwrite(tdoc->txt, tdoc->h->n_txt, 1, fp);   if (n != 1) { + /*DEBUG*/ + fprintf(stderr, "n = %d, tdoc->h->n_txt = %u, tdoc->id = %13s\n", + n, tdoc->h->n_txt, tdoc->id); + /*DEBUG*/   fprintf(stderr, "ERROR: failed to write TDoc->txt\n");   return 0;   } @@ -148,21 +161,27 @@
  }     /* write the TDoc->rsrc[i] */ - if (tdoc->h->r > 0) { + if (tfile->h->r > 0) {   v = 0;   check = 0x0000; - for (int i = 0; i < tdoc->h->r; i++) { + for (int i = 0; i < tfile->h->r; i++) {   n = fwrite(tdoc->rsrc[i], tdoc->h->n_rsrc[i], 1, fp);   v <<= 1; v |= n;   check <<= 1; check |= n;   } - if (v != check) { /* the lower tdoc->h->r bits of 'check' were set */ + if (v != check) { /* the lower tfile->h->r bits of 'check' were set */   fprintf(stderr, "ERROR: failed to write a TDoc->rsrc[j]\n");   return 0;   }   } + + c++; + fprintf(stderr, "\rwrote: %d/%u", c, tfile->h->n);   }   + if (c != tfile->h->n) + fprintf(stderr, "; skipped %d empty documents", tfile->h->n - c); + fprintf(stderr, "\n");   return 1;  }   @@ -189,24 +208,27 @@
  return h;  }   -TDoc *readTDoc(TDoc *tdoc, FILE *fp) +TDoc *readTDoc(TDoc *tdoc, FILE *fp, E_TDocType type)  {   uint16_t v, check; - int n; + int n, r; + + r = 0; + if (type == TREC) + r = TREC_R;     /* fill the TDoc->TSubHeader */ - check = 0x000F; /* 0000 0000 0000 1111 */ + check = 0x0007; /* 0000 0000 0000 0111 */   v = 0;   n = fread(&tdoc->h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&tdoc->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1;   n = fread(&tdoc->h->n_txt, sizeof(uint32_t), 1, fp); v |= n; v <<= 1;   n = fread(&tdoc->h->n_id, sizeof(uint32_t), 1, fp); v |= n; - for (int j = 0; j < tdoc->h->r; j++) { + for (int j = 0; j < r; j++) {   n = fread(&tdoc->h->n_rsrc[j], sizeof(uint32_t), 1, fp);   v <<= 1; v |= n; - check <<=1; check |= 1; + check <<= 1; check |= 1;   } - if (v != check) { /* tdoc->h->r 1's pushed through the LSB */ + if (v != check) { /* r 1's pushed through the LSB */   fprintf(stderr, "ERROR: failed to read TDoc->TSubHeader\n");   return NULL;   } @@ -230,10 +252,10 @@
  }     /* read the TDoc->rsrc[i]->b */ - if (tdoc->h->r > 0) { + if (r > 0) {   v = 0;   check = 0x0000; - for (int j = 0; j < tdoc->h->r; j++) { + for (int j = 0; j < r; j++) {   if (tdoc->h->n_rsrc[j] > RSRCBUFSIZE)   tdoc->rsrc[j] = realloc(tdoc->rsrc[j],   tdoc->h->n_rsrc[j]); @@ -241,7 +263,7 @@
  v <<= 1; v |= n;   check <<= 1; check |= n;   } - if (v != check) { /* the lower tdoc->h->r bits of 'check' were set */ + if (v != check) { /* the lower r bits of 'check' were set */   fprintf(stderr, "ERROR: failed to read a TDoc->rsrc[j]\n");   return NULL;   } @@ -281,13 +303,12 @@
  tdoc = newTDoc(TREC);     /* fill the TDoc->TSubHeader */ - check = 0x000F; /* 0000 0000 0000 1111 */ + check = 0x0007; /* 0000 0000 0000 0111 */   v = 0;   n = fread(&tdoc->h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; - n = fread(&tdoc->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1;   n = fread(&tdoc->h->n_txt, sizeof(uint32_t), 1, fp); v |= n; v <<= 1;   n = fread(&tdoc->h->n_id, sizeof(uint32_t), 1, fp); v |= n; - for (int j = 0; j < tdoc->h->r; j++) { + for (int j = 0; j < tfile->h->r; j++) {   n = fread(&tdoc->h->n_rsrc[j], sizeof(uint32_t), 1, fp);   v <<= 1; v |= n;   check <<=1; check |= 1; @@ -316,10 +337,10 @@
  }     /* read the TDoc->rsrc[i]->b */ - if (tdoc->h->r > 0) { + if (tfile->h->r > 0) {   v = 0;   check = 0x0000; - for (int j = 0; j < tdoc->h->r; j++) { + for (int j = 0; j < tfile->h->r; j++) {   if (tdoc->h->n_rsrc[j] > RSRCBUFSIZE)   tdoc->rsrc[j] = realloc(tdoc->rsrc[j],   tdoc->h->n_rsrc[j]); @@ -327,7 +348,7 @@
  v <<= 1; v |= n;   check <<= 1; check |= n;   } - if (v != check) { /* the lower tdoc->h->r bits of 'check' were set */ + if (v != check) { /* the lower tfile->h->r bits of 'check' were set */   fprintf(stderr, "ERROR: failed to read a TDoc->rsrc[j]\n");   return NULL;   } @@ -335,8 +356,12 @@
    /* add to the list a new node with a TDoc payload */   tfile->list = addfront(tfile->list, newnode((void *)tdoc)); + + fprintf(stderr, "\r%d/%d", i+1, tfile->h->n);   }   + fprintf(stderr, "\n"); +   return tfile;  }   @@ -349,25 +374,34 @@
  h->b, h->n, h->r, h->bo);  }   -void _printTSubHeader(TSubHeader *h) +void _printTSubHeader(TSubHeader *h, E_TDocType type)  { - static int flag = 1; - if (flag) { - flag = 0; - printf("%-10s %-10s %-3s %-2s ", "CRC", "TXT", "ID", "R"); - for (int i = 0; i < h->r; i++) - printf("RSRC%-6u ", i); - printf("\n"); - } - printf("%-10u %-10u %-3u %-2u ", h->crc, h->n_txt, h->n_id, h->r); - for (int i = 0; i < h->r; i++) + int r; + r = 0; + if (type == TREC) + r = TREC_R; + + /* print header string */ + printf("%-10s %-10s %-3s ", "CRC", "TXT", "ID"); + for (int i = 0; i < r; i++) + printf("RSRC%-6u ", i); + printf("\n"); + + /* print header contents */ + printf("%-10u %-10u %-3u ", h->crc, h->n_txt, h->n_id); + for (int i = 0; i < r; i++)   printf("%-10u ", h->n_rsrc[i]);   printf("\n");  }   -void _printTDoc(TDoc *tdoc) +void _printTDoc(TDoc *tdoc, E_TDocType type)  { - _printTSubHeader(tdoc->h); + int r; + r = 0; + if (type == TREC) + r = TREC_R; + + _printTSubHeader(tdoc->h, type);     /* print the doc ID */   @@ -383,7 +417,7 @@
    /* print the resource blocks */   - for (int i = 0; i < tdoc->h->r; i++) { + for (int i = 0; i < r; i++) {   tdoc->rsrc[i][tdoc->h->n_rsrc[i]] = '\0';   printf("tdoc->rsrc[%d]: ", i);   printf("%s.\n", tdoc->rsrc[i]); @@ -396,9 +430,23 @@
  TDoc *tdoc;     _printTHeader(tfile->h); -   for (p = tfile->list; p != NULL; p = p->next) {   tdoc = p->data; - _printTDoc(tdoc); + _printTDoc(tdoc, TREC);   }  } + +E_TDocType getTDocType(char *s) +{ + E_TDocType t; + t = TREC; /* default */ + if (strcmp(s, "TREC") == 0) + t = TREC; + else if (strcmp(s, "WARC") == 0) + t = WARC; + else if (strcmp(s, "TRECQUERY") == 0) + t = TRECQUERY; + else + fprintf(stderr, "assuming default TDoc type TREC\n"); + return t; +}
Change 1 of 3 Show Entire File tfile.h Stacked
 
1
2
3
4
 
 
5
6
7
 
 
8
9
 
10
11
12
 
28
29
30
31
32
33
34
 
44
45
46
 
47
48
49
50
51
52
53
 
 
54
55
56
 
57
58
59
60
 
 
61
 
1
2
 
 
3
4
5
 
 
6
7
8
 
9
10
11
12
 
28
29
30
 
31
32
33
 
43
44
45
46
47
48
49
50
51
 
 
52
53
54
55
 
56
57
58
 
 
59
60
61
@@ -1,12 +1,12 @@
 #define THEADER_SIZE 32 /* header size in bytes */  #define TSUBHEADER_SIZE 12 /* sub-header size in bytes */ -#define TREC_R 1 /* number of resources for TDoc of type TREC */ -#define N_RSRC 10 /* number of resources */ +#define TREC_R 2 /* number of resources for TDoc of type TREC */ +#define NRSRC 10 /* number of resources */  #define TXTBUFSIZE 10240 /* 10KB */ -#define IDBUFSIZE 36 /* A UUID (if used) would be max 36 chars */ -#define RSRCBUFSIZE 100 +#define IDBUFSIZE 64 /* A UUID (if used) would be max 36 chars */ +#define RSRCBUFSIZE 32   -typedef enum {TREC, WARC} E_TDocType; +typedef enum {TREC, WARC, TRECQUERY} E_TDocType;    typedef struct TFile TFile;  typedef struct THeader THeader; @@ -28,7 +28,6 @@
  uint32_t crc;   uint32_t n_txt;   uint32_t n_id; - uint32_t r;   uint32_t n_rsrc[10];  };   @@ -44,18 +43,19 @@
  Node *list;  };   +E_TDocType getTDocType(char*);  THeader *_newTHeader(E_TDocType);  void _freeTHeader(void*);  TSubHeader *_newTSubHeader(E_TDocType);  TDoc *newTDoc(E_TDocType);  TFile *newTFile(E_TDocType); -void freeTDoc(void*); -void freeTFile(TFile*); +void freeTDoc(void*, E_TDocType); +void freeTFile(TFile*, E_TDocType);  int writeTFile(TFile*, FILE*);  THeader *readTHeader(THeader*, FILE*); -TDoc *readTDoc(TDoc*, FILE*); +TDoc *readTDoc(TDoc*, FILE*, E_TDocType);  TFile *readTFile(FILE*);  void _printTHeader(THeader*); -void _printTSubHeader(TSubHeader*); -void _printTDoc(TDoc*); +void _printTSubHeader(TSubHeader*, E_TDocType); +void _printTDoc(TDoc*, E_TDocType);  void printTFile(TFile*);
Change 1 of 2 Show Entire File tokenizer.c Stacked
 
60
61
62
63
 
64
65
66
67
 
 
 
 
 
 
 
 
68
69
70
71
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
74
75
76
77
78
79
 
 
 
 
80
81
 
82
83
 
84
85
86
 
87
88
89
 
95
96
97
98
99
 
 
100
101
102
103
104
105
 
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
 
60
61
62
 
63
64
65
66
 
67
68
69
70
71
72
73
74
75
 
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
 
 
 
106
107
108
109
110
111
112
113
 
114
115
116
 
117
118
119
120
 
126
127
128
 
 
129
130
131
132
133
134
135
 
136
137
138
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
@@ -60,30 +60,61 @@
  return t;  }   -Token *newtoken(int n) +Token *newtoken(char *str, unsigned n)  {   Token *t;   t = (Token *)malloc(sizeof(Token)); - t->str = (char *)malloc(n); + if (str == NULL) { + t->str = (char *)calloc(n, sizeof(char)); + t->l = 0; + } + else { + t->str = strdup(str); + t->l = strlen(str); + }   t->type = TERM; - t->len = 0;   return t;  }   +void freetoken(Token *t) +{ + if (t == NULL) + return; + free(((Token*)t)->str); + free(t); +} + +unsigned hashtoken(void *data, unsigned hsize) +{ + static unsigned MULTIPLIER = 31; + unsigned h; + h = 0; + for (char *p = ((Token *)data)->str; *p != '\0'; p++) + h = MULTIPLIER * h + *p; + return h % hsize; +} + +int cmptoken(void *d1, void *d2) +{ + return strcmp(((Token *)d1)->str, ((Token *)d2)->str); +} +  int gettoken(Token *tok, FILE *fp, Tokenizer *t)  {   int n;   char c, *s, c_[2]; - int len, bytes; - bytes = 0; - len = 0; + int l, b_read; + + b_read = 0; + l = 0;   tok->type = TERM;   s = tok->str; +   while ((n = fread(&c, 1, 1, fp)) > 0) { - bytes += n; + b_read += n;   if (t->asciitab[(int)c] == SEPCHAR) {   push(t->mem, &c); - if (len == 0) + if (l == 0)   continue;   *s = '\0';   if (c == '>') { @@ -95,36 +126,14 @@
  else if (c_[1] == '/' && c_[0] == '<')   tok->type = CTAG;   } - tok->len = len; - return bytes; + tok->l = l; + return b_read;   }   if (t->casechange == LOWERCASE && c >= 65 && c <= 90)   c += 32;   *s = c;   s++; - len++; + l++;   }   return n;  } - -int tokenize(char dest[100][100], char *src, unsigned size, Tokenizer *t) -{ - int i, j, k; - char *s; - i = j = k = 0; - for (i = 0, s = src; i < size; i++, s++) { - if (t->asciitab[(int)(*s)] == SEPCHAR) { - if (k > 0) { - dest[j][k] = '\0'; - j++; - k = 0; - } - continue; - } - dest[j][k] = *s; - if (t->casechange == LOWERCASE && *s >= 65 && *s <= 90) - dest[j][k] += 32; - k++; - } - return j; -}
Change 1 of 2 Show Entire File tokenizer.h Stacked
 
27
28
29
 
30
31
32
33
34
 
36
37
38
39
 
 
 
 
40
41
 
27
28
29
30
31
 
32
33
34
 
36
37
38
 
39
40
41
42
43
44
@@ -27,8 +27,8 @@
   struct Token {   char *str; + unsigned l;   int type; - int len;  };    void reset(Stack*, int); @@ -36,6 +36,9 @@
 char pop(Stack*);  void printstack(Stack*);  Tokenizer *newtokenizer(char*, int, Stack*); -Token *newtoken(int); +Token *newtoken(char*, unsigned); +void freetoken(Token*); +unsigned hashtoken(void*, unsigned); +int cmptoken(void*, void*);  int gettoken(Token*, FILE*, Tokenizer*);  int tokenize(char[100][100], char*, unsigned, Tokenizer*);