Repositories » TXT0 Read More
Clone URL:  
Pushed to one repository · View In Graph Contained in tip

The TREC-type TDoc now has 4 resources: a count of unique terms, the sum of term frequencies, the sum of squares of the term frequencies, and the maximum term frequency.

Changeset 66dbe4eea2e5

Parent 31ed6555d40e

by Rup Palchowdhury

Changes to 2 files · Browse files at 66dbe4eea2e5 Showing diff from parent 31ed6555d40e Diff from another changeset...

Change 1 of 8 Show Entire File parser.c Stacked
 
59
60
61
62
 
63
64
65
 
68
69
70
71
 
72
73
74
 
96
97
98
99
 
100
101
102
103
104
 
124
125
126
127
 
128
129
130
 
139
140
141
142
 
143
144
145
 
160
161
162
163
 
164
165
166
167
168
 
 
 
169
170
171
172
173
174
 
 
 
 
 
175
176
177
178
179
180
181
182
 
190
191
192
193
 
194
195
196
197
 
198
199
200
201
 
202
203
204
 
 
 
 
205
206
207
208
209
210
211
212
 
 
 
 
213
214
215
 
220
221
222
223
224
225
226
227
228
 
 
 
229
230
231
 
59
60
61
 
62
63
64
65
 
68
69
70
 
71
72
73
74
 
96
97
98
 
99
100
 
101
102
103
 
123
124
125
 
126
127
128
129
 
138
139
140
 
141
142
143
144
 
159
160
161
 
162
163
 
 
 
 
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
 
 
 
180
181
182
 
190
191
192
 
193
194
195
196
 
197
198
199
200
 
201
202
203
204
205
206
207
208
209
210
211
212
213
214
 
 
215
216
217
218
219
220
221
 
226
227
228
 
 
 
 
 
 
229
230
231
232
233
234
@@ -59,7 +59,7 @@
  case TREC:   p->n_tag = 0;   strcpy(p->endtag, "doc"); - strcpy(p->idtag, "docno"); + strcpy(p->idtag, "docno");   break;   case WARC:   fprintf(stderr, "A parser for WARC isn't available.\n"); @@ -68,7 +68,7 @@
  case TRECQUERY:   p->n_tag = 0;   strcpy(p->endtag, "top"); - strcpy(p->idtag, "num"); + strcpy(p->idtag, "num");   break;   default:   fprintf(stderr, "Unknown parser type\n"); @@ -96,9 +96,8 @@
  p->n = n;   p->stemmer = create_stemmer();   } - if (x) { + if (x)   p->x = x; - }     return p;  } @@ -124,7 +123,7 @@
  TFile *tfile;   TDoc *tdoc;   unsigned lowmem, b_read; - uint32_t crc, n_term, n_txt, n_uterm, sumsq; + uint32_t crc, sumtf, n_txt, uterm, sumsqtf, maxtf;   int n, n_txtbuf, m, intact, n_header, n_subheader;   char *septxt = " ;,.:`'\"?!(){}[]<>~^&*_-+=#$%@|\\/";   char *sepid = " <>"; @@ -139,7 +138,7 @@
  hpost = newhash(NHASH, cmppost, hashpost);   token = newtoken(NULL, KB);   lowmem = KB; - n_txt = n_term = n_uterm = sumsq = b_read = 0; + n_txt = sumtf = uterm = sumsqtf = maxtf = b_read = 0;   tfile = newTFile(TREC);   tdoc = newTDoc(TREC);   n_txtbuf = TXTBUFSIZE; @@ -160,23 +159,24 @@
  if (tdoc->h->n_id <= 0)   intact = 0;   - /* compute sum(tf^2), fill tdoc->txt and empty + /* compute resources, fill tdoc->txt and empty   * the hash */ - n_txt = 0; - n_term = 0; - n_uterm = 0; - sumsq = 0; + + n_txt = sumtf = uterm = sumsqtf = maxtf = 0; +   for (int i = 0; i < hpost->n; i++) {   for (Node *np = hpost->tab[i]; np != NULL; np = np->next) {   post = (Post *)(np->data);   m = sprintf(&(tdoc->txt[n_txt]), "%s %u ",   post->id, post->tf);   n_txt += m; + uterm++; + sumtf += post->tf; + sumsqtf += post->tf * post->tf; + if (post->tf > maxtf) + maxtf = post->tf;   if (n_txtbuf - n_txt < lowmem)   tdoc->txt = erealloc(tdoc->txt, n_txtbuf <<= 1); - n_uterm++; - n_term += post->tf; - sumsq += post->tf * post->tf;   }   }   freehash(hpost, freepost); @@ -190,26 +190,32 @@
    /* fill TDoc->rsrc[i] */   - tdoc->h->n_rsrc[0] = sprintf(tdoc->rsrc[0], "%u", n_term); + tdoc->h->n_rsrc[0] = sprintf(tdoc->rsrc[0], "%u", sumtf);   if (tdoc->h->n_rsrc[0] == -1)   tdoc->h->n_rsrc[0] = 0;   - tdoc->h->n_rsrc[1] = sprintf(tdoc->rsrc[1], "%u", n_uterm); + tdoc->h->n_rsrc[1] = sprintf(tdoc->rsrc[1], "%u", uterm);   if (tdoc->h->n_rsrc[1] == -1)   tdoc->h->n_rsrc[1] = 0;   - tdoc->h->n_rsrc[2] = sprintf(tdoc->rsrc[2], "%u", sumsq); + tdoc->h->n_rsrc[2] = sprintf(tdoc->rsrc[2], "%u", sumsqtf);   if (tdoc->h->n_rsrc[2] == -1)   tdoc->h->n_rsrc[2] = 0;   + tdoc->h->n_rsrc[3] = sprintf(tdoc->rsrc[3], "%u", maxtf); + if (tdoc->h->n_rsrc[3] == -1) + tdoc->h->n_rsrc[3] = 0; +   /* fill remaining parts of TDoc->TSubHeader */   crc = crc_wordwise(&crc_m, 0, NULL, 0);   crc = crc_wordwise(&crc_m, crc, (unsigned char *)tdoc->txt,   tdoc->h->n_txt);   crc = crc_wordwise(&crc_m, crc, (unsigned char *)tdoc->id,   tdoc->h->n_id); - crc = crc_wordwise(&crc_m, crc, (unsigned char *)tdoc->rsrc[0], - tdoc->h->n_rsrc[0]); + for (int i = 0; i < tfile->h->r; i++) + crc = crc_wordwise(&crc_m, crc, + (unsigned char *)tdoc->rsrc[i], + tdoc->h->n_rsrc[i]);   tdoc->h->crc = crc;     /* if the incoming document is intact attach @@ -220,12 +226,9 @@
    /* update TFile->THeader */   tfile->h->n++; - tfile->h->b += n_subheader - + tdoc->h->n_txt - + tdoc->h->n_id - + tdoc->h->n_rsrc[0] - + tdoc->h->n_rsrc[1] - + tdoc->h->n_rsrc[2]; + tfile->h->b += n_subheader + tdoc->h->n_txt + tdoc->h->n_id; + for (int i = 0; i < tfile->h->r; i++) + tfile->h->b += tdoc->h->n_rsrc[i];   }   else   freeTDoc(tdoc, parser->type);
Change 1 of 1 Show Entire File tfile.h Stacked
 
1
 
2
3
4
 
 
1
2
3
4
@@ -1,4 +1,4 @@
-#define TREC_RSRC 3 /* number of resources for TDoc of type TREC */ +#define TREC_RSRC 4 /* number of resources for TDoc of type TREC */  #define MAX_RSRC 10 /* maximum number of resources */  #define TXTBUFSIZE 10240 /* 10KB */  #define IDBUFSIZE 64 /* A UUID (if used) would be max 36 chars */