Repositories » TXT0
Clone URL:  
Pushed to one repository · View In Graph Contained in tip

Got rid of the kbuffer.h interface. It introduced unnecessary
indirection, which hurt readability and led to syntax bugs.

Changeset aaf4d785f819

Parent dd8ba384c059

by Rup Palchowdhury

Changes to 6 files · Browse files at aaf4d785f819 Showing diff from parent dd8ba384c059 Diff from another changeset...

Change 1 of 5 Show Entire File parser.c Stacked
 
3
4
5
6
7
8
9
 
22
23
24
25
 
26
27
28
29
30
31
 
32
33
34
 
36
37
38
39
 
 
40
41
42
 
47
48
49
50
 
51
52
53
54
55
56
57
58
 
59
60
 
61
62
 
63
64
65
 
74
75
76
77
 
 
78
79
80
81
82
83
84
 
85
86
87
88
89
90
91
92
93
94
95
 
 
 
 
 
96
97
98
 
3
4
5
 
6
7
8
 
21
22
23
 
24
25
26
27
28
29
 
30
31
32
33
 
35
36
37
 
38
39
40
41
42
 
47
48
49
 
50
51
52
53
54
55
 
56
 
57
58
 
59
60
 
61
62
63
64
 
73
74
75
 
76
77
78
79
80
81
82
83
 
84
85
 
86
87
88
89
90
 
 
 
 
91
92
93
94
95
96
97
98
@@ -3,7 +3,6 @@
 #include <string.h>  #include <stdint.h>  #include <malloc/malloc.h> -#include <kak/kbuffer.h>  #include <kak/klog.h>  #include <kak/klist.h>  #include "crc.h" @@ -22,13 +21,13 @@
  TDoc *tdoc;   unsigned lowmem, bytesread;   uint32_t crc, n_term; - int n; + int n, ntxtbuf;   char *septxt = " ;,.:`'\"?!(){}[]<>~^&*_-=#$%@|\\/";   char *sepid = " <>";   /* Log *log; */     /* log = newlog("raw2t"); */ - lowmem = KB; + lowmem = 1024;   reset(&mem, 3);   toktxt = newtokenizer(septxt, LOWERCASE, &mem);   tokid = newtokenizer(sepid, KEEPCASE, &mem); @@ -36,7 +35,8 @@
  bytesread = 0;   tfile = newTFile(TREC);   tdoc = newTDoc(TREC); - token.str = tdoc->txt->p; + token.str = tdoc->txt; + ntxtbuf = BUFSIZE;     while((n = gettoken(&token, fp, toktxt)) > 0) {   @@ -47,19 +47,18 @@
  /* fill TDoc->rsrc[i], TDoc->txt and TDoc->id   * are complete */   - tdoc->h->n_rsrc[0] = sprintf(tdoc->rsrc[0]->b, "%u", n_term); + tdoc->h->n_rsrc[0] = sprintf(tdoc->rsrc[0], "%u", n_term);   if (tdoc->h->n_rsrc[0] == -1)   tdoc->h->n_rsrc[0] = 0;   n_term = 0;     /* fill remaining parts of TDoc->TSubHeader */ - tdoc->h->n_txt = tdoc->txt->i;   crc = crc_wordwise(crc_model, 0, NULL, 0); - crc = crc_wordwise(crc_model, crc, (unsigned char *)tdoc->txt->b, + crc = crc_wordwise(crc_model, crc, (unsigned char *)tdoc->txt,   tdoc->h->n_txt); - crc = crc_wordwise(crc_model, crc, (unsigned char *)tdoc->id->b, + crc = crc_wordwise(crc_model, crc, (unsigned char *)tdoc->id,   tdoc->h->n_id); - crc = crc_wordwise(crc_model, crc, (unsigned char *)tdoc->rsrc[0]->b, + crc = crc_wordwise(crc_model, crc, (unsigned char *)tdoc->rsrc[0],   tdoc->h->n_rsrc[0]);   tdoc->h->crc = crc;   @@ -74,25 +73,26 @@
    /* reset */   tdoc = newTDoc(TREC); - token.str = tdoc->txt->p; + token.str = tdoc->txt; + ntxtbuf = BUFSIZE;   continue;   }   if ((token.type == OTAG) && (strcmp(token.str, "docno") == 0)) {   n = gettoken(&token, fp, tokid);   bytesread += n;   /* fill TDoc->id and update TDoc->TSubHeader */ - memcpy(tdoc->id->b, token.str, token.len); + memcpy(tdoc->id, token.str, token.len);   tdoc->h->n_id = token.len; - shiftpointer(tdoc->id, token.len);   continue;   }   if (token.type == TERM) {   n_term++;   token.str[token.len] = ' '; - token.str = shiftpointer(tdoc->txt, token.len + 1); - if ((tdoc->txt->n - tdoc->txt->i) <= lowmem) { - tdoc->txt = resizebuffer(tdoc->txt, tdoc->txt->n <<= 1); - token.str = tdoc->txt->p; + token.str += token.len + 1; + tdoc->h->n_txt += token.len + 1; + if ((ntxtbuf - tdoc->h->n_txt) < lowmem) { + tdoc->txt = realloc(tdoc->txt, ntxtbuf <<= 1); + token.str = tdoc->txt + tdoc->h->n_txt;   }   }   }
Change 1 of 1 Show Entire File parser.h Stacked
 
 
1
 
1
2
@@ -1,1 +1,2 @@
+#define BUFSIZE 10240  TFile* parse(FILE*, model_t*);
Change 1 of 1 Show Entire File raw2t.c Stacked
 
14
15
16
17
18
19
20
 
14
15
16
 
17
18
19
@@ -14,7 +14,6 @@
 #include <stdlib.h>  #include <string.h>  #include <malloc/malloc.h> -#include <kak/kbuffer.h>  #include <kak/klog.h>  #include <kak/klist.h>  #include "crc.h"
Change 1 of 1 Show Entire File t2mem.c Stacked
 
1
2
3
4
5
6
7
 
1
2
3
 
4
5
6
@@ -1,7 +1,6 @@
 #include <stdio.h>  #include <stdlib.h>  #include <string.h> -#include <kak/kbuffer.h>  #include <kak/klog.h>  #include <kak/klist.h>  #include "tokenizer.h"
Change 1 of 12 Show Entire File tfile.c Stacked
 
1
2
3
4
5
6
7
 
28
29
30
 
 
31
32
33
34
35
36
37
38
 
39
40
41
42
43
44
45
46
47
48
49
 
 
50
51
52
53
54
55
 
 
 
56
57
58
 
72
73
74
75
76
77
78
79
80
81
82
 
 
 
 
 
83
84
85
 
95
96
97
98
 
99
100
101
 
126
127
128
129
130
131
132
133
134
 
 
 
 
135
136
137
 
139
140
141
142
 
143
144
145
146
147
148
149
 
150
151
152
153
154
 
 
155
156
157
158
159
 
 
160
161
162
 
173
174
175
176
 
177
178
179
 
195
196
197
198
 
199
200
201
 
206
207
208
209
210
211
212
213
214
 
 
 
 
215
216
217
 
219
220
221
222
223
224
 
 
 
225
226
227
228
229
230
231
232
233
 
 
 
234
235
236
 
240
241
242
243
244
245
246
247
 
 
 
 
 
248
249
250
 
272
273
274
275
276
277
278
279
280
 
281
282
283
284
285
 
286
287
288
289
290
291
292
293
294
295
296
297
298
 
299
300
 
301
302
303
304
305
 
306
307
 
308
309
310
311
312
 
 
313
314
 
315
316
317
 
1
2
3
 
4
5
6
 
27
28
29
30
31
32
33
34
 
 
 
35
 
36
37
38
39
40
41
42
 
43
44
 
 
45
46
47
 
 
 
 
 
48
49
50
51
52
53
 
67
68
69
 
 
 
 
 
 
 
 
70
71
72
73
74
75
76
77
 
87
88
89
 
90
91
92
93
 
118
119
120
 
 
 
 
 
 
121
122
123
124
125
126
127
 
129
130
131
 
132
133
134
135
136
137
138
 
139
140
141
142
143
 
144
145
146
147
148
 
 
149
150
151
152
153
 
164
165
166
 
167
168
169
170
 
186
187
188
 
189
190
191
192
 
197
198
199
 
 
 
 
 
 
200
201
202
203
204
205
206
 
208
209
210
 
 
 
211
212
213
214
215
216
217
218
219
 
 
 
220
221
222
223
224
225
 
229
230
231
 
 
 
 
 
232
233
234
235
236
237
238
239
 
261
262
263
 
264
265
266
267
 
268
269
270
271
272
 
273
274
275
276
277
278
279
 
 
280
281
282
283
 
284
285
 
286
287
 
288
289
 
290
291
 
292
293
294
295
 
 
296
297
298
 
299
300
301
302
@@ -1,7 +1,6 @@
 #include <stdio.h>  #include <stdlib.h>  #include <string.h> -#include <kak/kbuffer.h>  #include <kak/klist.h>  #include "tokenizer.h"  #include "tfile.h" @@ -28,31 +27,27 @@
  TSubHeader *h;   h = (TSubHeader *)malloc(sizeof(TSubHeader));   h->crc = 0; + h->n_txt = 0; + h->n_id = 0;   h->r = 0;   if (type == TREC)   h->r = TREC_R; - h->n_txt = 0; - h->n_id = 0; - h->n_rsrc = NULL;   if (h->r > 0) - h->n_rsrc = (uint32_t *)calloc(h->r, sizeof(uint32_t)); + memset(h->n_rsrc, 0, sizeof(uint32_t) * h->r);   return h;  }    TDoc *newTDoc(E_TDocType type)  {   TDoc *tdoc; - int i;   tdoc = (TDoc *)malloc(sizeof(TDoc));   tdoc->h = _newTSubHeader(type); - tdoc->txt = newbuffer(10*KB, CHARBUF); - tdoc->id = newbuffer(36, CHARBUF); /* a UUID is 36 chars long */ + tdoc->txt = (char *)malloc(TXTBUFSIZE); + tdoc->id = (char *)malloc(IDBUFSIZE);   tdoc->rsrc = NULL; - if (tdoc->h->r > 0) { - tdoc->rsrc = (Buffer **)malloc(sizeof(Buffer*) * tdoc->h->r); - for (i = 0; i < tdoc->h->r; i++) - tdoc->rsrc[i] = newbuffer(KB, CHARBUF); - } + tdoc->rsrc = (char **)malloc(tdoc->h->r); + for (int i = 0; i < tdoc->h->r; i++) + tdoc->rsrc[i] = (char *)malloc(RSRCBUFSIZE);   return tdoc;  }   @@ -72,14 +67,11 @@
  int i;   if (tdoc == NULL)   return; - freebuffer(tdoc->txt); - freebuffer(tdoc->id); - if (tdoc->h->r > 0) { - for (i = 0; i < tdoc->h->r; i++) - freebuffer(tdoc->rsrc[i]); - free(tdoc->rsrc); - } - free(tdoc->h->n_rsrc); + free(tdoc->txt); + free(tdoc->id); + for (i = 0; i < tdoc->h->r; i++) + free(tdoc->rsrc[i]); + free(tdoc->rsrc);   free(tdoc->h);   free(tdoc);  } @@ -95,7 +87,7 @@
 int writeTFile(TFile *tfile, FILE *fp)  {   uint16_t v, check; - int i, j, n; + int n;   Node *p;   TDoc *tdoc;   @@ -126,12 +118,10 @@
  n = fwrite(&tdoc->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1;   n = fwrite(&tdoc->h->n_txt, sizeof(uint32_t), 1, fp); v |= n; v <<= 1;   n = fwrite(&tdoc->h->n_id, sizeof(uint32_t), 1, fp); v |= n; - if (tdoc->h->r > 0) { - for (j = 0; j < tdoc->h->r; j++) { - n = fwrite(&tdoc->h->n_rsrc[j], sizeof(uint32_t), 1, fp); - v <<= 1; v |= n; - check <<=1; check |= 1; - } + for (int i = 0; i < tdoc->h->r; i++) { + n = fwrite(&tdoc->h->n_rsrc[i], sizeof(uint32_t), 1, fp); + v <<= 1; v |= n; + check <<=1; check |= 1;   }   if (v != check) { /* the lower tdoc->h->r bits of 'check' were set */   fprintf(stderr, "ERROR: failed to write TSubHeader\n"); @@ -139,24 +129,25 @@
  }     /* write the TDoc->txt */ - n = fwrite(tdoc->txt->b, tdoc->h->n_txt, 1, fp); + n = fwrite(tdoc->txt, tdoc->h->n_txt, 1, fp);   if (n != 1) {   fprintf(stderr, "ERROR: failed to write TDoc->txt\n");   return 0;   }     /* write the TDoc->id */ - n = fwrite(tdoc->id->b, tdoc->h->n_id, 1, fp); + n = fwrite(tdoc->id, tdoc->h->n_id, 1, fp);   if (n != 1) {   fprintf(stderr, "ERROR: failed to write TDoc->id\n");   return 0;   } - /* write the TDoc->rsrc[i]->b */ + + /* write the TDoc->rsrc[i] */   if (tdoc->h->r > 0) {   v = 0;   check = 0x0000; - for (j = 0; j < tdoc->h->r; j++) { - n = fwrite(tdoc->rsrc[j]->b, tdoc->h->n_rsrc[j], 1, fp); + for (int i = 0; i < tdoc->h->r; i++) { + n = fwrite(tdoc->rsrc[i], tdoc->h->n_rsrc[i], 1, fp);   v <<= 1; v |= n;   check <<= 1; check |= n;   } @@ -173,7 +164,7 @@
 TFile* readTFile(FILE *fp)  {   uint16_t v, check; - int i, j, n; + int n;   TFile *tfile;   TDoc *tdoc;   @@ -195,7 +186,7 @@
  }     /* fill the TDoc structures */ - for (i = 0; i < tfile->h->n; i++) { + for (int i = 0; i < tfile->h->n; i++) {     tdoc = newTDoc(TREC);   @@ -206,12 +197,10 @@
  n = fread(&tdoc->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1;   n = fread(&tdoc->h->n_txt, sizeof(uint32_t), 1, fp); v |= n; v <<= 1;   n = fread(&tdoc->h->n_id, sizeof(uint32_t), 1, fp); v |= n; - if (tdoc->h->r > 0) { - for (j = 0; j < tdoc->h->r; j++) { - n = fread(&tdoc->h->n_rsrc[j], sizeof(uint32_t), 1, fp); - v <<= 1; v |= n; - check <<=1; check |= 1; - } + for (int j = 0; j < tdoc->h->r; j++) { + n = fread(&tdoc->h->n_rsrc[j], sizeof(uint32_t), 1, fp); + v <<= 1; v |= n; + check <<=1; check |= 1;   }   if (v != check) { /* tfile->h->r 1's pushed through the LSB */   fprintf(stderr, "ERROR: failed to read TDoc->TSubHeader\n"); @@ -219,18 +208,18 @@
  }     /* fill TDoc->txt */ - if (tdoc->txt->n < tdoc->h->n_txt) - tdoc->txt = resizebuffer(tdoc->txt, tdoc->h->n_txt); - n = fread(tdoc->txt->b, tdoc->h->n_txt, 1, fp); + if (tdoc->h->n_txt > TXTBUFSIZE) + tdoc->txt = (char *)realloc(tdoc->txt, tdoc->h->n_txt); + n = fread(tdoc->txt, tdoc->h->n_txt, 1, fp);   if (n != 1) {   fprintf(stderr, "ERROR: failed to read TDoc->txt\n");   return NULL;   }     /* fill TDoc->id */ - if (tdoc->id->n < tdoc->h->n_id) - tdoc->id = resizebuffer(tdoc->id, tdoc->h->n_id); - n = fread(tdoc->id->b, tdoc->h->n_id, 1, fp); + if (tdoc->h->n_id > IDBUFSIZE) + tdoc->id = (char *)realloc(tdoc->id, tdoc->h->n_id); + n = fread(tdoc->id, tdoc->h->n_id, 1, fp);   if (n != 1) {   fprintf(stderr, "ERROR: failed to read TDoc->id\n");   return NULL; @@ -240,11 +229,11 @@
  if (tdoc->h->r > 0) {   v = 0;   check = 0x0000; - for (j = 0; j < tdoc->h->r; j++) { - if (tdoc->rsrc[j]->n < tdoc->h->n_rsrc[j]) - tdoc->rsrc[j] = resizebuffer(tdoc->rsrc[j], - tdoc->h->n_rsrc[j]); - n = fread(tdoc->rsrc[j]->b, tdoc->h->n_rsrc[j], 1, fp); + for (int j = 0; j < tdoc->h->r; j++) { + if (tdoc->h->n_rsrc[j] > RSRCBUFSIZE) + tdoc->rsrc[j] = realloc(tdoc->rsrc[j], + tdoc->h->n_rsrc[j]); + n = fread(tdoc->rsrc[j], tdoc->h->n_rsrc[j], 1, fp);   v <<= 1; v |= n;   check <<= 1; check |= n;   } @@ -272,46 +261,42 @@
   void _printTSubHeader(TSubHeader *h)  { - int i;   static int flag = 1;   if (flag) {   flag = 0;   printf("%-10s %-10s %-3s %-2s ", "CRC", "TXT", "ID", "R"); - for (i = 0; i < h->r; i++) + for (int i = 0; i < h->r; i++)   printf("RSRC%-6u ", i);   printf("\n");   }   printf("%-10u %-10u %-3u %-2u ", h->crc, h->n_txt, h->n_id, h->r); - for (i = 0; i < h->r; i++) + for (int i = 0; i < h->r; i++)   printf("%-10u ", h->n_rsrc[i]);   printf("\n");  }    void _printTDoc(TDoc *tdoc)  { - int i; -   _printTSubHeader(tdoc->h);     /* print the doc ID */   - ((char *)tdoc->id->b)[tdoc->h->n_id] = '\0'; + tdoc->id[tdoc->h->n_id] = '\0';   printf("tdoc->id: "); - printf("%s\n", tdoc->id->b); + printf("%s\n", tdoc->id);   -   /* print the tokenized doc text */   - ((char *)tdoc->txt->b)[tdoc->h->n_txt] = '\0'; + tdoc->txt[tdoc->h->n_txt] = '\0';   printf("tdoc->txt: "); - printf("%s\n", tdoc->txt->b); + printf("%s\n", tdoc->txt);     /* print the resource blocks */   - for (i = 0; i < tdoc->h->r; i++) { - ((char *)tdoc->rsrc[i]->b)[tdoc->h->n_rsrc[i]] = '\0'; + for (int i = 0; i < tdoc->h->r; i++) { + tdoc->rsrc[i][tdoc->h->n_rsrc[i]] = '\0';   printf("tdoc->rsrc[%d]: ", i); - printf("%s\n", tdoc->rsrc[i]->b); + printf("%s\n", tdoc->rsrc[i]);   }  }  
Change 1 of 2 Show Entire File tfile.h Stacked
 
1
2
3
 
 
 
 
4
5
6
 
25
26
27
28
 
29
30
31
32
33
34
35
 
 
 
36
37
38
 
1
2
3
4
5
6
7
8
9
10
 
29
30
31
 
32
33
34
35
36
 
 
 
37
38
39
40
41
42
@@ -1,6 +1,10 @@
 #define THEADER_SIZE 32 /* header size in bytes */  #define TSUBHEADER_SIZE 12 /* sub-header size in bytes */  #define TREC_R 1 /* number of resources for TDoc of type TREC */ +#define N_RSRC 10 /* number of resources */ +#define TXTBUFSIZE 10240 /* 10KB */ +#define IDBUFSIZE 36 /* A UUID (if used) would be max 36 chars */ +#define RSRCBUFSIZE 100    typedef enum {TREC, WARC} E_TDocType;   @@ -25,14 +29,14 @@
  uint32_t n_txt;   uint32_t n_id;   uint32_t r; - uint32_t *n_rsrc; + uint32_t n_rsrc[10];  };    struct TDoc {   TSubHeader *h; - Buffer *txt; - Buffer *id; - Buffer **rsrc; + char *txt; + char *id; + char **rsrc;  };    struct TFile {