Repositories » TXT0
Clone URL:  
Pushed to one repository · View In Graph Contained in v0.1 and tip

v0.1 This is an in-progress rewrite of the tokenized-file facility
(TFile). The structures were renamed, and the interface was cleaned up. This

particular commit is for a version that has a Heisenbug. On line 49 in
parser.c, there is a memcpy() to the destination tdoc->rsrc[0] instead
of tdoc->rsrc[0]->b. This leads to a segfault (on OS X) when the program
is run normally, but not when it is run under lldb. The way to track it
down is to create a core dump and load lldb with the target program and
the core file. In lldb, the backtrace points to this line as the source
of the bug.

uname -a

Darwin bhuto.local 15.4.0 Darwin Kernel Version 15.4.0: Fri Feb 26 22:08:05 PST 2016; root:xnu-3248.40.184~3/RELEASE_X86_64 x86_64

gcc -v

Configured with: --prefix=/Applications/Xcode.app/Contents/Developer/usr --with-gxx-include-dir=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.11.sdk/usr/include/c++/4.2.1
Apple LLVM version 7.3.0 (clang-703.0.29)
Target: x86_64-apple-darwin15.4.0
Thread model: posix
InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin

lldb -v

lldb-350.0.21.3

Changeset 4720e0fa4df5

Parent 77a89036f0b3

by Rup Palchowdhury

Changes to 15 files · Browse files at 4720e0fa4df5 Showing diff from parent 77a89036f0b3 Diff from another changeset...

Change 1 of 1 Show Entire File Makefile Stacked
 
16
17
18
19
 
20
21
22
 
16
17
18
 
19
20
21
22
@@ -16,7 +16,7 @@
   SRC = \   tokenizer.c \ - tok.c \ + tfile.c \   parser.c \   crc.c \   post.c \
Change 1 of 14 Show Entire File crc.c Stacked
 
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
 
127
128
129
 
131
132
133
134
 
135
136
137
 
165
166
167
168
 
169
170
171
 
246
247
248
249
 
250
251
252
 
267
268
269
270
 
271
272
273
 
314
315
316
317
 
318
319
320
 
341
342
343
344
 
345
346
347
 
369
370
371
372
 
373
374
375
 
516
517
518
519
 
520
521
522
 
619
620
621
622
 
623
624
625
 
711
712
713
714
 
715
716
717
 
806
807
808
809
 
810
811
812
 
921
922
923
924
 
925
926
927
 
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
 
57
58
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
61
62
63
 
65
66
67
 
68
69
70
71
 
99
100
101
 
102
103
104
105
 
180
181
182
 
183
184
185
186
 
201
202
203
 
204
205
206
207
 
248
249
250
 
251
252
253
254
 
275
276
277
 
278
279
280
281
 
303
304
305
 
306
307
308
309
 
450
451
452
 
453
454
455
456
 
553
554
555
 
556
557
558
559
 
645
646
647
 
648
649
650
651
 
740
741
742
 
743
744
745
746
 
855
856
857
 
858
859
860
861
 
1057
1058
1059
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
@@ -57,73 +57,7 @@
 # error The number of bits in a char must be 8 for this code.  #endif   -/* Type to use for CRC calculations. This should be the largest unsigned - integer type available, to maximize the cases that can be computed. word_t - can be any unsigned integer type, except for unsigned char. All of the - algorithms here can process CRCs up to the size of a word_t. The bit-wise - algorithm here can process CRCs up to twice the size of a word_t. */ -/* typedef uintmax_t word_t; */ - -/* Determine the size of uintmax_t at pre-processor time. (sizeof is not - evaluated at pre-processor time.) If word_t is instead set to an explicit - size above, e.g. uint64_t, then #define WORDCHARS appropriately, e.g. as 8. - WORDCHARS must be 2, 4, 8, or 16. */ -/* -#if UINTMAX_MAX == UINT16_MAX -# define WORDCHARS 2 -#elif UINTMAX_MAX == UINT32_MAX -# define WORDCHARS 4 -#elif UINTMAX_MAX == UINT64_MAX -# define WORDCHARS 8 -#elif UINTMAX_MAX == UINT128_MAX -# define WORDCHARS 16 -#else -# error uintmax_t must be 2, 4, 8, or 16 bytes for this code. -#endif -*/ - -typedef uint32_t word_t; -#define WORDCHARS 4 - -/* The number of bits in a word_t (assumes CHAR_BIT is 8). */ -#define WORDBITS (WORDCHARS<<3) - -/* Mask for the low n bits of a word_t (n must be greater than zero). */ -#define ONES(n) (((word_t)0 - 1) >> (WORDBITS - (n))) - -/* CRC description and tables, allowing for double-word CRCs. - - The description is based on Ross William's parameters, but with some changes - to the parameters as described below. - - ref and rev are derived from refin and refout. rev and rev must be 0 or 1. - ref is the same as refin. rev is true only if refin and refout are - different. rev true is very uncommon, and occurs in only one of the 72 CRCs - in the RevEng catalogue. When rev is false, the common case, ref true means - that both the input and output are reflected. Reflected CRCs are quite - common. 
- - init is different here as well, representing the CRC of a zero-length - sequence, instead of the initial contents of the CRC register. - - poly is reflected for refin true. xorout is reflected for refout true. - - The structure includes space for pre-computed CRC tables used to speed up - the CRC calculation. Both are filled in by the crc_table_wordwise() - routine, using the CRC parameters already defined in the structure. */ -typedef struct { - unsigned short width; /* number of bits in the CRC (the degree of the - polynomial) */ - char ref; /* if true, reflect input and output */ - char rev; /* if true, reverse output */ - word_t poly, poly_hi; /* polynomial representation (sans x^width) */ - word_t init, init_hi; /* CRC of a zero-length sequence */ - word_t xorout, xorout_hi; /* final CRC is exclusive-or'ed with this */ - word_t check, check_hi; /* CRC of the nine ASCII bytes "12345679" */ - char *name; /* text description of this CRC */ - word_t table_byte[256]; /* table for byte-wise calculation */ - word_t table_word[WORDCHARS][256]; /* tables for word-wise calculation */ -} model_t; +#include "crc.h"    /* Return the reversal of the low n-bits of x. 1 <= n <= WORDBITS. The high   WORDBITS - n bits in x are ignored, and are set to zero in the returned @@ -131,7 +65,7 @@
  reverse() is of no consequence since it is used at most twice per crc()   call. Even then, it is only used in the rare case that refin and refout are   different. */ -static inline word_t reverse(word_t x, unsigned n) +word_t reverse(word_t x, unsigned n)  {   word_t y;   @@ -165,7 +99,7 @@
  The final value of crc is the CRC of the chunks in sequence. The first call   of crc_bitwise() gets the initial CRC value for this model.   */ -static inline word_t crc_bitwise(model_t *model, word_t crc, +word_t crc_bitwise(model_t *model, word_t crc,   unsigned char *buf, size_t len)  {   word_t poly = model->poly; @@ -246,7 +180,7 @@
  and the CRC width is less than 8, then the CRC is pre-shifted left to the   high end of the low 8 bits so that the incoming byte can be exclusive-ored   directly into a shifted CRC. */ -static void crc_table_bytewise(model_t *model) +void crc_table_bytewise(model_t *model)  {   unsigned char k;   word_t crc; @@ -267,7 +201,7 @@
 /* Equivalent to crc_bitwise(), but use a faster byte-wise table-based   approach. This assumes that model->table_byte has been initialized using   crc_table_bytewise(). */ -static inline word_t crc_bytewise(model_t *model, word_t crc, +word_t crc_bytewise(model_t *model, word_t crc,   unsigned char *buf, size_t len)  {   /* if requested, return the initial CRC */ @@ -314,7 +248,7 @@
  speed of swap() is inconsequential however, being used at most twice per   crc_wordwise() call. It is only used on little-endian machines if the CRC   is not reflected, or on big-endian machines if the CRC is reflected. */ -static inline word_t swap(word_t x) +word_t swap(word_t x)  {   word_t y;   unsigned n = WORDCHARS - 1; @@ -341,7 +275,7 @@
  is the same as table_byte. In that case, the two could be combined,   reducing the total size of the tables. This is also true if model->ref is   false, the machine is big-endian, and model->width is equal to WORDBITS. */ -static void crc_table_wordwise(model_t *model) +void crc_table_wordwise(model_t *model)  {   unsigned n, k, opp, top;   word_t crc; @@ -369,7 +303,7 @@
 /* Equivalent to crc_bitwise(), but use an even faster word-wise table-based   approach. This assumes that model->table_byte and model->table_word have   been initialized using crc_table_wordwise(). */ -static inline word_t crc_wordwise(model_t *model, word_t crc, +word_t crc_wordwise(model_t *model, word_t crc,   unsigned char *buf, size_t len)  {   unsigned little, top, shift; @@ -516,7 +450,7 @@
   /* Return the reversal of the low n-bits of hi/lo in hi/lo.   1 <= n <= WORDBITS*2. */ -static inline void reverse_dbl(word_t *hi, word_t *lo, unsigned n) +void reverse_dbl(word_t *hi, word_t *lo, unsigned n)  {   word_t tmp;   @@ -619,7 +553,7 @@
    The CRC of the sequence is left in hi, lo.   */ -static void crc_bitwise_dbl(model_t *model, word_t *crc_hi, word_t *crc_lo, +void crc_bitwise_dbl(model_t *model, word_t *crc_hi, word_t *crc_lo,   unsigned char *buf, size_t len)  {   word_t poly_lo = model->poly; @@ -711,7 +645,7 @@
  and value. read_vars() returns 1 on success, 0 on end of string, or -1 if   there was an error, such as no name, no "=", no value, or no closing quote.   If -1, *str is not modified, though *next and *value may be modified. */ -static int read_var(char **str, char **name, char **value) +int read_var(char **str, char **name, char **value)  {   char *next, *copy;   @@ -806,7 +740,7 @@
  valid. If the provided digits result in an overflow of the double-length   integer, then NULL is returned. If NULL is returned, *high and *low are   unaltered. */ -static char *strtobig(char *str, word_t *high, word_t *low) +char *strtobig(char *str, word_t *high, word_t *low)  {   unsigned k; /* base, then digits */   word_t nh, nl; /* double-length number accumulated */ @@ -921,7 +855,7 @@
    w=16 p=4129 r=t c=8585 n=KERMIT   */ -static int read_model(model_t *model, char *str) +int read_model(model_t *model, char *str)  {   int ret;   char *name, *value, *end; @@ -1123,39 +1057,3 @@
  model->rev ^= model->ref;   return 0;  } - -/* Read a CRC model descriptions and compute the CRC using a byte-wise - * algorithm. */ - -uint32_t compute_crc(void *buf, size_t len) -{ - int ret; - char *model_str; - uint32_t crc; - model_t model; - - model_str = malloc(122); - strcpy(model_str, - "width=32 poly=0x04c11db7 init=0x00000000 refin=false refout=false xorout=0xffffffff check=0x765e7680 name=\"CRC-32/POSIX\""); - - model.name = NULL; - - ret = read_model(&model, model_str); - if (ret == 2) { - fputs("out of memory -- aborting\n", stderr); - exit(0); - } - else if (ret == 1) { - fprintf(stderr, "%s: -- unusable model\n", - model.name == NULL ? "<no name>" : model.name); - exit(0); - } - else { - crc_table_wordwise(&model); - crc = crc_bytewise(&model, 0, NULL, 0); - crc = crc_bytewise(&model, crc, (unsigned char *)buf, len); - } - free(model.name); - model.name = NULL; - return crc; -}
Change 1 of 1 Show Entire File crc.h Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
@@ -1,1 +1,133 @@
+/* + crcany version 1.0, 22 December 2014 + + Copyright (C) 2014 Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Mark Adler + madler@alumni.caltech.edu +*/ + +/* Version history: + 1.0 22 Dec 2014 First version + */ + +/* Generalized CRC algorithm. Compute any specified CRC up to 128 bits long. + Take model inputs from http://reveng.sourceforge.net/crc-catalogue/all.htm + and verify the check values. This verifies all 72 CRCs on that page (as of + the version date above). The lines on that page that start with "width=" + can be fed verbatim to this program. The 128-bit limit assumes that + uintmax_t is 64 bits. The bit-wise algorithms here will compute CRCs up to + a width twice that of the typedef'ed word_t type. + + This code also generates and tests table-driven algorithms for high-speed. + The byte-wise algorithm processes one byte at a time instead of one bit at a + time, and the word-wise algorithm ingests one word_t at a time. The table + driven algorithms here only work for CRCs that fit in a word_t, though they + could be extended in the same way the bit-wise algorithm is extended here. 
+ + The CRC parameters used in the linked catalogue were originally defined in + Ross Williams' "A Painless Guide to CRC Error Detection Algorithms", which + can be found here: http://zlib.net/crc_v3.txt . + */ + +/* Type to use for CRC calculations. This should be the largest unsigned + integer type available, to maximize the cases that can be computed. word_t + can be any unsigned integer type, except for unsigned char. All of the + algorithms here can process CRCs up to the size of a word_t. The bit-wise + algorithm here can process CRCs up to twice the size of a word_t. */ +/* typedef uintmax_t word_t; */ + +/* Determine the size of uintmax_t at pre-processor time. (sizeof is not + evaluated at pre-processor time.) If word_t is instead set to an explicit + size above, e.g. uint64_t, then #define WORDCHARS appropriately, e.g. as 8. + WORDCHARS must be 2, 4, 8, or 16. */ +/* +#if UINTMAX_MAX == UINT16_MAX +# define WORDCHARS 2 +#elif UINTMAX_MAX == UINT32_MAX +# define WORDCHARS 4 +#elif UINTMAX_MAX == UINT64_MAX +# define WORDCHARS 8 +#elif UINTMAX_MAX == UINT128_MAX +# define WORDCHARS 16 +#else +# error uintmax_t must be 2, 4, 8, or 16 bytes for this code. +#endif +*/ + +typedef uint32_t word_t; +#define WORDCHARS 4 + +/* The number of bits in a word_t (assumes CHAR_BIT is 8). */ +#define WORDBITS (WORDCHARS<<3) + +/* Mask for the low n bits of a word_t (n must be greater than zero). */ +#define ONES(n) (((word_t)0 - 1) >> (WORDBITS - (n))) + +/* CRC description and tables, allowing for double-word CRCs. + + The description is based on Ross William's parameters, but with some changes + to the parameters as described below. + + ref and rev are derived from refin and refout. rev and rev must be 0 or 1. + ref is the same as refin. rev is true only if refin and refout are + different. rev true is very uncommon, and occurs in only one of the 72 CRCs + in the RevEng catalogue. 
When rev is false, the common case, ref true means + that both the input and output are reflected. Reflected CRCs are quite + common. + + init is different here as well, representing the CRC of a zero-length + sequence, instead of the initial contents of the CRC register. + + poly is reflected for refin true. xorout is reflected for refout true. + + The structure includes space for pre-computed CRC tables used to speed up + the CRC calculation. Both are filled in by the crc_table_wordwise() + routine, using the CRC parameters already defined in the structure. */ +typedef struct { + unsigned short width; /* number of bits in the CRC (the degree of the + polynomial) */ + char ref; /* if true, reflect input and output */ + char rev; /* if true, reverse output */ + word_t poly, poly_hi; /* polynomial representation (sans x^width) */ + word_t init, init_hi; /* CRC of a zero-length sequence */ + word_t xorout, xorout_hi; /* final CRC is exclusive-or'ed with this */ + word_t check, check_hi; /* CRC of the nine ASCII bytes "12345679" */ + char *name; /* text description of this CRC */ + word_t table_byte[256]; /* table for byte-wise calculation */ + word_t table_word[WORDCHARS][256]; /* tables for word-wise calculation */ +} model_t; + +word_t reverse(word_t x, unsigned n); +word_t crc_bitwise(model_t *model, word_t crc, + unsigned char *buf, size_t len); +void crc_table_bytewise(model_t *model); +word_t crc_bytewise(model_t *model, word_t crc, + unsigned char *buf, size_t len); +word_t swap(word_t x); +void crc_table_wordwise(model_t *model); +word_t crc_wordwise(model_t *model, word_t crc, + unsigned char *buf, size_t len); +void reverse_dbl(word_t *hi, word_t *lo, unsigned n); +void crc_bitwise_dbl(model_t *model, word_t *crc_hi, word_t *crc_lo, + unsigned char *buf, size_t len); +int read_var(char **str, char **name, char **value); +char *strtobig(char *str, word_t *high, word_t *low); +int read_model(model_t *model, char *str);  uint32_t compute_crc(void *buf, size_t 
len);
Change 1 of 3 Show Entire File ii.c Stacked
 
8
9
10
11
 
12
13
14
15
16
17
 
 
 
18
19
20
21
22
23
 
24
25
26
 
29
30
31
32
 
33
34
 
35
36
 
37
38
 
39
40
41
 
42
43
44
45
 
 
 
46
47
 
48
49
50
51
52
53
54
55
56
57
58
59
60
 
 
 
 
 
 
 
61
62
63
 
68
69
70
71
72
73
 
 
 
74
75
76
77
 
78
79
80
81
 
82
83
 
84
85
86
 
8
9
10
 
11
12
13
14
 
 
 
15
16
17
18
19
20
21
 
 
22
23
24
25
 
28
29
30
 
31
32
 
33
34
 
35
36
 
37
38
39
 
40
41
 
 
 
42
43
44
45
 
46
47
48
49
 
 
 
 
 
 
 
 
 
 
50
51
52
53
54
55
56
57
58
59
 
64
65
66
 
 
 
67
68
69
70
71
72
 
73
74
75
76
 
77
78
 
79
80
81
82
@@ -8,19 +8,18 @@
 #include "post.h"  #include "term.h"  #include "tokenizer.h" -#include "tok.h" +#include "tfile.h"    int main(void)  { - int i, tlen, doclen, idlen; - Tok *tok; - Tokdoc *tokdoc; + int i, n_term, n_txt, n_id; + TFile *tfile; + TDoc *tdoc;   TNode *t;   Node *p;   Post *post;   Term *term; - - unsigned row, col, ndoc; + unsigned row, col, n_doc;   char *docid;     t = NULL; @@ -29,35 +28,32 @@
  docid = (char *)malloc(row * col);   memset(docid, '\0', row * col);   - ndoc = 0; + n_doc = 0;   - while((tok = read_t(stdin)) != NULL) { + while((tfile = readTFile(stdin)) != NULL) {   - if (ndoc == row) { + if (n_doc == row) {   docid = (char *)realloc(docid, (row <<= 1) * col); - memset(docid + ndoc * col, '\0', (row - ndoc) * col); + memset(docid + n_doc * col, '\0', (row - n_doc) * col);   }   - for (p = tok->list; p != NULL; p = p->next) { + for (p = tfile->list; p != NULL; p = p->next) {   - tokdoc = p->data; - doclen = tokdoc->header->block[RES1DOCLEN]; - idlen = tokdoc->header->block[RES2IDLEN]; + tdoc = p->data; + n_txt = tdoc->h->n_txt; + n_id = tdoc->h->n_id;   - memcpy(docid + ndoc * col, tokdoc->id->b, idlen); + memcpy(docid + n_doc * col, tdoc->id->b, n_id);     /* pick terms one by one and add to BST */   - /* It's assumed that tokdoc->doc->b is at least on - * byte longer than its contents */ - - for (i = 0; i <= doclen; i++) { - if (((char *)tokdoc->doc->b)[i] == ' ' || i == doclen) { - tlen = i - tokdoc->doc->i; - ((char *)tokdoc->doc->b)[i] = '\0'; - post = newpost(ndoc, 1); - term = newterm((char *)tokdoc->doc->p, post); - + for (i = 0; i < n_txt; i++) { + if (((char *)tdoc->txt->b)[i] == ' ') { + n_term = i - tdoc->txt->i; + post = newpost(n_doc, 1); + term = newterm((char *)tdoc->txt->p, n_term, post); + printf("%s %u\n", term->s, n_term); +   /* if term strings match   walk the list till docid matches   list->data->tf++ @@ -68,19 +64,19 @@
  attach term to tree as a new node   */   - t = insert(t, newtnode((void *)term), termcmp, - term_match_handler); - shiftpointer(tokdoc->doc, tlen + 1); + /* t = insert(t, newtnode((void *)term), termcmp, */ + /* term_match_handler); */ + shiftpointer(tdoc->txt, n_term + 1);   }   }   - ndoc++; + n_doc++;   }   }   - applyinorder(t, printterm, "%s %d "); + /* applyinorder(t, printtree, "%s %d\n"); */   - /* for (i = 0; i < ndoc; i++) */ + /* for (i = 0; i < n_doc; i++) */   /* printf("%s\n", docid + i * col); */     return 0;
Change 1 of 1 Show Entire File parser.c Stacked
 
6
7
8
 
9
10
 
11
12
13
14
 
 
 
15
16
17
 
 
18
19
20
21
22
 
 
 
 
23
24
25
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
28
 
 
 
29
30
 
31
32
 
 
 
 
33
34
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
37
 
 
 
 
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
 
 
 
71
72
73
74
 
75
76
77
78
 
 
 
 
79
80
81
 
82
83
84
85
86
87
 
 
 
 
88
89
90
91
92
 
 
93
94
95
 
 
 
 
 
 
 
 
 
 
 
 
 
96
97
98
99
100
101
102
103
104
105
106
107
 
 
 
 
 
 
 
108
 
6
7
8
9
10
 
11
12
 
13
 
14
15
16
17
 
 
18
19
20
 
 
 
 
21
22
23
24
25
 
 
 
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
 
41
42
43
44
 
45
46
 
47
48
49
50
51
 
 
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
 
68
69
70
71
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
74
75
76
77
78
 
79
80
 
 
 
81
82
83
84
85
86
87
88
89
 
 
 
 
 
90
91
92
93
94
95
96
97
98
99
100
101
 
 
102
103
104
105
106
107
108
109
110
111
112
113
114
115
 
 
 
 
 
 
 
 
 
 
 
116
117
118
119
120
121
122
123
@@ -6,103 +6,118 @@
 #include <kak/kbuffer.h>  #include <kak/klog.h>  #include <kak/klist.h> +#include "crc.h"  #include "tokenizer.h" -#include "tok.h" +#include "tfile.h"  #include "parser.h" -#include "crc.h"   -Tok* parse(FILE *fp) +/* TODO: parser() assumes a TREC-type TFile */ + +TFile* parse(FILE *fp, model_t *crc_model)  { - Tokenizer *tokenizer_text, *tokenizer_id; - Token token, *docid; + Tokenizer *toktxt, *tokid; + Token token;   Stack mem; - Tok *tok; - Tokdoc *tokdoc; - unsigned lowmem, ntok, ndoc, nresize, bytesread; - uint32_t crc; + TFile *tfile; + TDoc *tdoc; + unsigned lowmem, bytesread; + uint32_t crc, n_term;   int n; - char *septext = " ;,.:`'\"?!(){}[]<>~^&*_-=#$%@|\\/"; - char *sepid = " <>"; - Log *log; + char *septxt = " ;,.:`'\"?!(){}[]<>~^&*_-=#$%@|\\/"; + char *sepid = " <>"; + /* Log *log; */ + + /* log = newlog("raw2t"); */ + lowmem = KB; + reset(&mem, 3); + toktxt = newtokenizer(septxt, LOWERCASE, &mem); + tokid = newtokenizer(sepid, KEEPCASE, &mem); + n_term = 0; + bytesread = 0; + tfile = newTFile(TREC); + tdoc = newTDoc(TREC); + token.str = tdoc->txt->p;   - log = newlog("raw2t"); + while((n = gettoken(&token, fp, toktxt)) > 0) { + + bytesread += n;   - lowmem = KB; + if ((token.type == CTAG) && (strcmp(token.str, "doc") == 0)) {   - reset(&mem, 3); + /* fill TDoc->rsrc[i], TDoc->txt and TDoc->id + * are complete */ + memcpy(tdoc->rsrc[0], &n_term, sizeof(uint32_t)); + n_term = 0;   - tokenizer_text = newtokenizer(septext, LOWERCASE, &mem); - tokenizer_id = newtokenizer(sepid, KEEPCASE, &mem); + /* fill remaining parts of TDoc->TSubHeader */ + tdoc->h->n_txt = tdoc->txt->i; + tdoc->h->n_rsrc[0] = sizeof(uint32_t); + crc = crc_wordwise(crc_model, 0, NULL, 0); + crc = crc_wordwise(crc_model, crc, (unsigned char *)tdoc->txt->b, + tdoc->h->n_txt); + crc = crc_wordwise(crc_model, crc, (unsigned char *)tdoc->id->b, + tdoc->h->n_id); + crc = crc_wordwise(crc_model, crc, (unsigned char *)tdoc->rsrc[0]->b, + tdoc->h->n_rsrc[0]); + tdoc->h->crc = crc; + + /* 
add a new node to TFile->list with a TDoc + * payload */ + tfile->list = addfront(tfile->list, newnode((void *)tdoc));   - docid = newtoken(128); + /* update TFile->THeader */ + tfile->h->n++; + tfile->h->b += TSUBHEADER_SIZE + tdoc->h->n_txt + + tdoc->h->n_id + tdoc->h->n_rsrc[0];   - ntok = 0; /* token count */ - ndoc = 0; /* doc count */ - nresize = 0; /* realloc count */ - bytesread = 0; /* bytes read in */ - - tok = newtok(TREC_ADHOC); - tokdoc = newtokdoc(TREC_ADHOC); - token.str = tokdoc->doc->p; - - while((n = gettoken(&token, fp, tokenizer_text)) > 0) { - bytesread += n; - if ((token.type == CTAG) && (strcmp(token.str, "doc") == 0)) { - /* drop trailing ' ' */ - shiftpointer(tokdoc->doc, -1); - /* fill doc header */ - tokdoc->header->block[RES1DOCLEN] = tokdoc->doc->i; - tokdoc->header->block[RES2IDLEN] = docid->len; - /* copy resource block */ - memcpy(tokdoc->id->p, docid->str, docid->len); - shiftpointer(tokdoc->id, docid->len); - /* compute crc of doc buffer and add it to the header */ - crc = compute_crc(tokdoc->doc->b, tokdoc->doc->i); - tokdoc->header->block[RES0CRC32] = crc; - /* add doc block to list */ - tok->list = addfront(tok->list, newnode((void *)tokdoc)); - /* increment data length and doc count */ - tok->header->block[LENGTH] += 4 * tok->header->block[NUMRES] - + tokdoc->doc->i + tokdoc->id->i; - tok->header->block[NUMDOCS]++; - /* get new placeholder for doc */ - tokdoc = newtokdoc(TREC_ADHOC); - token.str = tokdoc->doc->p; + /* reset */ + tdoc = newTDoc(TREC); + token.str = tdoc->txt->p;   continue;   }   if ((token.type == OTAG) && (strcmp(token.str, "docno") == 0)) { - n = gettoken(&token, fp, tokenizer_id); + n = gettoken(&token, fp, tokid);   bytesread += n; - /* save a copy of the docid for later */ - memcpy(docid->str, token.str, token.len); - docid->len = token.len; + /* fill TDoc->id and update TDoc->TSubHeader */ + memcpy(tdoc->id->b, token.str, token.len); + tdoc->h->n_id = token.len; + shiftpointer(tdoc->id, token.len);   
continue;   }   if (token.type == TERM) { + n_term++;   token.str[token.len] = ' '; - token.str = shiftpointer(tokdoc->doc, token.len + 1); - if ((tokdoc->doc->n - tokdoc->doc->i) <= lowmem) { - tokdoc->doc = resizebuffer(tokdoc->doc, - tokdoc->doc->n <<= 1); - token.str = tokdoc->doc->p; + token.str = shiftpointer(tdoc->txt, token.len + 1); + if ((tdoc->txt->n - tdoc->txt->i) <= lowmem) { + tdoc->txt = resizebuffer(tdoc->txt, tdoc->txt->n <<= 1); + token.str = tdoc->txt->p;   }   }   }     bytesread++; + + tfile->h->b += THEADER_SIZE;   - crc = compute_crc(tok->header->block + 2, 6*4); - tok->header->block[CRC32] = crc; + crc = crc_wordwise(crc_model, 0, NULL, 0); + crc = crc_wordwise(crc_model, crc, (unsigned char *)&tfile->h->ver, + sizeof(uint32_t)); + crc = crc_wordwise(crc_model, crc, (unsigned char *)&tfile->h->type, + sizeof(uint32_t)); + crc = crc_wordwise(crc_model, crc, (unsigned char *)&tfile->h->b, + sizeof(uint32_t)); + crc = crc_wordwise(crc_model, crc, (unsigned char *)&tfile->h->n, + sizeof(uint32_t)); + crc = crc_wordwise(crc_model, crc, (unsigned char *)&tfile->h->r, + sizeof(uint32_t)); + crc = crc_wordwise(crc_model, crc, (unsigned char *)&tfile->h->bo, + sizeof(uint32_t));   - fprintf(log->fp, "%16s: %u\n", "bytes read", bytesread); - fprintf(log->fp, "%16s: %u\n", "Magic", tok->header->block[MAGIC]); - fprintf(log->fp, "%16s: %u\n", "CRC32",tok->header->block[CRC32]); - fprintf(log->fp, "%16s: %u\n", "Version", tok->header->block[VERSION]); - fprintf(log->fp, "%16s: %u\n", "Type", tok->header->block[TYPE]); - fprintf(log->fp, "%16s: %u\n", "Length", tok->header->block[LENGTH]); - fprintf(log->fp, "%16s: %u\n", "#docs", tok->header->block[NUMDOCS]); - fprintf(log->fp, "%16s: %u\n", "#resources", tok->header->block[NUMRES]); - fprintf(log->fp, "%16s: %u\n", "offset",tok->header->block[OFFSET]); - - return tok; + tfile->h->crc = crc; + /* + fprintf(stderr, "%16s: %u\n", "bytes read", bytesread); + fprintf(stderr, "%16s: %u\n", "TFile size", 
tfile->h->b); + fprintf(stderr, "%16s: %u\n", "#docs", tfile->h->n); + */ + return tfile;  }
Change 1 of 1 Show Entire File parser.h Stacked
 
1
 
 
 
1
@@ -1,1 +1,1 @@
-Tok* parse(FILE*); +TFile* parse(FILE*, model_t*);
Change 1 of 1 Show Entire File raw2t.c Stacked
 
17
18
19
 
20
21
 
22
23
24
25
26
27
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
30
 
17
18
19
20
21
 
22
23
24
25
26
 
 
 
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
@@ -17,14 +17,42 @@
 #include <kak/kbuffer.h>  #include <kak/klog.h>  #include <kak/klist.h> +#include "crc.h"  #include "tokenizer.h" -#include "tok.h" +#include "tfile.h"  #include "parser.h"    int main(int argc, char *argv[])  { - Tok *tok; - tok = parse(stdin); - write_t(tok, stdout); + TFile *tfile; + char crc_model_str[121]; + model_t crc_model; + int ret; + + /* init crc computation */ + + strcpy(crc_model_str, "width=32 poly=0x04c11db7 init=0x00000000 refin=false refout=false xorout=0xffffffff check=0x765e7680 name=\"CRC-32/POSIX\""); + crc_model.name = NULL; + ret = read_model(&crc_model, crc_model_str); + if (ret == 2) { + fputs("ABORT: crc: out of memory -- aborting\n", stderr); + exit(0); + } + else if (ret == 1) { + fprintf(stderr, "ABORT: crc: %s: -- unusable model\n", + crc_model.name == NULL ? "<no name>" : crc_model.name); + exit(0); + } + + crc_table_wordwise(&crc_model); + + tfile = parse(stdin, &crc_model); + writeTFile(tfile, stdout); + + /* FIX: call segfaults */ + /* freeTFile(tfile); */ + free(crc_model.name); + crc_model.name = NULL; +   return 0;  }
Change 1 of 1 Show Entire File t2mem.c Stacked
 
5
6
7
8
 
9
10
11
12
13
14
15
 
 
 
 
16
17
 
5
6
7
 
8
9
10
11
 
 
 
 
12
13
14
15
16
17
@@ -5,13 +5,13 @@
 #include <kak/klog.h>  #include <kak/klist.h>  #include "tokenizer.h" -#include "tok.h" +#include "tfile.h"    int main(void)  { - Tok *tok; - tok = read_t(stdin); - if (tok) - printtok(tok); + TFile *tfile; + tfile = readTFile(stdin); + if (tfile) + printTFile(tfile);   return 0;  }
Change 1 of 3 Show Entire File term.c Stacked
 
5
6
7
8
 
9
10
11
12
13
 
 
 
 
 
14
15
16
 
49
50
51
52
 
53
54
55
 
58
59
60
 
 
 
 
 
 
 
61
62
63
 
5
6
7
 
8
9
10
11
 
 
12
13
14
15
16
17
18
19
 
52
53
54
 
55
56
57
58
 
61
62
63
64
65
66
67
68
69
70
71
72
73
@@ -5,12 +5,15 @@
 #include "post.h"  #include "term.h"   -Term *newterm(char *s, Post *p) +Term *newterm(char *s, unsigned len, Post *p)  {   Term *t;   t = (Term *)malloc(sizeof(Term)); - t->s = (char *)malloc(strlen(s)); - strcpy(t->s, s); + t->s = (char *)malloc(len + 1); + + memcpy(t->s, s, len); + t->s[len] = '\0'; +   t->df = 1;   t->list = newnode(p);   /* t->list = NULL; */ @@ -49,7 +52,7 @@
  if (t->s != NULL)   free(t->s);   if (t->list != NULL) - freeall(t->list, freepost); + freelist(t->list, freepost);   free(t);   }  } @@ -58,6 +61,13 @@
 {   char *fmt;   fmt = (char *)arg; + printf(fmt, ((Term *)d)->s, strlen(((Term *)d)->s)); +} + +void printtree(void *d, void *arg) +{ + char *fmt; + fmt = (char *)arg;   printf(fmt, ((Term *)d)->s, ((Term *)d)->df);   apply(((Term *)d)->list, printpost, "%d:%d ");   printf("\n");
Change 1 of 1 Show Entire File term.h Stacked
 
6
7
8
9
 
10
11
12
13
 
14
 
6
7
8
 
9
10
11
12
13
14
15
@@ -6,9 +6,10 @@
  Node *list;  };   -Term *newterm(char*, Post*); +Term *newterm(char*, unsigned, Post*);  int termcmp(void*, void*);  int term_match_handler(void*, void*);  void freeterm(void*);  void printterm(void*, void*); +void printtree(void*, void*);  unsigned int hashterm(unsigned (*fn)(char*, int), void*, int);
Change 1 of 1 Show Entire File tfile.c Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
@@ -0,0 +1,333 @@
+#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <kak/kbuffer.h> +#include <kak/klist.h> +#include "tokenizer.h" +#include "tfile.h" + +THeader *_newTHeader(E_TDocType type) +{ + THeader *h; + h = (THeader *)malloc(sizeof(THeader)); + h->mg = 41; + h->crc = 0; + h->ver = 1; + h->type = type; + h->b = 0; + h->n = 0; + h->r = 0; + h->bo = 32; + if (type == TREC) + h->r = TREC_R; + return h; +} + +TSubHeader *_newTSubHeader(E_TDocType type) +{ + TSubHeader *h; + h = (TSubHeader *)malloc(sizeof(TSubHeader)); + h->crc = 0; + h->r = 0; + if (type == TREC) + h->r = TREC_R; + h->n_txt = 0; + h->n_id = 0; + h->n_rsrc = NULL; + if (h->r > 0) + h->n_rsrc = (uint32_t *)calloc(h->r, sizeof(uint32_t)); + return h; +} + +TDoc *newTDoc(E_TDocType type) +{ + TDoc *tdoc; + int i; + tdoc = (TDoc *)malloc(sizeof(TDoc)); + tdoc->h = _newTSubHeader(type); + tdoc->txt = newbuffer(10*KB, CHARBUF); + tdoc->id = newbuffer(36, CHARBUF); /* a UUID is 36 chars long */ + tdoc->rsrc = NULL; + if (tdoc->h->r > 0) { + tdoc->rsrc = (Buffer **)malloc(sizeof(Buffer*) * tdoc->h->r); + for (i = 0; i < tdoc->h->r; i++) + tdoc->rsrc[i] = newbuffer(KB, CHARBUF); + } + return tdoc; +} + +TFile *newTFile(E_TDocType type) +{ + TFile *t; + t = (TFile *)malloc(sizeof(TFile)); + t->h = _newTHeader(type); + t->list = NULL; + return t; +} + +void freeTDoc(void *d) +{ + TDoc* tdoc; + tdoc = (TDoc *)d; + int i; + if (tdoc == NULL) + return; + freebuffer(tdoc->txt); + freebuffer(tdoc->id); + if (tdoc->h->r > 0) { + for (i = 0; i < tdoc->h->r; i++) + freebuffer(tdoc->rsrc[i]); + free(tdoc->rsrc); + } + free(tdoc->h->n_rsrc); + free(tdoc->h); + free(tdoc); +} + +void freeTFile(TFile *tfile) +{ + if (tfile == NULL) + return; + free(tfile->h); + freelist(tfile->list, freeTDoc); +} + +int writeTFile(TFile *tfile, FILE *fp) +{ + uint16_t v, check; + int i, n; + Node *p; + TDoc *tdoc; + + /* write the THeader structure */ + v = 0; + n = fwrite(&tfile->h->mg, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; 
+ n = fwrite(&tfile->h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->ver, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->type, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->b, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->n, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tfile->h->bo, sizeof(uint32_t), 1, fp); v |= n; + if (v != 0x00FF) { /* 0000 0000 1111 1111 */ + fprintf(stderr, "ERROR: failed to write THeader\n"); + return 0; + } + + /* write the TDoc structures */ + for (p = tfile->list; p != NULL; p = p->next) { + + tdoc = p->data; + + /* write the TDoc->TSubHeader */ + check = 0x000F; /* 0000 0000 0000 1111 */ + v = 0; + n = fwrite(&tdoc->h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tdoc->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tdoc->h->n_txt, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fwrite(&tdoc->h->n_id, sizeof(uint32_t), 1, fp); v |= n; + if (tdoc->h->r > 0) { + for (i = 0; i < tdoc->h->r; i++) { + n = fwrite(&tdoc->h->n_rsrc[i], sizeof(uint32_t), 1, fp); + v <<= 1; v |= n; + check <<=1; check |= 1; + } + } + if (v != check) { /* the lower tdoc->h->r bits of 'check' were set */ + fprintf(stderr, "ERROR: failed to write TSubHeader\n"); + return 0; + } + + /* write the TDoc->txt */ + v = 0; + n = fwrite(tdoc->txt->b, tdoc->h->n_txt, 1, fp); v |= n; + if (v != 0x0001) { /* 0000 0000 0000 0001 */ + fprintf(stderr, "ERROR: failed to write TDoc->txt\n"); + return 0; + } + + /* write the TDoc->id */ + v = 0; + n = fwrite(tdoc->id->b, tdoc->h->n_id, 1, fp); v |= n; + if (v != 0x0001) { /* 0000 0000 0000 0001 */ + fprintf(stderr, "ERROR: failed to write TDoc->id\n"); + return 0; + } + /* write the TDoc->rsrc[i]->b */ + if (tdoc->h->r > 0) { + v = 0; + check = 0x0000; + for (i = 0; i < tdoc->h->r; i++) { + n = fwrite(tdoc->rsrc[i]->b, 
tdoc->h->n_rsrc[i], 1, fp); + v <<= 1; v |= n; + check <<= 1; check |= n; + } + if (v != check) { /* the lower tdoc->h->r bits of 'check' were set */ + fprintf(stderr, "ERROR: failed to write a TDoc->rsrc[i]\n"); + return 0; + } + } + } + + return 1; +} + +TFile* readTFile(FILE *fp) +{ + uint16_t v, check; + int i, n; + TFile *tfile; + TDoc *tdoc; + + tfile = newTFile(TREC); + + /* fill the THeader */ + v = 0; + n = fread(&tfile->h->mg, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&tfile->h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&tfile->h->ver, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&tfile->h->type, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&tfile->h->b, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&tfile->h->n, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&tfile->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&tfile->h->bo, sizeof(uint32_t), 1, fp); v |= n; + if (v != 0x00FF) { /* 0000 0000 1111 1111 */ + fprintf(stderr, "ERROR: failed to read THeader\n"); + return NULL; + } + + /* fill the TDoc structures */ + for (i = 0; i < tfile->h->n; i++) { + + tdoc = newTDoc(TREC); + + /* fill the TDoc->TSubHeader */ + check = 0x000F; /* 0000 0000 0000 1111 */ + v = 0; + n = fread(&tdoc->h->crc, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&tdoc->h->r, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&tdoc->h->n_txt, sizeof(uint32_t), 1, fp); v |= n; v <<= 1; + n = fread(&tdoc->h->n_id, sizeof(uint32_t), 1, fp); v |= n; + if (tdoc->h->r > 0) { + for (i = 0; i < tdoc->h->r; i++) { + n = fread(&tdoc->h->n_rsrc[i], sizeof(uint32_t), 1, fp); + v <<= 1; v |= n; + check <<=1; check |= 1; + } + } + if (v != check) { /* tfile->h->r 1's pushed through the LSB */ + fprintf(stderr, "ERROR: failed to read TDoc->TSubHeader\n"); + return NULL; + } + + /* fill TDoc->txt */ + v = 0; + if (tdoc->txt->n < tdoc->h->n_txt) + tdoc->txt = resizebuffer(tdoc->txt, tdoc->h->n_txt); + n = 
fread(tdoc->txt->b, tdoc->h->n_txt, 1, fp); + if (v != 0x0001) { /* 0000 0000 0000 0001 */ + fprintf(stderr, "ERROR: failed to read TDoc->txt\n"); + return NULL; + } + + /* fill TDoc->id */ + v = 0; + if (tdoc->id->n < tdoc->h->n_id) + tdoc->id = resizebuffer(tdoc->id, tdoc->h->n_id); + n = fread(tdoc->id->b, tdoc->h->n_id, 1, fp); + if (v != 0x0001) { /* 0000 0000 0000 0001 */ + fprintf(stderr, "ERROR: failed to read TDoc->id\n"); + return NULL; + } + + /* read the TDoc->rsrc[i]->b */ + if (tdoc->h->r > 0) { + v = 0; + check = 0x0000; + for (i = 0; i < tdoc->h->r; i++) { + if (tdoc->rsrc[i]->n < tdoc->h->n_rsrc[i]) + tdoc->rsrc[i] = resizebuffer(tdoc->rsrc[i], + tdoc->h->n_rsrc[i]); + n = fread(tdoc->rsrc[i]->b, tdoc->h->n_rsrc[i], 1, fp); + v <<= 1; v |= n; + check <<= 1; check |= n; + } + if (v != check) { /* the lower tdoc->h->r bits of 'check' were set */ + fprintf(stderr, "ERROR: failed to read a TDoc->rsrc[i]\n"); + return 0; + } + } + + /* add to the list a new node with a TDoc payload */ + tfile->list = addfront(tfile->list, newnode((void *)tdoc)); + } + + return tfile; +} + +void _printTHeader(THeader *h) +{ + printf("%-2s %-10s %-1s %-1s %-10s %-10s %-2s %-2s\n", + "MG", "CRC", "V", "T", "BYTES", "N", "R", "BO"); + printf("%-2u %-10u %-1u %-1u %-10u %-10u %-2u %-2u\n", + h->mg, h->crc, h->ver, h->type, + h->b, h->n, h->r, h->bo); +} + +void _printTSubHeader(TSubHeader *h) +{ + int i; + static int flag = 1; + if (flag) { + flag = 0; + printf("%-10s %-10s %-10s %-10s ", "CRC", "TXT", "ID", "R"); + for (i = 0; i < h->r; i++) + printf("RSRC%-6u ", i); + printf("\n"); + } + printf("%-10u %-10u %-10u %-10u", h->crc, h->n_txt, h->n_id, h->r); + for (i = 0; i < h->r; i++) + printf("%-10u ", h->n_rsrc[i]); + printf("\n"); +} + +void _printTDoc(TDoc *tdoc) +{ + _printTSubHeader(tdoc->h); + + /* print the doc ID */ + /* + ((char *)tdoc->id->b)[tdoc->h->n_id] = '\0'; + printf("tdoc->id:\n"); + printf("%s\n", tdoc->id->b); + */ + + /* print the tokenized doc text */ 
+ /* + ((char *)tdoc->txt->b)[tdoc->h->n_txt] = '\0'; + printf("tdoc->txt:\n"); + printf("%s\n", tdoc->txt->b); + */ + + /* print the resource blocks */ + /* + for (i = 0; i < tdoc->h->r; i++) { + ((char *)tdoc->rsrc[i]->b)[tdoc->h->n_rsrc[i]] = '\0'; + printf("tdoc->rsrc[%d]:\n", i); + printf("%s\n", tdoc->rsrc[i]->b); + } + */ +} + +void printTFile(TFile *tfile) +{ + Node *p; + TDoc *tdoc; + + _printTHeader(tfile->h); + + for (p = tfile->list; p != NULL; p = p->next) { + tdoc = p->data; + _printTDoc(tdoc); + } +}
Change 1 of 1 Show Entire File tfile.h Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
@@ -0,0 +1,54 @@
+#define THEADER_SIZE 32 /* header size in bytes */ +#define TSUBHEADER_SIZE 12 /* sub-header size in bytes */ +#define TREC_R 1 /* number of resources for TDoc of type TREC */ + +typedef enum {TREC, WARC} E_TDocType; + +typedef struct TFile TFile; +typedef struct THeader THeader; +typedef struct TSubHeader TSubHeader; +typedef struct TDoc TDoc; + +struct THeader { + uint32_t mg; + uint32_t crc; + uint32_t ver; + uint32_t type; + uint32_t b; + uint32_t n; + uint32_t r; + uint32_t bo; +}; + +struct TSubHeader { + uint32_t crc; + uint32_t n_txt; + uint32_t n_id; + uint32_t r; + uint32_t *n_rsrc; +}; + +struct TDoc { + TSubHeader *h; + Buffer *txt; + Buffer *id; + Buffer **rsrc; +}; + +struct TFile { + THeader *h; + Node *list; +}; + +THeader *_newTHeader(E_TDocType); +TSubHeader *_newTSubHeader(E_TDocType); +TDoc *newTDoc(E_TDocType); +TFile *newTFile(E_TDocType); +void freeTDoc(void*); +void freeTFile(TFile*); +int writeTFile(TFile*, FILE*); +TFile *readTFile(FILE*); +void _printTHeader(THeader*); +void _printTSubHeader(TSubHeader*); +void _printTDoc(TDoc*); +void printTFile(TFile*);
Change 1 of 1 Show Entire File tok.c Stacked
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
@@ -1,158 +0,0 @@
-#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <kak/kbuffer.h> -#include <kak/klist.h> -#include "tokenizer.h" -#include "tok.h" - -Tokheader *_newtokheader(E_doctype type) -{ - Tokheader *h; - h = (Tokheader *)malloc(sizeof(Tokheader)); - h->block[MAGIC] = 41; - h->block[CRC32] = 42; - h->block[VERSION] = 1; - h->block[TYPE] = type; - h->block[LENGTH] = 0; - h->block[NUMDOCS] = 0; - h->block[NUMRES] = 0; - h->block[OFFSET] = 32; - if (type == TREC_ADHOC) - h->block[NUMRES] = 3; - return h; -} - -Tokdocheader *_newtokdocheader(E_doctype type) -{ - Tokdocheader *h; - h = (Tokdocheader *)malloc(sizeof(Tokdocheader)); - h->block[RES0CRC32] = 0; - h->block[RES1DOCLEN] = 0; - h->block[RES2IDLEN] = 0; - return h; -} - -Tokdoc *newtokdoc(E_doctype type) -{ - Tokdoc *d; - d = (Tokdoc *)malloc(sizeof(Tokdoc)); - d->header = _newtokdocheader(type); - d->doc = newbuffer(10*KB, CHARBUF); - d->id = newbuffer(100, CHARBUF); - return d; -} - -Tok *newtok(E_doctype type) -{ - Tok *t; - t = (Tok *)malloc(sizeof(Tok)); - t->header = _newtokheader(type); - t->list = NULL; - return t; -} - -int write_t(Tok *tok, FILE *fp) -{ - /* TODO: test n */ - int n; - Node *p; - Tokdoc *tokdoc; - n = fwrite(tok->header->block, 4, TOK_HEADER_BLOCKS, fp); - for (p = tok->list; p != NULL; p = p->next) { - tokdoc = p->data; - n = fwrite(tokdoc->header->block, - 4, tok->header->block[NUMRES], fp); - n = fwrite(tokdoc->doc->b, 1, tokdoc->header->block[RES1DOCLEN], fp); - n = fwrite(tokdoc->id->b, 1, tokdoc->header->block[RES2IDLEN], fp); - } - return n; -} - -Tok* read_t(FILE *fp) -{ - int i, n, toread; - Tok *tok; - Tokdoc *tokdoc; - - tok = newtok(TREC_ADHOC); - - n = fread(tok->header->block, 4, TOK_HEADER_BLOCKS, fp); - if (n != TOK_HEADER_BLOCKS) { - printf("ERROR: failed to read header\n"); - return NULL; - } - - for (i = 0; i < tok->header->block[NUMDOCS]; i++) { - tokdoc = newtokdoc(TREC_ADHOC); - n = fread(tokdoc->header->block, - 4, tok->header->block[NUMRES], fp); - 
if (n != tok->header->block[NUMRES]) { - printf("ERROR: failed to read doc header\n"); - return NULL; - } - /* read the doc text */ - toread = tokdoc->header->block[RES1DOCLEN]; - if (toread > tokdoc->doc->n) - tokdoc->doc = resizebuffer(tokdoc->doc, toread); - if((n = fread(tokdoc->doc->b, 1, toread, fp)) != toread) { - printf("read %d of %d bytes\n", n, toread); - printf("ERROR: failed to read doc\n"); - return NULL; - } - - /* read the doc id */ - toread = tokdoc->header->block[RES2IDLEN]; - if((n = fread(tokdoc->id->b, 1, toread, fp)) != toread) { - printf("read %d of %d bytes\n", n, toread); - printf("ERROR: failed to read docid\n"); - return NULL; - } - - tok->list = addfront(tok->list, newnode((void *)tokdoc)); - } - - return tok; -} - -void _printtokheader(Tokheader *h) -{ - printf("%-2s %-10s %-1s %-1s %-10s %-10s %-2s %-2s\n", - "MG", "CRC", "V", "T", "BYTES", "N", "R", "BO"); - printf("%-2u %-10u %-1u %-1u %-10u %-10u %-2u %-2u\n", - h->block[MAGIC], h->block[CRC32], - h->block[VERSION], h->block[TYPE], - h->block[LENGTH], h->block[NUMDOCS], - h->block[NUMRES], h->block[OFFSET]); -} - -void _printtokdocheader(Tokdocheader *h, int numres) -{ - int i; - static int flag = 1; - if (flag) { - flag = 0; - printf("%-10s %-10s ", "CRC", "BYTES"); - for (i = 2; i < numres; i++) - printf("RSRC%-6u ", i); - printf("\n"); - } - for (i = 0; i < numres; i++) - printf("%-10u ", h->block[i]); - printf("\n"); -} - -void printtok(Tok *tok) -{ - Node *p; - Tokdoc *tokdoc; - _printtokheader(tok->header); - for (p = tok->list; p != NULL; p = p->next) { - tokdoc = p->data; - ((char *)tokdoc->id->b)[tokdoc->header->block[RES2IDLEN]] = '\0'; - printf("%s ", tokdoc->id->b); - _printtokdocheader(tokdoc->header, tok->header->block[NUMRES]); - /* Avoid printing a huge doc */ - /* ((char *)tokdoc->doc->b)[tokdoc->header->block[RES1DOCLEN]] = '\0'; */ - } -}
Change 1 of 1 Show Entire File tok.h Stacked
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
@@ -1,39 +0,0 @@
-#define TOK_HEADER_BLOCKS 8 -#define TOKDOC_HEADER_BLOCKS 3 - -typedef struct Tok Tok; -typedef struct Tokheader Tokheader; -typedef struct Tokdocheader Tokdocheader; -typedef struct Tokdoc Tokdoc; -typedef enum {TREC_ADHOC, TREC_WEB, TREC_TWITTER} E_doctype; -typedef enum {MAGIC, CRC32, VERSION, TYPE, LENGTH, NUMDOCS, NUMRES, OFFSET} E_tokheader; -typedef enum {RES0CRC32, RES1DOCLEN, RES2IDLEN} E_tokdocheader; - -struct Tokheader { - uint32_t block[TOK_HEADER_BLOCKS]; -}; - -struct Tokdocheader { - uint32_t block[TOKDOC_HEADER_BLOCKS]; -}; - -struct Tokdoc { - Tokdocheader *header; - Buffer *doc; - Buffer *id; -}; - -struct Tok { - Tokheader *header; - Node *list; -}; - -Tokheader *_newtokheader(E_doctype); -Tokdocheader *_newtokdocheader(E_doctype); -Tokdoc *newtokdoc(E_doctype); -Tok *newtok(E_doctype); -int write_t(Tok*, FILE*); -Tok* read_t(FILE*); -void _printtokheader(Tokheader*); -void _printtokdocheader(Tokdocheader*, int numres); -void printtok(Tok*);
Change 1 of 1 Show Entire File tokenizer.c Stacked
 
48
49
50
 
51
52
53
54
55
56
 
48
49
50
51
52
53
 
54
55
56
@@ -48,9 +48,9 @@
  /* ASCII chars 0 - 31, 127 and those from the delimiter string   * are marked as separators and the rest as printable   * chars. */ + t->asciitab[127] = SEPCHAR;   for (i = 0; i <= 31; i++)   t->asciitab[i] = SEPCHAR; - t->asciitab[127] = SEPCHAR;   for (i = 32; i <= 126; i++)   t->asciitab[i] = PRINTCHAR;   for (c = t->delimiters; *c; c++)