Repositories » TXT0
Clone URL:  
Pushed to one repository · View In Graph Contained in v0.1 and tip

Removed the ASCII control characters (\n, \r, \t) from the separator strings
in parser(). It is cleaner to handle these with an ASCII lookup table in
tokenizer.c. First, all control (non-printable) characters in the table are
marked as separators; next, the printable characters are marked as printable;
finally, those printable characters that appear in the separator string are
re-marked in the table as separators. This ensures that all control
characters and all characters from the delimiter string are treated as
separators when an incoming character is checked against this table.

Changeset 77a89036f0b3

Parent b6965546b4eb

by Rup Palchowdhury

Changes to 3 files · Browse files at 77a89036f0b3 Showing diff from parent b6965546b4eb Diff from another changeset...

Change 1 of 1 Show Entire File parser.c Stacked
 
21
22
23
24
25
 
 
26
27
28
 
21
22
23
 
 
24
25
26
27
28
@@ -21,8 +21,8 @@
  unsigned lowmem, ntok, ndoc, nresize, bytesread;   uint32_t crc;   int n; - char *septext = " \t\r\n;,.:`'\"?!(){}[]<>~^&*_-=#$%@|\\/"; - char *sepid = " \t\r\n<>"; + char *septext = " ;,.:`'\"?!(){}[]<>~^&*_-=#$%@|\\/"; + char *sepid = " <>";   Log *log;     log = newlog("raw2t");
Change 1 of 3 Show Entire File tokenizer.c Stacked
 
41
42
43
44
 
45
46
47
48
49
50
51
 
 
 
 
 
 
 
 
 
 
 
52
53
54
 
75
76
77
78
 
79
80
81
 
107
108
109
110
 
111
112
113
 
41
42
43
 
44
45
46
 
 
 
 
 
47
48
49
50
51
52
53
54
55
56
57
58
59
60
 
81
82
83
 
84
85
86
87
 
113
114
115
 
116
117
118
119
@@ -41,14 +41,20 @@
 {   Tokenizer *t;   int i; - char *s; + char *c;   t = (Tokenizer *)malloc(sizeof(Tokenizer));   strcpy(t->delimiters, delimiters); - for (i = 0; i < NDELIMS; i++) - t->dtab[i] = 0; - s = t->delimiters; - for (; *s; s++) - t->dtab[(int)(*s)] = 1; + memset(t->asciitab, SEPCHAR, sizeof(int) * ASCII); + /* ASCII chars 0 - 31, 127 and those from the delimiter string + * are marked as separators and the rest as printable + * chars. */ + for (i = 0; i <= 31; i++) + t->asciitab[i] = SEPCHAR; + t->asciitab[127] = SEPCHAR; + for (i = 32; i <= 126; i++) + t->asciitab[i] = PRINTCHAR; + for (c = t->delimiters; *c; c++) + t->asciitab[(int)(*c)] = SEPCHAR;   t->mem = mem;   t->casechange = casechange;   return t; @@ -75,7 +81,7 @@
  s = tok->str;   while ((n = fread(&c, 1, 1, fp)) > 0) {   bytes += n; - if (t->dtab[(int)c] == 1) { + if (t->asciitab[(int)c] == SEPCHAR) {   push(t->mem, &c);   if (len == 0)   continue; @@ -107,7 +113,7 @@
  static int len = 0;   n = 0;   for (i = 0; i < size; i++, src++) { - if (t->dtab[(int)(*src)] == 1) { + if (t->asciitab[(int)(*src)] == 1) {   if (len > 0) {   *dest = ' ';   dest++;
Change 1 of 2 Show Entire File tokenizer.h Stacked
 
1
 
2
3
4
5
6
7
 
8
9
10
 
16
17
18
19
20
 
 
21
22
23
 
 
1
2
3
4
5
6
7
8
9
10
11
 
17
18
19
 
 
20
21
22
23
24
@@ -1,10 +1,11 @@
-#define NDELIMS 128 +#define ASCII 128  #define KB 1024  #define MB 1048576  #define GB 1073741824    enum {KEEPCASE, LOWERCASE};  enum {TERM, OTAG, CTAG}; +typedef enum {CTRLCHAR = 1, PRINTCHAR, SEPCHAR} E_asciitype;  typedef struct Tokenizer Tokenizer;  typedef struct Token Token;  typedef struct Stack Stack; @@ -16,8 +17,8 @@
 };    struct Tokenizer { - char delimiters[NDELIMS]; - int dtab[NDELIMS]; + char delimiters[ASCII]; + int asciitab[ASCII];   Stack *mem;   int casechange;  };