Repositories » TXT0
Clone URL:  
Pushed to one repository · View In Graph Contained in tip

raw2t, t2mem and ii work without glitches at this point. Removing the
portions in ii where the docid was being stored in an array fixed

issues with corrupted pointers. Storing the docids still needs to be worked out.

Changeset bced1df4033b

Parent 8b895f063105

by Rup Palchowdhury

Changes to 5 files · Browse files at bced1df4033b Showing diff from parent 8b895f063105 Diff from another changeset...

Change 1 of 2 Show Entire File ii.c Stacked
 
1
2
3
 
4
5
6
 
11
12
13
14
 
 
15
16
17
 
18
19
20
21
22
 
 
23
24
 
 
25
26
27
28
29
30
 
31
32
33
34
35
36
37
38
 
 
39
40
41
42
 
43
44
45
46
47
48
49
50
51
52
53
54
55
 
 
56
57
 
58
59
60
61
 
 
62
63
 
64
65
66
 
67
68
69
 
 
 
 
 
 
 
 
 
 
 
 
70
71
72
73
74
75
 
 
 
 
76
77
78
 
79
80
81
82
83
84
 
85
86
87
88
89
90
 
1
2
3
4
5
6
7
 
12
13
14
 
15
16
17
18
 
19
20
21
22
 
 
23
24
25
 
26
27
28
 
 
 
 
 
29
30
 
 
 
 
 
 
 
31
32
33
34
35
 
36
37
38
 
 
39
 
 
 
40
41
42
 
 
43
44
45
 
46
47
 
 
 
48
49
50
 
51
52
53
 
54
55
56
 
57
58
59
60
61
62
63
64
65
66
67
68
69
70
 
 
 
 
71
72
73
74
75
76
77
78
79
 
80
81
82
 
83
84
 
 
 
85
86
@@ -1,6 +1,7 @@
 #include <stdio.h>  #include <stdlib.h>  #include <string.h> +#include <assert.h>  #include <kak/klog.h>  #include <kak/klist.h>  #include <kak/ktree.h> @@ -11,80 +12,75 @@
   int main(void)  { - int len, n_txt, n_id, i, i_; + /* TODO: store docids */ + int i_;   TFile *tfile;   TDoc *tdoc; - TNode *t; + TNode *tree;   Node *p, *next;   Post *post;   Term *term; - unsigned row, col, n_doc; - char *docid, *p_txt; + unsigned n_doc; + char *p_txt;   - t = NULL; + tree = NULL; + n_doc = 0;   - row = 100; col = 15; - docid = (char *)malloc(row * col); - memset(docid, '\0', row * col); - - n_doc = 0; + tfile = readTFile(stdin);   - while((tfile = readTFile(stdin)) != NULL) { - - if (n_doc == row) { - docid = (char *)realloc(docid, (row <<= 1) * col); - memset(docid + n_doc * col, '\0', (row - n_doc) * col); - } - + if (tfile != NULL) { +   for (p = tfile->list; p != NULL; p = next) {     fprintf(stderr, "%d/%d\r", n_doc, tfile->h->n); - +   next = p->next;   tdoc = p->data; - n_txt = tdoc->h->n_txt; - n_id = tdoc->h->n_id;   - memcpy(docid + n_doc * col, tdoc->id, n_id); - - len = 0;   i_ = 0;   p_txt = tdoc->txt;   - /* pick terms one by one and add to BST */ - for (i = 0; i < n_txt; i++) { + for (int i = 0; i < tdoc->h->n_txt; i++) { +   if (tdoc->txt[i] == ' ') { - len = i - i_; +   post = newpost(n_doc, 1); - term = newterm(p_txt, len, post); - /* printf("%s %u\n", term->s, len); */ - + term = newterm(p_txt, i - i_, post); +   /* if term exists - if doc matches front of list + if doc exists at front of list   post->tf++   else - add new post to front of list + add new doc at front of list   term->df++   else - attach term to tree + attach node with term to tree + */ + + tree = nrinsert(tree, newtnode((void *)term), termcmp, + term_match_handler); + + i_ = i + 1; + p_txt = &(tdoc->txt[i + 1]); + + /* + 2 = both term and post matched + 1 = term matched not the post   */   - t = nrinsert(t, newtnode((void *)term), termcmp, - term_match_handler); - i_ += len + 1; - p_txt += len + 1; + if (term->status == 2) + freeterm(term, freepost); + else if (term->status == 1) + freeterm(term, NULL);   }   }   + freenode(p, freeTDoc);   n_doc++; - 
freenode(p, freeTDoc);   }   }   - applyinorder(t, printtree, "%s:%d "); + applyinorder(tree, printtree, "%s:%d ");   - /* for (i = 0; i < n_doc; i++) */ - /* printf("%s\n", docid + i * col); */ -   return 0;  }
Change 1 of 1 Show Entire File parser.c Stacked
 
22
23
24
25
 
26
27
28
 
22
23
24
 
25
26
27
28
@@ -22,7 +22,7 @@
  unsigned lowmem, bytesread;   uint32_t crc, n_term;   int n, ntxtbuf; - char *septxt = " ;,.:`'\"?!(){}[]<>~^&*_-=#$%@|\\/"; + char *septxt = " ;,.:`'\"?!(){}[]<>~^&*_-+=#$%@|\\/";   char *sepid = " <>";   /* Log *log; */  
Change 1 of 2 Show Entire File term.c Stacked
 
15
16
17
18
19
20
 
 
 
 
21
22
23
 
32
33
34
35
36
 
 
37
 
 
38
39
40
41
42
 
 
 
43
44
45
46
 
47
48
49
 
50
51
52
53
 
54
55
56
 
15
16
17
 
 
 
18
19
20
21
22
23
24
 
33
34
35
 
 
36
37
38
39
40
41
42
43
 
 
44
45
46
47
48
49
 
50
51
52
 
53
54
55
56
 
57
58
59
60
@@ -15,9 +15,10 @@
  t->s[len] = '\0';     t->df = 1; - t->list = newnode(p); - /* t->list = NULL; */ - /* t->list = addfront(t->list, newpost((void *)));; */ + t->list = newnode(p); + + t->status = 0; +   return t;  }   @@ -32,25 +33,28 @@
    t = (Term *)term;   nt = (Term *)newterm; - - if (postcmp(nt->list->data, t->list->data) == 0) + + if (postcmp(nt->list->data, t->list->data) == 0) {   ((Post *)t->list->data)->tf++; + nt->status = 2; + }   else {   t->list = addfront(t->list, nt->list);   t->df++; - } - + nt->status = 1; + } +   return 0;  }   -void freeterm(void *t_) +void freeterm(void *term, void (*freepost)(void*))  {   Term *t; - t = (Term *)t_; + t = (Term *)term;   if (t != NULL) {   if (t->s != NULL)   free(t->s); - if (t->list != NULL) + if (freepost != NULL && t->list != NULL)   freelist(t->list, freepost);   free(t);   }
Change 1 of 1 Show Entire File term.h Stacked
 
4
5
6
 
7
8
9
10
11
12
 
13
14
15
 
4
5
6
7
8
9
10
11
12
 
13
14
15
16
@@ -4,12 +4,13 @@
  char *s;   unsigned df;   Node *list; + unsigned status;  };    Term *newterm(char*, unsigned, Post*);  int termcmp(void*, void*);  int term_match_handler(void*, void*); -void freeterm(void*); +void freeterm(void*, void (*fn)(void*));  void printterm(void*, void*);  void printtree(void*, void*);  unsigned int hashterm(unsigned (*fn)(char*, int), void*, int);
Change 1 of 2 Show Entire File tfile.c Stacked
 
188
189
190
191
 
192
193
194
 
281
282
283
284
 
285
286
287
288
 
 
289
290
 
291
292
293
294
 
 
295
296
 
297
298
299
300
 
301
302
303
304
305
 
188
189
190
 
191
192
193
194
 
281
282
283
 
284
285
286
 
 
287
288
289
 
290
291
292
 
 
293
294
295
 
296
297
298
299
 
300
301
 
302
303
304
@@ -188,7 +188,7 @@
    /* fill the TDoc structures */   for (int i = 0; i < tfile->h->n; i++) { - +   tdoc = newTDoc(TREC);     /* fill the TDoc->TSubHeader */ @@ -281,25 +281,24 @@
  _printTSubHeader(tdoc->h);     /* print the doc ID */ - /* +   tdoc->id[tdoc->h->n_id] = '\0';   printf("tdoc->id: "); - printf("%s\n", tdoc->id); - */ + printf("%s.\n", tdoc->id); +   /* print the tokenized doc text */ - /* +   tdoc->txt[tdoc->h->n_txt] = '\0';   printf("tdoc->txt: "); - printf("%s\n", tdoc->txt); - */ + printf("%s.\n", tdoc->txt); +   /* print the resource blocks */ - /* +   for (int i = 0; i < tdoc->h->r; i++) {   tdoc->rsrc[i][tdoc->h->n_rsrc[i]] = '\0';   printf("tdoc->rsrc[%d]: ", i); - printf("%s\n", tdoc->rsrc[i]); + printf("%s.\n", tdoc->rsrc[i]);   } - */  }    void printTFile(TFile *tfile)