Repositories » TXT0
Clone URL:  
Pushed to one repository · View In Graph Contained in tip

Hash tables introduced to hold doc IDs and terms.

Term was modified to have a reference count 'ref' in stead of the
'status' variable. The idea is to change a ref to 0 when the payload
is not needed and then call free() on it by checking if ref is zero.

Term's termcmp() was modified to compare memory addresses for
equality.

Post uses a reference count variable too.

Post now stores the docid string in stead of a unsigned integer. Since
ID strings reside in only one place, Posts point to memory and do not
carry around copies or use a level of indirection to refer to a
document.

Post's postcmp compares memory addresses for equality in stead of
calling strcmp() on the strings.

Since deallocation needed on Term and Post turns out to be shallow
only, their deallocation routines were simplified. At this point none
of them are in use; free() is good enough.

ii.c now uses a the khash.h interface to make use of two hash tables
to store the doc ID strings and terms. Pointers from the nodes in the
BST point to this data replacing the need to keep copies of strings in
the trees.

tfile.c had mallocs() allocating wrong amounts of memory as valgrind
pointed out. These were fixed.

Makefile uses the -o0 in CFLAGS so that valgrind may be used on the
output.

Changeset 9f7191bdc9a6

Parent 089568eb333e

by Rup Palchowdhury

Changes to 7 files · Browse files at 9f7191bdc9a6 Showing diff from parent 089568eb333e Diff from another changeset...

Change 1 of 2 Show Entire File Makefile Stacked
 
3
4
5
6
 
7
8
9
 
57
58
59
60
 
61
62
63
 
3
4
5
 
6
7
8
9
 
57
58
59
 
60
61
62
63
@@ -3,7 +3,7 @@
 TEST=test    CC = gcc -CFLAGS = -g -Wall +CFLAGS = -g -Wall -O0  INCLUDES = -I/usr/local/include  LDFLAGS = -L/usr/local/lib  LIBS = -lk @@ -57,7 +57,7 @@
   .PHONY: clean  clean: - rm -f $(O)/* $(DEPDIR)/* raw2t t2mem + rm -f $(O)/*.o $(DEPDIR)/*.d raw2t t2mem    .PHONY: quicktest  quicktest:
Change 1 of 3 Show Entire File ii.c Stacked
 
5
6
7
 
8
9
10
11
12
 
 
 
13
14
15
16
17
18
19
20
21
22
 
 
 
 
23
24
25
 
 
 
 
 
 
26
27
28
 
30
31
32
33
 
34
35
36
 
38
39
40
41
 
 
 
42
43
44
45
 
 
 
 
 
 
46
47
48
49
50
51
 
 
 
 
 
52
53
54
 
 
 
 
 
55
56
57
58
59
 
60
61
 
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
 
 
 
 
 
 
 
77
78
79
80
81
82
 
83
84
 
 
 
 
 
 
 
 
85
86
87
 
5
6
7
8
9
10
11
12
13
14
15
16
17
18
 
19
20
21
22
23
24
 
25
26
27
28
29
 
 
30
31
32
33
34
35
36
37
38
 
40
41
42
 
43
44
45
46
 
48
49
50
 
51
52
53
54
55
56
57
58
59
60
61
62
63
64
 
 
 
 
 
65
66
67
68
69
70
 
 
71
72
73
74
75
76
 
 
 
 
77
78
 
79
80
81
82
83
84
85
 
 
 
 
 
 
 
 
 
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
 
101
102
103
104
105
106
107
108
109
110
111
@@ -5,24 +5,34 @@
 #include <kak/klog.h>  #include <kak/klist.h>  #include <kak/ktree.h> +#include <kak/khash.h>  #include "post.h"  #include "term.h"  #include "tokenizer.h"  #include "tfile.h"   +/* enum {NTERMS = 100003, NDOCS = 100003}; */ +enum {NTERMS = 10007, NDOCS = 4093}; +  int main(void)  { - /* TODO: store docids */   int i_;   TDoc *tdoc;   TNode *tree;   Post *post;   Term *term;   unsigned n_doc; - char *p_txt; + char *p_txt, *ts; + THeader *h; + Node *idtab[NDOCS], *termtab[NTERMS]; + Node *npid, *npt;   - THeader *h; - + + for (int i = 0; i < NDOCS; i++) + idtab[i] = NULL; + for (int i = 0; i < NTERMS; i++) + termtab[i] = NULL; +   tree = NULL;     h = _newTHeader(TREC); @@ -30,7 +40,7 @@
  if ((h = readTHeader(h, stdin)) == NULL)   exit(0);   - for(n_doc = 0; n_doc < h->n; n_doc++) { + for(n_doc = 1; n_doc <= h->n; n_doc++) {     fprintf(stderr, "%d/%d\r", n_doc, h->n);   @@ -38,50 +48,64 @@
    if ((tdoc = readTDoc(tdoc, stdin)) == NULL)   exit(0); - + + tdoc->id[tdoc->h->n_id] = '\0'; + npid = hlookup(idtab, newnode(strdup(tdoc->id)), 1, hash, NDOCS, strcmp);   i_ = 0;   p_txt = tdoc->txt;     for (int i = 0; i < tdoc->h->n_txt; i++) { + + if (tdoc->txt[i] == ' ') { + + ts = (char *)malloc(i - i_ + 1); + memcpy(ts, p_txt, i - i_); + ts[i - i_] = '\0';   - if (tdoc->txt[i] == ' ') { - - post = newpost(n_doc, 1); - term = newterm(p_txt, i - i_, post); - + npt = hlookup(termtab, newnode(ts), 1, hash, NTERMS, strcmp); + + post = newpost((char *)(npid->data), 1); + term = newterm((char *)(npt->data), 1, post); +   /* if term exists - if doc exists at front of list - post->tf++ + if post exists at front of list + post->tf++ + else + add new post at front of list + term->df++   else - add new doc at front of list - term->df++ - else - attach node with term to tree + add term to tree   */ - +   tree = nrinsert(tree, newtnode((void *)term), termcmp,   term_match_handler);     i_ = i + 1;   p_txt = &(tdoc->txt[i + 1]);   - /* - 2 = both term and post matched - 1 = term matched not the post - */ - - if (term->status == 2) - freeterm(term, freepost); - else if (term->status == 1) - freeterm(term, NULL); + if (post->ref == 0) { + free(post); + free(term->plist); + } + if (term->ref == 0) { + free(term); + }   }   }     freeTDoc(tdoc);   }   + fprintf(stderr, "%d/%d\n", n_doc - 1, h->n);   _freeTHeader(h); - applyinorder(tree, printtree, "%s:%d "); + + /* applyinorder(tree, printtree, "%s:%d "); */ + + /* fprintf(stderr, "\n"); */ + /* fprintf(stderr, "idtab[]\n"); */ + /* hstats(idtab, NDOCS); */ + /* fprintf(stderr, "termtab[]\n"); */ + /* hstats(termtab, NTERMS); */     return 0;  }
Change 1 of 2 Show Entire File post.c Stacked
 
3
4
5
6
 
7
8
9
10
11
12
 
 
 
 
 
 
13
14
15
 
16
17
 
 
 
18
19
20
 
23
24
25
26
27
28
29
30
31
 
3
4
5
 
6
7
 
 
 
 
 
8
9
10
11
12
13
14
15
 
16
17
 
18
19
20
21
22
23
 
26
27
28
 
 
 
 
 
 
@@ -3,18 +3,21 @@
 #include <string.h>  #include "post.h"   -Post *newpost(unsigned id, unsigned tf) +Post *newpost(char *id, unsigned tf)  { - Post *t; - t = (Post *)malloc(sizeof(Post)); - t->id = id; - t->tf = tf; - return t; + Post *p; + p = (Post *)malloc(sizeof(Post)); + p->id = id; + p->tf = tf; + p->ref = 1; + return p;  }   -int postcmp(void *t, void *t1) +int postcmp(void *d1, void *d2)  { - return ((Post *)t)->id - ((Post *)t1)->id; + if (((Post *)d1)->id == ((Post *)d2)->id) + return 0; + return 1;  }    void printpost(void *d, void *arg) @@ -23,9 +26,3 @@
  fmt = (char *)arg;   printf(fmt, ((Post *)d)->id, ((Post *)d)->tf);  } - -void freepost(void *t) -{ - if (t != NULL) - free(t); -}
Change 1 of 1 Show Entire File post.h Stacked
 
1
2
3
4
 
5
 
6
7
8
 
9
10
11
 
1
2
3
 
4
5
6
7
8
 
9
10
11
 
@@ -1,11 +1,11 @@
 typedef struct Post Post;    struct Post { - unsigned id; + char* id;   unsigned tf; + int ref;  };   -Post *newpost(unsigned, unsigned); +Post *newpost(char*, unsigned);  int postcmp(void*, void*);  void printpost(void*, void*); -void freepost(void*);
Change 1 of 2 Show Entire File term.c Stacked
 
5
6
7
8
 
9
10
11
12
13
14
15
16
17
18
19
20
21
 
 
 
 
22
23
24
25
 
26
27
 
 
 
28
29
30
 
31
32
 
 
33
34
35
36
37
38
39
 
 
 
 
 
 
 
 
 
 
 
40
41
42
 
43
44
45
46
47
48
49
50
 
51
52
53
54
55
56
57
58
59
60
 
 
 
 
61
62
63
 
72
73
74
75
 
76
77
78
 
5
6
7
 
8
9
10
11
 
 
 
 
 
 
 
 
 
 
12
13
14
15
16
17
18
 
19
20
 
21
22
23
24
25
 
26
27
 
28
29
30
 
 
 
 
 
 
31
32
33
34
35
36
37
38
39
40
41
42
43
 
44
45
 
46
47
48
49
50
 
51
52
 
 
 
 
 
 
 
 
 
53
54
55
56
57
58
59
 
68
69
70
 
71
72
73
74
@@ -5,59 +5,55 @@
 #include "post.h"  #include "term.h"   -Term *newterm(char *s, unsigned len, Post *p) +Term *newterm(char *s, unsigned df, Post *post)  {   Term *t;   t = (Term *)malloc(sizeof(Term)); - t->s = (char *)malloc(len + 1); - - memcpy(t->s, s, len); - t->s[len] = '\0'; - - t->df = 1; - t->list = newnode(p); - - t->status = 0; - + t->s = s; + t->df = df; + t->plist = newnode(post); + t->ref = 1;   return t;  }   -int termcmp(void *t, void *t1) +int termcmp(void *d1, void *d2)  { - return strcmp(((Term *)t)->s, ((Term *)t1)->s); + if (((Term *)d1)->s == ((Term *)d2)->s) + return 0; + return strcmp(((Term *)d1)->s, ((Term *)d2)->s);  }   -int term_match_handler(void *term, void *newterm) +int term_match_handler(void *d1, void *d2)  { - Term *t, *nt; + Term *t, *newt; + Post *p, *newp;   - t = (Term *)term; - nt = (Term *)newterm; - - if (postcmp(nt->list->data, t->list->data) == 0) { - ((Post *)t->list->data)->tf++; - nt->status = 2; + newt = (Term *)d1; + t = (Term *)d2; + + newt->ref = 0; + + newp = (Post *)(newt->plist->data); + p = (Post *)(t->plist->data); + + if (postcmp(newp, p) == 0) { + p->tf++; + newp->ref = 0;   }   else { - t->list = addfront(t->list, nt->list); + t->plist = addfront(t->plist, newt->plist);   t->df++; - nt->status = 1;   }     return 0;  }   -void freeterm(void *term, void (*freepost)(void*)) +void freeterm(void *term)  { - Term *t; - t = (Term *)term; - if (t != NULL) { - if (t->s != NULL) - free(t->s); - if (freepost != NULL && t->list != NULL) - freelist(t->list, freepost); - free(t); - } + if (term == NULL) + return; + free(((Term *)term)->s); + free(term);  }    void printterm(void *d, void *arg) @@ -72,7 +68,7 @@
  char *fmt;   fmt = (char *)arg;   printf(fmt, ((Term *)d)->s, ((Term *)d)->df); - apply(((Term *)d)->list, printpost, "%d:%d "); + apply(((Term *)d)->plist, printpost, "%s:%d ");   printf("\n");  }  
Change 1 of 1 Show Entire File term.h Stacked
 
3
4
5
6
7
 
 
8
9
10
11
12
13
 
14
15
16
 
3
4
5
 
 
6
7
8
9
10
11
12
 
13
14
15
16
@@ -3,14 +3,14 @@
 struct Term {   char *s;   unsigned df; - Node *list; - unsigned status; + Node *plist; + int ref;  };    Term *newterm(char*, unsigned, Post*);  int termcmp(void*, void*);  int term_match_handler(void*, void*); -void freeterm(void*, void (*fn)(void*)); +void freeterm(void*);  void printterm(void*, void*);  void printtree(void*, void*);  unsigned int hashterm(unsigned (*fn)(char*, int), void*, int);
Change 1 of 2 Show Entire File tfile.c Stacked
 
47
48
49
50
51
 
 
52
53
 
54
55
 
56
57
58
 
168
169
170
171
 
172
173
174
 
47
48
49
 
 
50
51
52
 
53
54
 
55
56
57
58
 
168
169
170
 
171
172
173
174
@@ -47,12 +47,12 @@
  TDoc *tdoc;   tdoc = (TDoc *)malloc(sizeof(TDoc));   tdoc->h = _newTSubHeader(type); - tdoc->txt = (char *)malloc(TXTBUFSIZE); - tdoc->id = (char *)malloc(IDBUFSIZE); + tdoc->txt = (char *)malloc(sizeof(char) * TXTBUFSIZE); + tdoc->id = (char *)malloc(sizeof(char) * IDBUFSIZE);   tdoc->rsrc = NULL; - tdoc->rsrc = (char **)malloc(tdoc->h->r); + tdoc->rsrc = (char **)malloc(sizeof(char *) * tdoc->h->r);   for (int i = 0; i < tdoc->h->r; i++) - tdoc->rsrc[i] = (char *)malloc(RSRCBUFSIZE); + tdoc->rsrc[i] = (char *)malloc(sizeof(char) * RSRCBUFSIZE);   return tdoc;  }   @@ -168,7 +168,7 @@
   THeader *readTHeader(THeader *h, FILE *fp)  { - uint16_t v, check; + uint16_t v;   int n;     /* fill the THeader */