#define STACK_SIZE 100000 #define Top_i (stack[sp-1].i) #define Top_k (stack[sp-1].k) #define Top_df (stack[sp-1].df) struct STACK {int i; int k; int df;} stack[STACK_SIZE]; int sp = 0; /* stack pointer */ void push(int i, int k, int df) { if(sp >= STACK_SIZE){ fprintf(stderr, "stack overflow\n"); exit(2);} stack[sp].i = i; stack[sp].k = k; stack[sp++].df = df;} void pop() {sp--;} void output(int i, int j, int k, int df) { int LBL; if(lcp[i] > lcp[j+1]) LBL = lcp[i]; else LBL = lcp[j+1]; if(i==j) printf("trivial <%d,%d>, tf=1\n", i, j); else if(LBL < lcp[k]) printf("nontrivial <%d, %d>, rep=%d, tf=%d, df=%d\n", i, j, k, j-i+1, df ); } /* * Print_LDIs_with_df does not only print tf, but also df. * It takes the corpus size, N, and the number of documents, D. * doc() returns the document number of the suffix array's index. * dec_df() decrease a df-counter in the stack when duplicate * counting occurs. */ void dec_df(int docid) { int beg=0, end=sp, mid=sp/2; while(beg != mid) { if(doclink[docid] >= stack[mid].i) beg = mid; else end = mid; mid = (beg + end) / 2; } stack[mid].df--; } print_LDIs_with_df(int N, int D) { int i, j, df; doclink = (int *)malloc(sizeof(int) * D); for(i = 0; i < D; i++) doclink[i] = -1; push(0,0,1); for(j = 0; j < N; j++) { output(j,j,0,1); if(doclink[doc(j)] != -1) dec_df(doc(j)); doclink[doc(j)] = j; df = 1; while (lcp[j+1] < lcp[Top_k]) { df = Top_df + df; output(Top_i,j,Top_k,df); pop(); } /* original code */ push(Top_k, j+1, df); /* efficient code */ /* if(lcp[Top_k] == lcp[j+1]) { Top_k = j+1; Top_df += df; } else push(Top_k, j+1, df); */ } }