C/some stiches..
Expert: Zlatko - 5/20/2010
QUESTION: Hello Zlatko,
now, when i tested the full code with this file
===============================infile.txt=============================================
aaa bbb aaa cc bb bb aaa bbb bb ccc cc cc ccc aa aaa aaa aaa aaa aaa aaa aaa aa aa aa aaa aaa aa aaa aa aaa aa
aaa aaa bb bb cc bbb bb cc cc ccc cc ccc ccc cc cc ccc aaa ccc ccc ccc bbb bb cccc ccc cc ccc ccc ccc bbb bb bbb bbb
bbb bb bbb bb
==================================================================================
i dont get the exact counting..what could be the reason
============================================================================================
#include<stdio.h>
#include<conio.h>
#include<string.h>
#include<errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include<stdlib.h>
void reverseString(char* mem, int len);
void sort_word(char *a,int n);
int cstring_cmp(const void *a, const void *b);//the comparison function
int main()
{
// FILE* fp = fopen("infile.txt", "w");
/* get the file size */
int ix= 0;
int counter=0;
int perc_count;
int temp_count = 0;
int t;
struct _stat statbuf;
_stat("infile.txt", &statbuf);
int length = statbuf.st_size;
printf("%d", length);
puts("
");
/* allocate memory */
char* mem = (char*)malloc(length+1);//extra 1 space for the NULL character
/* Read the file in */
FILE* fp = fopen("infile.txt", "r");
/* reassign length. Because of the way windows encodes files,
the number of characters
read will be less than the file size */
length = fread(mem, 1, length, fp);
mem[length] = 0;
char** words = (char**)malloc(sizeof(char*) * (length + 1));
char *p = strtok(mem, " ");
while (p != NULL)
{
words[ix++] = p;
counter++;
p = strtok(NULL, " ");
}
printf("%d",counter);
fclose(fp);
puts("
");
printf("Original
");
for(t = 0; t<counter; t++)
printf("%s
", words[t]);
puts("
");
qsort(words, counter, sizeof(char*), cstring_cmp);
printf("Sorted
");
for(t = 0; t<counter; t++)
printf("%s
", words[t]);
puts("
");
//write the memory to the output with results
fp = fopen("outfile.txt", "w");
for(t = 0; t<counter-1; t++)
{
//fprintf(fp, "%s
", words[t]);
if(!(strcmp(words[t], words[t+1]))){
temp_count++;
}
else
{
perc_count =(int)(temp_count/counter)*100;
//fprintf(fp, "%s occurs %d times
", words[t], temp_count);
// fputs("
",fp );
//fprintf(fp,"pecentage is : %d
", perc_count);
printf("%s occurs %d times
", words[t], temp_count);
printf("pecentage is : %d
", perc_count);
}
}
fclose(fp);
/* free the memories */
free(mem);
free(words);
getche();
}
//the comparison function
/*
 * qsort comparison callback for an array of C strings.
 *
 * a and b each point at one array element, i.e. at a pointer to the first
 * character of a string. The two strings are compared with strcmp, whose
 * negative / zero / positive result is returned unchanged.
 */
int cstring_cmp(const void *a, const void *b)
{
    const char *lhs = *(const char **)a;
    const char *rhs = *(const char **)b;

    return strcmp(lhs, rhs);
}
==================================================================================================
kiitos avusta ystavani.. meaning thanks for your help dear friend....
ANSWER: Oh, that's a good bug.
in the counter loop, I put in this print statement to see what was being compared
printf("Compare <%s> <%s>\n", words[t], words[t+1]);
What I got was this
Compare <aa> <aa>
Compare <aa> <aa>
Compare <aa> <aa>
Compare <aa> <aa>
Compare <aa> <aa>
Compare <aa> <aa
aaa>
Notice the newline between aa and aaa
That is actually one word.
Since you have newlines in your file, you need to break words apart based on spaces and newlines. You need to change your strtok calls to this:
strtok(mem, " \n");
and
strtok(NULL, " \n");
the second parameter in strtok is a string of all the characters that break up mem.
The second problem is one of counting.
temp_count has to start at 1, and be reset to 1 whenever the word changes.
Imagine this
a
a
a
a
b
a1 matches a2, incrementing temp_count
a2 matches a3, incrementing temp_count
a3 matches a4, incrementing temp_count
so we have 3 increments, but 4 'a' characters. We forgot to count either the first or the last one, however you want to look at it.
Finally, the problem with perc_count is that you are doing integer division.
temp_count < counter, so temp_count / counter is less than 1, but you are doing integer division, so the result is 0.
You need to cast one of the values as float to do floating point division, or simply multiply temp_count by 100.0, instead of 100 before the division. The dot 0 makes the numerator a floating point value.
Here is the counter loop, almost fixed. I made perc_count a float so that you get better accuracy. I added perc_total as a way of checking the algorithm. It should add up to 100.
/* Walk the sorted words[] array and, each time the word changes, print how
   often the word that just ended occurred and what share of all `counter`
   words that is. */
/* Start at 1: the first word of a run is itself one occurrence. */
temp_count = 1;
/* Running sum of the printed percentages, used to check the algorithm. */
float perc_total = 0;
/* Stop at counter-2 so that words[t+1] stays inside the array. */
for(t = 0; t<counter-1; t++)
{
/* Debug trace of the pair being compared. */
printf("%d: compare <%s> <%s>\n", t, words[t], words[t+1]);
if(!(strcmp(words[t], words[t+1]))){
/* Same word again: extend the current run. */
temp_count++;
}
else
{
/* The word changes after words[t]: report the run that just ended.
   100.0 forces floating-point division.
   Assumes perc_count is declared float, as the text says. */
perc_count = temp_count*100.0/counter;
printf("%s occurs %d times\n", words[t], temp_count);
printf("pecentage is : %.2f\n", perc_count);
perc_total+=perc_count;
/* Begin counting the next run. */
temp_count = 1;
}
}
printf("Total percentage %f\n", perc_total);
Notice that the total percentage does not add to 100 for the file you gave me.
Look carefully at the printouts and determine why.
---------- FOLLOW-UP ----------
QUESTION: Hello zlatko,
i guess it did not add to 100 because of the number of occurrences of the words
in the file.. mayb . also, could you teach me more on the comparison function
used in qsort? i get confused over how to write the function..because of the
pointer conversion done.
ANSWER: It is important that you get the percentage total to add to 100 (or 99.9999 with round-off error). It is actually more important as a programmer that you solve that problem than it is to understand qsort. But I'll explain qsort too.
Remember the loop
for(t = 0; t<counter-1; t++)
was set to go to counter-2, instead of counter-1 because of the strcmp with words[t+1].
There is actually nothing wrong with running the loop to counter-1, as long as the comparison is not done, because on the last word, there is no next word to compare to.
Keep that in mind.
Try to find the pattern of what makes the percentage not add to 100. Is it the even or odd number of elements in the file? Is it wrong when there are single instances of a word? Is it wrong when there are multiple instances of each word? Try a shorter file that shows the problem and step through the program with a debugger.
The qsort is confusing. I agree.
The qsort comparison function gets pointers to the elements being compared.
If your array is one of integers, the comparison function will get pointers to each integer, so you have to treat them as such.
/*
 * qsort comparison callback for an array of int.
 *
 * a, b: pointers to the two array elements being compared.
 * Returns -1 if *a < *b, 1 if *a > *b, and 0 if they are equal.
 * The parameters are const void* so the function matches the comparator
 * type that qsort expects.
 */
int cmpInt(const void* a, const void* b)
{
    const int* pa = (const int*)a;
    const int* pb = (const int*)b;
    // Compare the 2 integers. To compare them, we must de-reference the pointers to them.
    // De-referencing means seeing what they point to.
    if (*pa < *pb) return -1;
    if (*pa > *pb) return 1;
    return 0;
}
If your array is one of characters, the idea is similar. The array has characters, the function gets pointers to the characters.
/*
 * qsort comparison callback for an array of char.
 *
 * a, b: pointers to the two characters being compared (single characters,
 * not strings).
 * Returns -1 if *a < *b, 1 if *a > *b, and 0 if they are equal.
 * The parameters are const void* so the function matches the comparator
 * type that qsort expects.
 */
int cmpChar(const void* a, const void* b)
{
    const char* pa = (const char*)a;
    const char* pb = (const char*)b;
    // compare the 2 characters. We are not comparing strings here, just 2 characters
    // To compare them, we must de-reference the pointers to them.
    if (*pa < *pb) return -1;
    if (*pa > *pb) return 1;
    return 0;
}
C strings are a little more complicated. Your array has pointers to C strings, meaning pointers to char*. So the comparison function gets pointers to the pointers.
/*
 * qsort comparison callback for an array of C strings (array of char*).
 *
 * a, b: pointers to two array elements, i.e. pointers to char*.
 * Returns the strcmp result for the two strings: negative, zero or positive.
 * The parameters are const void* so the function matches the comparator
 * type that qsort expects.
 */
int cmpCstring(const void* a, const void* b)
{
    char* const* pa = (char* const*)a;
    char* const* pb = (char* const*)b;
    // compare the 2 strings
    // To do so you have to de-reference the pointer to the pointer to the string
    // Dereferencing gives you pointer to the first character of the string, which is what strcmp takes.
    return (strcmp(*pa, *pb));
}
If you find all these pointers to pointers confusing, create a new type for yourself so that C strings look more like integers.
typedef char* C_String;
All the typedef does is hide one '*'
/*
 * qsort comparison callback for an array of C_String (typedef char*).
 *
 * a, b: pointers to two array elements, i.e. pointers to C_String.
 * Returns the strcmp result for the two strings: negative, zero or positive.
 * The parameters are const void* so the function matches the comparator
 * type that qsort expects.
 */
int cmpCString(const void* a, const void* b)
{
    const C_String* pa = (const C_String*)a;
    const C_String* pb = (const C_String*)b;
    return strcmp(*pa, *pb);
}
Here is your program with the C_String typedef
#include<stdio.h>
#include<conio.h>
#include<string.h>
#include<errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include<stdlib.h>
typedef char* C_String;
void reverseString(C_String mem, int len);
int cstring_cmp(const void *a, const void *b);//the comparison function
/*
 * Word-frequency demo written with the C_String typedef.
 *
 * Reads infile.txt into one heap buffer, splits it on spaces and newlines
 * with strtok, sorts the word pointers with qsort, then prints a count and
 * percentage for each run of equal words, followed by the sum of the
 * printed percentages.
 *
 * Returns 0 on success, or 1 if infile.txt cannot be examined or opened or
 * if memory cannot be allocated.
 */
int main()
{
    int counter = 0;
    float perc_count;
    int temp_count;
    int t;

    /* get the file size */
    struct _stat statbuf;
    if (_stat("infile.txt", &statbuf) != 0)
    {
        perror("infile.txt");
        return 1;
    }
    int length = statbuf.st_size;
    printf("%d\n", length);

    /* Read the file in */
    FILE* fp = fopen("infile.txt", "r");
    if (fp == NULL)
    {
        perror("infile.txt");
        return 1;
    }

    /* allocate memory: one extra byte for the terminating '\0', and one
       word slot per byte as an upper bound on the number of words */
    C_String mem = (C_String)malloc(length+1);
    C_String* words = (C_String*)malloc(sizeof(C_String) * (length + 1));
    if (mem == NULL || words == NULL)
    {
        perror("malloc");
        free(mem);
        free(words);
        fclose(fp);
        return 1;
    }

    /* reassign length. Because of the way windows encodes files,
       the number of characters read will be less than the file size */
    length = fread(mem, 1, length, fp);
    mem[length] = 0;
    fclose(fp);

    /* strtok writes terminators into mem; words[] points into that buffer. */
    C_String p = strtok(mem, " \n");
    while (p != NULL)
    {
        words[counter++] = p;
        p = strtok(NULL, " \n");
    }
    printf("%d\n",counter);

    printf("Original\n");
    for(t = 0; t<counter; t++)
        printf("%s\n", words[t]);
    qsort(words, counter, sizeof(C_String), cstring_cmp);
    printf("Sorted\n");
    for(t = 0; t<counter; t++)
        printf("%s\n", words[t]);

    /* outfile.txt is created (or truncated) but nothing is written to it;
       the results go to stdout. */
    fp = fopen("outfile.txt", "w");
    if (fp == NULL)
        perror("outfile.txt");

    /* Start at 1: the first word of a run is itself one occurrence. */
    temp_count = 1;
    float perc_tot = 0;
    /* NOTE(review): a run is reported only when the following word differs.
       The loop is kept as written because the accompanying text leaves the
       consequence for the percentage total as an exercise. */
    for(t = 0; t<counter-1; t++)
    {
        printf("%d: compare <%s> <%s>\n", t, words[t], words[t+1]);
        if(!(strcmp(words[t], words[t+1]))){
            temp_count++;
        }
        else
        {
            /* 100.0 forces floating-point division. */
            perc_count = temp_count*100.0/counter;
            printf("%s occurs %d times\n", words[t], temp_count);
            printf("pecentage is : %.2f\n", perc_count);
            perc_tot+=perc_count;
            temp_count = 1;
        }
    }
    printf("Total percentage %f\n", perc_tot);
    if (fp != NULL)
        fclose(fp);

    /* free the memories */
    free(mem);
    free(words);
    /* Wait for a key press before exiting. */
    getche();
    return 0;
}
//the comparison function
/*
 * qsort comparison callback for an array of C_String.
 *
 * a and b each point at one array element (a C_String). The two strings
 * are compared with strcmp, whose negative / zero / positive result is
 * returned unchanged.
 */
int cstring_cmp(const void *a, const void *b)
{
    const C_String lhs = *(const C_String*)a;
    const C_String rhs = *(const C_String*)b;

    return strcmp(lhs, rhs);
}