From ffd46fd134188c423054645fc0a73af8223f9b29 Mon Sep 17 00:00:00 2001 From: tmikolov Date: Mon, 8 Sep 2014 20:14:01 +0000 Subject: [PATCH] fixed minor bugs --- compute-accuracy.c | 8 +++++++- distance.c | 8 +++++++- word-analogy.c | 8 +++++++- word2vec.c | 4 ++-- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/compute-accuracy.c b/compute-accuracy.c index d83fcbb..61c31ac 100644 --- a/compute-accuracy.c +++ b/compute-accuracy.c @@ -53,7 +53,13 @@ int main(int argc, char **argv) return -1; } for (b = 0; b < words; b++) { - fscanf(f, "%s%c", &vocab[b * max_w], &ch); + a = 0; + while (1) { + vocab[b * max_w + a] = fgetc(f); + if (feof(f) || (vocab[b * max_w + a] == ' ')) break; + if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; + } + vocab[b * max_w + a] = 0; for (a = 0; a < max_w; a++) vocab[b * max_w + a] = toupper(vocab[b * max_w + a]); for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); len = 0; diff --git a/distance.c b/distance.c index fbeb24a..731999f 100644 --- a/distance.c +++ b/distance.c @@ -51,7 +51,13 @@ int main(int argc, char **argv) { return -1; } for (b = 0; b < words; b++) { - fscanf(f, "%s%c", &vocab[b * max_w], &ch); + a = 0; + while (1) { + vocab[b * max_w + a] = fgetc(f); + if (feof(f) || (vocab[b * max_w + a] == ' ')) break; + if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; + } + vocab[b * max_w + a] = 0; for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); len = 0; for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; diff --git a/word-analogy.c b/word-analogy.c index ea91ab1..8c486f6 100644 --- a/word-analogy.c +++ b/word-analogy.c @@ -50,7 +50,13 @@ int main(int argc, char **argv) { return -1; } for (b = 0; b < words; b++) { - fscanf(f, "%s%c", &vocab[b * max_w], &ch); + a = 0; + while (1) { + vocab[b * max_w + a] = fgetc(f); + if (feof(f) || (vocab[b * max_w + a] == ' ')) break; + if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; + } + vocab[b * max_w + a] = 0; for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); len = 0; for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; diff --git a/word2vec.c b/word2vec.c index 6763cfa..b9d3dc6 100644 --- a/word2vec.c +++ b/word2vec.c @@ -152,9 +152,9 @@ void SortVocab() { train_words = 0; for (a = 0; a < size; a++) { // Words occuring less than min_count times will be discarded from the vocab - if (vocab[a].cn < min_count) { + if ((vocab[a].cn < min_count) && (a != 0)) { vocab_size--; - free(vocab[vocab_size].word); + free(vocab[a].word); } else { // Hash will be re-computed, as after the sorting it is not actual hash=GetWordHash(vocab[a].word);