编程珠玑单词中,需要统计《圣经》一书中,出现频率最高的单词,这是典型的TOP K问题,采用经典的Hash + 堆方法解决,代码如下:
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <assert.h>
#include <stdlib.h>
#include <ctime>
#define HASHLEN 101
#define WORDLEN 30
#define MAX 100000
#define DOMAIN 300
#define K 10
//Hash链表的节点定义
typedef struct node
{
char *word;
int count;
struct node *next;
} node, *node_ptr;
static node_ptr head[HASHLEN];
static node array[K];
//Hash函数
int hash_function(char *p)
{
unsigned int value = 0;
while (*p != '\0')
{
value = value * 31 + *p++;
if (value > HASHLEN)
value = value % HASHLEN;
}
return value;
}
//加入节点到HASH链表
void append_word(char *str)
{
int index = hash_function(str);
node_ptr p = head[index];
while (p != NULL)
{
if (strcmp(str, p->word) == 0)
{
(p->count)++;
return;
}
p = p->next;
}
// 新建一个结点
node_ptr q = (node_ptr) malloc(sizeof(node));
q->count = 1;
q->word = (char *) malloc(sizeof(str) + 1);
strcpy(q->word, str);
//insert into the list head
q->next = head[index];
head[index] = q;
}
//产生0~DOMAIN - 1范围内的MAX个整数
void gen_data()
{
FILE *fp = fopen("c://data1.txt", "w");
assert(fp);
int i = 0;
srand((int) (time(0)));
for (i = 0; i < MAX; i++)
fprintf(fp, "%d ", rand() % DOMAIN);
fclose(fp);
}
//堆调整:调整为最小堆
void heapAdjust(node array[], int beginIndex, int endIndex, int index)
{
int length = endIndex - beginIndex + 1;
int largestIndex = index;
int leftIndex = 2 * index + 1; //下标从0开始,可以自己做实验
int rightIndex = 2 * index + 2;
if (leftIndex <= length - 1
&& array[leftIndex].count <= array[largestIndex].count)
{
largestIndex = leftIndex;
}
if (rightIndex <= length - 1
&& array[rightIndex].count <= array[largestIndex].count)
{
largestIndex = rightIndex;
}
if (largestIndex != index)
{
node temp = array[largestIndex];
array[largestIndex] = array[index];
array[index] = temp;
heapAdjust(array, beginIndex, endIndex, largestIndex);
}
}
//建堆
void heapBuild(node array[], int len)
{
int i = 0;
for (i = len / 2 - 1; i >= 0; i--)
{
heapAdjust(array, 0, len - 1, i);
}
return ;
}
int main()
{
gen_data();
char str[WORDLEN];
int i;
int cnt1 = 0;
//init
for (i = 0; i < HASHLEN; i++)
head[i] = NULL;
FILE *fp_passage = fopen("c://data1.txt", "r");
assert(fp_passage);
while (fscanf(fp_passage, "%s", str) != EOF)
{
cnt1++;
append_word(str);
}
printf("the cnt1 is %d\n", cnt1);
fclose(fp_passage);
//寻找Top K
for (i = 0; i < HASHLEN; i++)
{
if (i < K - 1)
array[i] = *head[i];
else if (i == K - 1)
{
array[i] = *head[i];
heapBuild(array, K);
}
else
{
if (array[0].count < head[i]->count)
{
array[0] = *head[i];
heapAdjust(array, 0, K - 1, 0);
}
}
}
printf("the top %d is as follows\n", K);
for (i = 0; i < K; i++)
printf("%s , and its count is %d\n", array[i].word, array[i].count);
// printf("the total number of word is %d",cnt);
return 0;
}
代码中核心代码有:
1,哈希散列函数
2,调整堆函数
关键数据结构:
适应hash的结构体,包含字符串,统计信息,指针
版权声明:本文为lxmky原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。