编程珠玑 15.1 单词

  • Post author:
  • Post category:其他


编程珠玑单词中,需要统计《圣经》一书中,出现频率最高的单词,这是典型的TOP K问题,采用经典的Hash + 堆方法解决,代码如下:

#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <assert.h>
#include <stdlib.h>
#include <ctime>
#define HASHLEN 101
#define WORDLEN 30
#define MAX  100000
#define DOMAIN 300
#define K 10

//Hash链表的节点定义
typedef struct node
{
	char *word;
	int count;
	struct node *next;
} node, *node_ptr;
static node_ptr head[HASHLEN];
static node array[K];

//Hash函数
int hash_function(char *p)
{
	unsigned int value = 0;
	while (*p != '\0')
	{
		value = value * 31 + *p++;
		if (value > HASHLEN)
			value = value % HASHLEN;
	}
	return value;
}

//加入节点到HASH链表
void append_word(char *str)
{
	int index = hash_function(str);
	node_ptr p = head[index];
	while (p != NULL)
	{
		if (strcmp(str, p->word) == 0)
		{
			(p->count)++;
			return;
		}
		p = p->next;
	}
	// 新建一个结点
	node_ptr q = (node_ptr) malloc(sizeof(node));
	q->count = 1;
	q->word = (char *) malloc(sizeof(str) + 1);
	strcpy(q->word, str);

	//insert into the list head
	q->next = head[index];
	head[index] = q;
}

//产生0~DOMAIN - 1范围内的MAX个整数
void gen_data()
{
	FILE *fp = fopen("c://data1.txt", "w");
	assert(fp);
	int i = 0;
	srand((int) (time(0)));
	for (i = 0; i < MAX; i++)
		fprintf(fp, "%d  ", rand() % DOMAIN);
	fclose(fp);
}

//堆调整:调整为最小堆
void heapAdjust(node array[], int beginIndex, int endIndex, int index)
{
	int length = endIndex - beginIndex + 1;
	int largestIndex = index;
	int leftIndex = 2 * index + 1; //下标从0开始,可以自己做实验
	int rightIndex = 2 * index + 2;
	if (leftIndex <= length - 1
			&& array[leftIndex].count <= array[largestIndex].count)
	{
		largestIndex = leftIndex;
	}
	if (rightIndex <= length - 1
			&& array[rightIndex].count <= array[largestIndex].count)
	{
		largestIndex = rightIndex;
	}
	if (largestIndex != index)
	{
		node temp = array[largestIndex];
		array[largestIndex] = array[index];
		array[index] = temp;
		heapAdjust(array, beginIndex, endIndex, largestIndex);
	}
}

//建堆
void heapBuild(node array[], int len)
{
	int i = 0;
	for (i = len / 2 - 1; i >= 0; i--)
	{
		heapAdjust(array, 0, len - 1, i);
	}

	return ;
}

int main()
{
	gen_data();
	char str[WORDLEN];
	int i;
	int cnt1 = 0;

	//init
	for (i = 0; i < HASHLEN; i++)
		head[i] = NULL;

	FILE *fp_passage = fopen("c://data1.txt", "r");
	assert(fp_passage);
	while (fscanf(fp_passage, "%s", str) != EOF)
	{
		cnt1++;
		append_word(str);
	}
	printf("the cnt1 is %d\n", cnt1);
	fclose(fp_passage);

	//寻找Top K
	for (i = 0; i < HASHLEN; i++)
	{
		if (i < K - 1)
			array[i] = *head[i];
		else if (i == K - 1)
		{
			array[i] = *head[i];
			heapBuild(array, K);
		}
		else
		{
			if (array[0].count < head[i]->count)
			{
				array[0] = *head[i];
				heapAdjust(array, 0, K - 1, 0);
			}
		}
	}

	printf("the top %d is as follows\n", K);
	for (i = 0; i < K; i++)
		printf("%s , and its count is %d\n", array[i].word, array[i].count);
	//	printf("the total number of word is %d",cnt);
	return 0;
}

代码中核心代码有:

1,哈希散列函数

2,调整堆函数

关键数据结构:

适应hash的结构体,包含字符串,统计信息,指针



版权声明:本文为lxmky原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。