您的位置：首页 > 其它

哈夫曼树实现文件压缩与解压缩

2016-06-06 21:23 411 查看

见识了360压缩的神奇后，想要实现自己的文件压缩程序，然后花了近一个星期的时间去完成文件压缩与解压缩，期间有很多坑，花了很长时间去调试它，最后把坑给填了（真心地感受到了程序员写代码时的小小粗心会把自己给坑惨）。以下是写程序时遇到的一些坑：

在 Windows 下换行由 '\r' 和 '\n' 两个字符组成；以文本模式打开文件时，C 运行库（而不是编译器）在读到 "\r\n" 时会把它转换成单个 '\n'，所以压缩和解压缩时都应该用二进制模式（"rb"/"wb"）打开文件。
在解压缩小文件时不会出现的问题在解压缩大文件时会出现。最常见的是没有解压缩完文件就退出了，因为会出现一些控制字符导致程序提前退出。
压缩汉字的时候要使用unsigned char！！！

正如标题所说，实现文件压缩我是使用哈夫曼树产生哈夫曼编码，使用哈夫曼编码来压缩文件。

构造哈夫曼树的key值是文件中每个字符出现的次数。将出现的字符插入一个最小堆中，每次从堆中取出出现次数最少的字符构造哈夫曼树。

为此，我们先实现一个最小堆：

#pragma once
#define _CRT_SECURE_NO_WARNINGS 1
#include<iostream>
#include<vector>
#include<assert.h>
//#include"HaffmanTree.h"
using namespace std;

// Comparator returning true when the left operand is strictly smaller than
// the right one (uses T's operator<). Used as the default ordering of Heap,
// which makes it a min-heap.
template<class T>
struct Less
{
    // const-qualified so the functor can also be invoked through a const
    // object or a const reference.
    bool operator()(const T& l, const T& r) const
    {
        return l < r;
    }
};

// Comparator returning true when the left operand is strictly greater than
// the right one (uses T's operator>). Passing it to Heap yields a max-heap.
template<class T>
struct Greater
{
    // const-qualified so the functor can also be invoked through a const
    // object or a const reference.
    bool operator()(const T& l, const T& r) const
    {
        return l > r;
    }
};

// Partial specialization for pointers: instead of comparing addresses, two
// pointers are ordered by the `_wight` member of the objects they point to.
// The pointee type must therefore expose a `_wight` member that supports
// operator<. Neither pointer may be null.
template<class T>
struct Less<T*>
{
    // const-qualified so the functor can also be invoked through a const
    // object or a const reference.
    bool operator()(const T* Nodel, const T* Noder) const
    {
        return Nodel->_wight < Noder->_wight;
    }
};

// Forward declaration of the default comparator, so the default template
// argument of Heap below names a known template wherever this section is seen.
template<class T>
struct Less;

// Binary heap stored in a std::vector.
//
// T        - element type.
// Continer - comparator type; Continer()(a, b) == true means `a` must be
//            closer to the top than `b`. With the default Less<T> the
//            smallest element is on top (min-heap).
template<class T, class Continer = Less<T>>
class Heap
{
public:
    // Creates an empty heap.
    Heap() {}

    // Builds a heap from a[0..size), skipping every element equal to `invalid`.
    Heap(const T* a, size_t size, const T& invalid);

    // Builds a heap from a copy of `v`.
    Heap(const std::vector<T>& v);

    // Builds a heap by taking over the storage of the temporary `v`.
    Heap(std::vector<T>&& v);

    // Inserts a copy of `x`.
    void Push(const T& x);

    // Removes the top element. The heap must not be empty.
    void Pop();

    // Returns the top element. The heap must not be empty.
    T& GetTop();

    bool Empty() const;
    size_t Size() const;

    // NOTE: declared only; no definition appears in this section, so calling
    // it fails at link time unless a definition is supplied elsewhere.
    void HeapSort(T* a, size_t size);

protected:
    // Turns the arbitrary contents of _a into a valid heap.
    void _BuildHeap();
    // Sinks the element at index `parent` until neither child precedes it.
    void _AdjustDown(size_t parent);
    // Lifts the element at index `child` while it precedes its parent.
    void _AdjustUp(int child);

protected:
    std::vector<T> _a; // children of index i live at 2*i + 1 and 2*i + 2
};

// Default template arguments may only appear on the class template itself,
// so the out-of-class member definitions below do not repeat them.

template<class T, class Continer>
Heap<T, Continer>::Heap(const T* a, size_t size, const T& invalid)
{
    _a.reserve(size);
    for (size_t i = 0; i < size; ++i)
    {
        if (a[i] != invalid)
        {
            _a.push_back(a[i]);
        }
    }
    _BuildHeap();
}

template<class T, class Continer>
Heap<T, Continer>::Heap(const std::vector<T>& v)
    : _a(v)
{
    _BuildHeap();
}

template<class T, class Continer>
Heap<T, Continer>::Heap(std::vector<T>&& v)
{
    _a.swap(v);
    _BuildHeap();
}

template<class T, class Continer>
void Heap<T, Continer>::Push(const T& x)
{
    _a.push_back(x);
    _AdjustUp(static_cast<int>(_a.size() - 1));
}

template<class T, class Continer>
void Heap<T, Continer>::Pop()
{
    assert(!_a.empty());
    using std::swap;
    // Move the top to the back, drop it, then restore the heap from the root.
    swap(_a[0], _a[_a.size() - 1]);
    _a.pop_back();
    _AdjustDown(0);
}

template<class T, class Continer>
T& Heap<T, Continer>::GetTop()
{
    assert(!_a.empty());
    return _a[0];
}

template<class T, class Continer>
bool Heap<T, Continer>::Empty() const
{
    return _a.empty();
}

template<class T, class Continer>
size_t Heap<T, Continer>::Size() const
{
    return _a.size();
}

template<class T, class Continer>
void Heap<T, Continer>::_BuildHeap()
{
    // Indices >= size/2 are leaves and already trivial heaps, so start at the
    // last parent (size/2 - 1) and walk back to the root. Counting down with
    // i > 0 keeps the arithmetic unsigned-safe: with 0 or 1 elements the loop
    // simply does not run, instead of underflowing.
    for (size_t i = _a.size() / 2; i > 0; --i)
    {
        _AdjustDown(i - 1);
    }
}

template<class T, class Continer>
void Heap<T, Continer>::_AdjustDown(size_t parent)
{
    using std::swap;
    Continer _con;
    const size_t size = _a.size();
    size_t child = parent * 2 + 1;
    while (child < size)
    {
        // Pick whichever child should be closer to the top; child + 1 is
        // tested first so an equal pair keeps the left child.
        if (child + 1 < size && _con(_a[child + 1], _a[child]))
        {
            ++child;
        }
        if (!_con(_a[child], _a[parent]))
        {
            break; // parent already precedes both children
        }
        swap(_a[parent], _a[child]);
        parent = child;
        child = parent * 2 + 1;
    }
}

template<class T, class Continer>
void Heap<T, Continer>::_AdjustUp(int child)
{
    using std::swap;
    Continer _con;
    while (child > 0)
    {
        const int parent = (child - 1) / 2;
        if (!_con(_a[child], _a[parent]))
        {
            break; // heap property holds from here up
        }
        swap(_a[child], _a[parent]);
        child = parent;
    }
}

使用类模板实现的小顶堆，方便我们传入哈夫曼结点的结构体，并且实现了比较两个结构体的大小的仿函数。实质是比较_wight值

实现了堆以后实现哈夫曼树：

#pragma once
#include<iostream>
#include"Heap.h"
#include"FileComparess.h"

using namespace std;

// A single node of the Huffman tree: a weight and two child links.
// A freshly constructed node has no children, i.e. it is a leaf.
template<class T>
struct HaffmanNode
{
    HaffmanNode<T>* _left;  // left child, null when absent
    HaffmanNode<T>* _right; // right child, null when absent
    T _wight;               // weight carried by this node

    // Creates a childless node holding a copy of `wight`.
    HaffmanNode(const T& wight)
        : _left(nullptr), _right(nullptr), _wight(wight)
    {
    }
};

template<class T>
class HaffmanTree
{
public:
typedef HaffmanNode<T> Node;
HaffmanTree(const T* a, size_t size, const T& invalid)
{
_root = _CreatHaffmanTree(a, size, invalid);
}
Node* GetRoot()
{
return _root;
}
protected:
Node* _CreatHaffmanTree(const T* a,size_t size, const T& invalid)
{
Heap<Node*, Less<Node*>> minHeap;
for (size_t i = 0; i < size; ++i)
{
if (a[i] != invalid)
{
Node* tmp = new Node(a[i]);
minHeap.Push(tmp);
}
}
while (!minHeap.Empty())
{
Node* left = minHeap.GetTop();
minHeap.Pop();
Node* right = NULL;
if (!minHeap.Empty())
{
right = minHeap.GetTop();
minHeap.Pop();
}
Node* parent = NULL;
if (right)
{
parent = new Node(left->_wight + right->_wight);
}
else
{
parent = new Node(left->_wight);
}
parent->_left = left;
parent->_right = right;
if (minHeap.Empty())
{
return parent;
}
minHeap.Push(parent);
}
return NULL;
}
protected:
Node* _root;
};

可以看到树节点的_wight成员，建立哈夫曼树时就是依据_wight大小来建立的，也就是文件中各个字符出现的次数。

构造哈夫曼树时每次从小顶堆中取出堆顶元素插入到哈夫曼树中，当堆中的元素为空时，构造哈夫曼树完成。

哈夫曼树构造完成开始文件压缩：

#pragma once
#define _CRT_SECURE_NO_WARNINGS 1
#include<iostream>
#include"HaffmanTree.h"
using namespace std;
// Counter type for byte frequencies. NOTE: `long` is 32 bits wide on some
// platforms (e.g. Windows), which limits how many bytes can be counted.
typedef long LongType;

// Statistics kept for one byte value: the byte itself, how many times it
// occurs and its Huffman code written as a string of '0'/'1' characters.
// Comparison and addition look at `_count` only.
struct CharInfo
{
    unsigned char _ch; // the byte value
    LongType _count;   // number of occurrences
    std::string _code; // Huffman code as text

    // Every constructor initializes every member, so a CharInfo never carries
    // an indeterminate `_ch` or `_count`.
    CharInfo(const LongType count = 0)
        : _ch(0)
        , _count(count)
    {}
    CharInfo(const char ch)
        : _ch(static_cast<unsigned char>(ch))
        , _count(0)
    {}

    // Two entries differ when their counts differ.
    bool operator!=(const CharInfo& c) const
    {
        return _count != c._count;
    }
    // Result carries the summed count; `_ch` and `_code` are left at defaults.
    CharInfo operator+(const CharInfo& c) const
    {
        return CharInfo(_count + c._count);
    }
    bool operator<(const CharInfo& c) const
    {
        return _count < c._count;
    }
};

class FileComparess
{
public:
//文件压缩
void Comparess(const char* filename)
{
FILE* fread = fopen(filename, "rb");
if (fread == NULL)
{
cout << "打开待压缩文件失败" << endl;
return;
}
for (int i = 0; i < 256; i++)
{
_info[i]._ch = i;
}
unsigned char ch = fgetc(fread); //不能使用char，压缩汉字时的字符出现范围是0~255
while (!feof(fread)) //统计各字符出现的次数
{
//在windows下回车是'\r\n'的组合，遇到‘\r\n’时屏幕上打印换行
if (ch == '\r')
{
ch = fgetc(fread); //跳过‘\r’
if (ch != '\n')
{
fseek(fread, -1, SEEK_CUR);
}
}
_info[ch]._count++;
ch = fgetc(fread);
}
HaffmanTree<CharInfo> h(_info, 256, CharInfo());
HaffmanNode<CharInfo>* root = h.GetRoot();
string str;
GenerateHaffmanCode(root, str);
//重新打开待压缩文件读
fseek(fread, 0, SEEK_SET);
ch = fgetc(fread);
unsigned char data = 0;   //要写入压缩文件的数据
int bitcount = 7;  //标记移位信息
//打开文件写压缩后的编码
string write(filename);
write = write + ".comparess";
FILE* fwrite = fopen(write.c_str(), "wb");
while (!feof(fread))
{
if (ch == '\r')
{
ch = fgetc(fread);
if (ch != '\n')
{
fseek(fread, -1, SEEK_CUR);
}
}
const char* cur = _info[ch]._code.c_str();
while (*cur)
{
if (bitcount >= 0)
{
data = data | ((*cur - '0') << bitcount);
bitcount--;
}
if (bitcount < 0)
{
fputc(data, fwrite);
bitcount = 7;
data = 0;
}
cur++;
}
ch = fgetc(fread);
}
fputc(data, fwrite);//最后一个字节没写满8位也要把data写入文件（困扰好久）
//写配置文件
WriteConfig(filename);
fclose(fread);
fclose(fwrite);
}

//文件解压缩
void UnComparess(const char* filename)
{
CharInfo HNarry[256];
//读配置文件
ReadConfig(filename, HNarry);
//重建Haffman树
HaffmanTree<CharInfo> h(HNarry, 256, CharInfo());
//遍历树，找叶子结点，写输出文件
HaffmanNode<CharInfo>* root = h.GetRoot();
HaffmanNode<CharInfo>* cur = root;
//打开压缩文件读
string comf(filename);
comf = comf + ".comparess";
FILE* fread = fopen(comf.c_str(), "rb");
unsigned char ch = fgetc(fread);
FILE* fwrite = fopen("output", "wb");
int readcount = root->_wight._count;//根节点的_count值就是整棵树字符出现的次数
while (readcount)
{
unsigned int tmp = 1;
int bit = 7;   //移动的位数
while (bit>=0)
{
if (ch & (tmp << bit))
{
cur = cur->_right;
bit--;
}
else
{
cur = cur->_left;
bit--;
}
//找到叶子结点
if (cur->_left == NULL&&cur->_right == NULL)
{
fputc(cur->_wight._ch, fwrite);
cur = root;
readcount--;
//最后一个字符的编码在最后两个字节当中的情况
if (!readcount)
{
break;
}
}

}
ch = fgetc(fread);
}
fclose(fread);
fclose(fwrite);
}
protected:
//得到Haffman编码（后序遍历HaffmanTree）
void GenerateHaffmanCode(HaffmanNode<CharInfo>* root, string& code)
{
if (root == NULL)
return;
GenerateHaffmanCode(root->_left, code + '0');
GenerateHaffmanCode(root->_right, code + '1');
root->_wight._code = code;
if (root->_left == NULL&&root->_right == NULL)
{
_info[root->_wight._ch]._code = code;
}

}
void WriteConfig(const char* filename)
{
string conf(filename);
conf = conf + "config";
FILE* fcon = fopen(conf.c_str(), "wb");
for (int i = 0; i < 256; ++i)
{

if (_info[i]._count)
{
fputc(_info[i]._ch, fcon);
fputc(',', fcon);
char count[100];
_itoa(_info[i]._count, count, 10);
fputs(count, fcon);
fputc(',', fcon);
fputs(_info[i]._code.c_str(), fcon);
fputc('\n', fcon);
}
}
fclose(fcon);
}
void ReadConfig(const char* filename, CharInfo* HNarry)
{
string conf(filename);
conf = conf + "config";
FILE* fread = fopen(conf.c_str(), "rb");
if (fread == NULL)
{
cout << "打开待压缩文件失败" << endl;
return;
}
char str[100];
while (fgets(str, 100, fread))
{
char* ptr = str;
unsigned char index = (unsigned char)*ptr;
if (index == '\n') //换行符
{
HNarry[index]._ch = index;
fgets(str, 100, fread);
char* ptr = str;
ptr++;
LongType count = 0;//字符的权值，出现的次数
while (*ptr != ',' && *ptr)
{
count *= 10;
count += (*ptr - '0');
ptr++;
}
HNarry[index]._count = count;
ptr++;
string code(ptr);
HNarry[index]._code = code;
}
else
{
HNarry[index]._ch = index;
ptr += 2;
LongType count = 0;//字符的权值，出现的次数
while (*ptr != ',' && *ptr)
{
count *= 10;
count += (*ptr - '0');
ptr++;
}
HNarry[index]._count = count;
ptr++;
string code(ptr);
HNarry[index]._code = code;
}
}
}
protected:
CharInfo _info[256];
};

文件压缩的思路已在代码中表明。

注意：

在文件压缩时最好写配置文件，使用配置文件可以简化压缩程序代码。
在写配置文件和读配置文件的时候要采用一样的规则，不然会出错。在这个程序中，我是以“ 字符，出现次数，哈夫曼编码 ”的格式来写和读的。
压缩文件时要考虑到所有会出现的情况，比如在压缩完最后一个字符时往压缩文件中写的那个字节不满8位，我们也要把这个字节写进压缩文件 .comparess。
在读压缩文件时这里有一个小技巧。重建哈夫曼树后，哈夫曼树的 root 所携带的 _count 值就是所有字符出现次数的总和。利用这个值可以控制读压缩文件 .comparess 时什么时候结束。有时候最后一个字节（8位）的后几位并不是我们需要的，如果把它们也当作编码读取就会出现错误。

以下是此程序压缩与解压缩花费的时间截图：

debug下：

release下：

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航