您的位置:首页 > 其它

.net 下如何将文档文件(Word, Pdf等) 中的文本提取出来

2013-01-10 19:58 549 查看
经常有人问我怎么将类似word,pdf这样的文档转换为文本然后索引,.net这方面的解决方案不是很多,为了方便大家,我花了一天时间自己做了一个。

Java版本的Lucene提供了一个叫Tika的工具,用于将word,excel,pdf等文档转换为文本,然后进行索引。但这个工具没有.net版本,要在.net下用,需要借助IKVM.net,很麻烦。而且这个工具实际上底层是调用POI和PDFParse来转换的。从网上搜索到的信息看,POI对office2007以上版本的文档处理有问题,不知道最新版本是否解决了,我没有试过。PDFParse这个东西,我用过.net版本,对中文不支持,不知道Java版本是否支持。

其实.net下完全不需要用这些开源解决方案来解决,因为微软提供了一个官方的解决方案,这个解决方案叫IFilter,这个过滤器是为SQLSERVER的全文索引设计的,但第三方软件可以调用API来完成文档的提取工作。

为了方便大家,我把IFilter转换的功能封装到了一个开源的组件中去,大家可以到下面地址去下载源码:HBTextParse.

调用很简单:

这个是提取文件中的文本到字符串的代码

// Example: extract the text of a user-selected file into a string.
if (openFileDialog.ShowDialog() == DialogResult.OK)
{
    // The file to convert.
    textBoxFilePath.Text = openFileDialog.FileName;

    try
    {
        // Create a TextParse for the path of the file to convert.
        TextParse textParse = new TextParse(textBoxFilePath.Text);

        // Extract the text from the file and display it.
        richTextBoxView.Text = textParse.ConvertToString();
    }
    catch (Exception ex)
    {
        // Report extraction failures the same way the convert-to-file example does,
        // instead of letting the exception escape the event handler.
        MessageBox.Show(ex.Message, "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}


这个是将文件转换为文本文件的代码:

// Example: convert the previously selected file into a text file chosen by the user.
if (saveFileDialog.ShowDialog() == DialogResult.OK)
{
    try
    {
        // Create a TextParse for the path of the file to convert.
        TextParse textParse = new TextParse(textBoxFilePath.Text);

        // Write the extracted text to the file named by saveFileDialog.FileName.
        textParse.ConvertToFile(saveFileDialog.FileName);
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message, "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}


要注意的问题是提取Pdf文档,如果机器是64位操作系统,必须要安装 Adobe PDF iFilter 9 for 64-bit platforms,否则会报异常。这个问题我搞了将近一天才搞定。

支持的文档类型:

目前这个组件可以支持所有Microsoft Office提供的文档类型,包括*.rtf,*.doc,*.docx,*.xls,*.xlsx,*.ppt,*.pptx等等

除了微软Office的文档外,还可以转换

html文档:可以把html文档中的文本提取出来(不包含标签)

Pdf文档:我测试过,对中文支持没有问题

Txt文档

这个代码的核心部分是一个叫FilterCode的类。这个类是从http://ifilter.codeplex.com/这个地方下载的,我对这个类做了改进,加入了转换到文件的方法。我把这个类的代码贴出来,如果对如何调用IFilter的Windows API感兴趣,可以参考这段代码

IFilter的相关API函数如下,通常使用这些API函数就可以通过IFilter接口提取文本。

/// <summary>
/// P/Invoke declaration for <c>LoadIFilter</c> in query.dll. On success the
/// <paramref name="ppIUnk"/> argument receives the <see cref="IFilter"/> for the file.
/// </summary>
/// <param name="pwcsPath">Path of the file to load a filter for (marshalled as Unicode).</param>
/// <param name="pUnkOuter">Controlling IUnknown of an aggregate, if any.</param>
/// <param name="ppIUnk">Receives the loaded filter.</param>
/// <returns>A status code; callers in this file treat 0 as success.</returns>
[DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
static extern int LoadIFilter(
    string pwcsPath,
    [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter,
    ref IFilter ppIUnk);


/// <summary>
/// Managed declaration of the COM <c>IFilter</c> interface. Every method is marked
/// <c>[PreserveSig]</c>, so status codes come back as <c>IFilterReturnCodes</c> values
/// rather than being turned into exceptions by the runtime.
/// </summary>
[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
    /// <summary>
    /// The IFilter::Init method initializes a filtering session.
    /// </summary>
    [PreserveSig]
    IFilterReturnCodes Init(
        // [in] Flag settings from the IFILTER_INIT enumeration for
        // controlling text standardization, property output, embedding
        // scope, and IFilter access patterns.
        IFILTER_INIT grfFlags,

        // [in] The size of the attributes array. When nonzero, cAttributes
        // takes precedence over attributes specified in grfFlags. If no
        // attribute flags are specified and cAttributes is zero, the default
        // is given by the PSGUID_STORAGE storage property set, which contains
        // the date and time of the last write to the file, size, and so on;
        // and by the PID_STG_CONTENTS 'contents' property, which maps to the
        // main contents of the file.
        // For more information about properties and property sets, see
        // Property Sets.
        int cAttributes,

        // [in] Array of pointers to FULLPROPSPEC structures for the
        // requested properties.
        // When cAttributes is nonzero, only the properties in aAttributes
        // are returned.
        IntPtr aAttributes,

        // [out] Information about additional properties available to the
        // caller; from the IFILTER_FLAGS enumeration.
        out IFILTER_FLAGS pdwFlags);

    /// <summary>
    /// The IFilter::GetChunk method positions the filter at the beginning
    /// of the next chunk, or at the first chunk if this is the first call
    /// to the GetChunk method, and returns a description of the current chunk.
    /// </summary>
    [PreserveSig]
    IFilterReturnCodes GetChunk(out STAT_CHUNK pStat);

    /// <summary>
    /// The IFilter::GetText method retrieves text (text-type properties)
    /// from the current chunk, which must have a CHUNKSTATE enumeration
    /// value of CHUNK_TEXT.
    /// </summary>
    [PreserveSig]
    IFilterReturnCodes GetText(
        // [in/out] On entry, the size of awcBuffer array in wide/Unicode
        // characters. On exit, the number of Unicode characters written to
        // awcBuffer.
        // Note that this value is not the number of bytes in the buffer.
        ref int pcwcBuffer,

        // Text retrieved from the current chunk. Do not terminate the
        // buffer with a character.
        [Out(), MarshalAs(UnmanagedType.LPWStr)]
        StringBuilder awcBuffer);

    /// <summary>
    /// The IFilter::GetValue method retrieves a value (public
    /// value-type property) from a chunk, which must have a CHUNKSTATE
    /// enumeration value of CHUNK_VALUE.
    /// </summary>
    [PreserveSig]
    IFilterReturnCodes GetValue(
        // Allocate the PROPVARIANT structure with CoTaskMemAlloc. Some
        // PROPVARIANT structures contain pointers, which can be freed by
        // calling the PropVariantClear function.
        // It is up to the caller of the GetValue method to call the
        // PropVariantClear method.
        ref IntPtr PropVal);

    /// <summary>
    /// The IFilter::BindRegion method retrieves an interface representing
    /// the specified portion of the object.
    /// Currently reserved for future use.
    /// </summary>
    [PreserveSig]
    IFilterReturnCodes BindRegion(
        ref FILTERREGION origPos,
        ref Guid riid,
        ref object ppunk);
}


从文档中提取文本的代码如下:

/// <summary>
/// Utilizes the IFilter interface in Windows to parse the contents of files.
/// </summary>
/// <param name="path">Location of the file to parse.</param>
/// <param name="buffer">
/// Replaced with a new <see cref="StringBuilder"/> that receives the text extracted
/// from the document. It is left empty when the filter fails to initialize.
/// </param>
/// <exception cref="InvalidOperationException">
/// No IFilter could be loaded for <paramref name="path"/>, or the filter reported
/// FILTER_E_NO_TEXT while text was being read from a chunk.
/// </exception>
/// <exception cref="COMException">
/// The filter returned a status code this method does not handle.
/// </exception>
public void GetTextFromDocument(string path, ref StringBuilder buffer)
{
    IFilter filter = null;

    // Initialize the return buffer to 64K.
    buffer = new StringBuilder(64 * 1024);

    // Try to load the filter for the path given.
    // NOTE(review): a boxed IntPtr is passed for pUnkOuter, exactly as before; passing
    // null may be the cleaner way to say "no aggregation" - confirm before changing.
    int hresult = LoadIFilter(path, new IntPtr(0), ref filter);
    if (hresult != 0 || filter == null)
    {
        // If you get here there is no filter for the file type you asked for. Throw an
        // exception for the caller.
        throw new InvalidOperationException("Failed to find IFilter for file " + path);
    }

    try
    {
        IFILTER_FLAGS uflags;

        // Init the filter provider.
        IFilterReturnCodes rtn = filter.Init(
            IFILTER_INIT.IFILTER_INIT_CANON_PARAGRAPHS |
            IFILTER_INIT.IFILTER_INIT_CANON_HYPHENS |
            IFILTER_INIT.IFILTER_INIT_CANON_SPACES |
            IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES |
            IFILTER_INIT.IFILTER_INIT_INDEXING_ONLY,
            0, new IntPtr(0), out uflags);
        if (rtn != IFilterReturnCodes.S_OK)
        {
            // Unchanged contract: a filter that does not initialize yields an empty buffer.
            return;
        }

        // Outer loop reads one chunk of the document at a time. For those chunks that
        // have text, the contents are pulled and put into the return buffer.
        while (true)
        {
            STAT_CHUNK statChunk;
            rtn = filter.GetChunk(out statChunk);

            // Once all chunks have been read, we are done with the file.
            if (rtn == IFilterReturnCodes.FILTER_E_END_OF_CHUNKS)
            {
                break;
            }

            // Unavailable embeddings/links are skipped; move on to the next chunk.
            if (rtn == IFilterReturnCodes.FILTER_E_EMBEDDING_UNAVAILABLE ||
                rtn == IFilterReturnCodes.FILTER_E_LINK_UNAVAILABLE)
            {
                continue;
            }

            if (rtn != IFilterReturnCodes.S_OK)
            {
                throw new COMException("IFilter COM error: " + rtn.ToString());
            }

            // Ignore all non-text chunks.
            if (statChunk.flags != CHUNKSTATE.CHUNK_TEXT)
            {
                continue;
            }

            // Check for whitespace items and add the appropriate breaks.
            switch (statChunk.breakType)
            {
                case CHUNK_BREAKTYPE.CHUNK_NO_BREAK:
                    break;

                case CHUNK_BREAKTYPE.CHUNK_EOW:
                    buffer.Append(' ');
                    break;

                case CHUNK_BREAKTYPE.CHUNK_EOC:
                case CHUNK_BREAKTYPE.CHUNK_EOP:
                case CHUNK_BREAKTYPE.CHUNK_EOS:
                    buffer.AppendLine();
                    break;
            }

            // At this point we have a text chunk. The following loop pulls out all of
            // it and adds it to the buffer.
            bool moreText = true;
            while (moreText)
            {
                // Create a temporary string buffer we can use for the parsing algorithm.
                int cBuffer = DefaultBufferSize;
                StringBuilder sbBuffer = new StringBuilder(DefaultBufferSize);

                // Read the next piece of data up to the size of our local buffer.
                rtn = filter.GetText(ref cBuffer, sbBuffer);
                if (rtn == IFilterReturnCodes.S_OK ||
                    rtn == IFilterReturnCodes.FILTER_S_LAST_TEXT)
                {
                    // If any data was returned, scrub it and then add it to the buffer.
                    CleanUpCharacters(cBuffer, sbBuffer);
                    buffer.Append(sbBuffer.ToString());

                    // If we got back some text but there is no more, terminate the loop.
                    if (rtn == IFilterReturnCodes.FILTER_S_LAST_TEXT)
                    {
                        moreText = false;
                    }
                }
                else if (rtn == IFilterReturnCodes.FILTER_E_NO_MORE_TEXT)
                {
                    // Once all data is exhausted, we are done so terminate.
                    moreText = false;
                }
                else if (rtn == IFilterReturnCodes.FILTER_E_NO_TEXT)
                {
                    // Check for any fatal errors. It is a bug if you land here.
                    System.Diagnostics.Debug.Assert(false, "Should not get here");
                    throw new InvalidOperationException();
                }
                else
                {
                    // Any other status used to leave this loop spinning forever;
                    // surface it the same way unexpected GetChunk results are reported.
                    throw new COMException("IFilter COM error: " + rtn.ToString());
                }
            }
        }
    }
    finally
    {
        // Release the COM filter deterministically so the document is not kept open
        // until the runtime callable wrapper is finalized.
        System.Runtime.InteropServices.Marshal.ReleaseComObject(filter);
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: