您的位置:首页 > 其它

使用pdfbox提取pdf文件中的字符信息

2012-09-03 12:03 357 查看
前段时间使用了一下pdfbox(1.6.0)的文本提取功能,发现很好用,但是它能给出的比较准确的结果只有行的粒度。后来又有了定位文章题目、章节标题、自然段落的需求,而pdfbox目前好像没有这方面的支持(尤其是对于中文的期刊论文而言,排版情况很复杂,如一页中存在多篇文章混排等),因此只能先从比较低层次的字符提取入手,同时需要保留字符的位置、大小、字体等信息。pdfbox源码中的一个小例子PrintTextLocations比较接近这一需求,但未给出字体信息,本人这里仿照它重新定义了一个PrintTextLocatins2类,代码如下:

import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import java.io.IOException;

public class PrintTextLocatins2 extends PDFTextStripper{

private static int BOLD_F_NUM = 2;
private static String[] BOLD_FLAGS = {"Bold", "CAJ FNT04"};
private static int ITALIC_F_NUM = 2;
private static String[] ITALIC_FLAGS = {"Italic", "CAJ FNT03"};

private static boolean IsBold(String font)
{
int i;
for (i = 0; i < BOLD_F_NUM; i++)
if (font.contains(BOLD_FLAGS[i]))
return true;
return false;
}

private static boolean IsItalic(String font)
{
int i;
for (i = 0; i < ITALIC_F_NUM; i++)
if (font.contains(ITALIC_FLAGS[i]))
return true;
return false;
}

public PrintTextLocatins2() throws IOException
{
super.setSortByPosition( false );
}

protected void processTextPosition( TextPosition text )
{
//PDFontDescriptor fd = text.getFont().getFontDescriptor();

System.out.println( "String[" +
text.getXDirAdj() + "," +
text.getYDirAdj() +
" fs=" + text.getFontSize() +
" xscale=" + text.getXScale() +
" height=" + text.getHeightDir() +
" space=" + text.getWidthOfSpace() +
" width=" + text.getWidthDirAdj() +
" subfont=" + text.getFont().getSubType() +
" basefont=" + text.getFont().getBaseFont() +
" isBold=" + IsBold(text.getFont().getBaseFont()) +
" isItalic=" + IsItalic(text.getFont().getBaseFont()) +
"]" +
text.getCharacter() );
}

/**
* This will print the usage for this document.
*/
private static void usage()
{
System.err.println( "Usage: java org.apache.pdfbox.examples.pdmodel.PrintTextLocations <input-pdf>" );
}
}


使用方法:

/**
 * Small driver that runs {@link PrintTextLocatins2} over every page of a PDF
 * and writes the per-character layout dump to a text file next to the input.
 */
public class CLayoutTest {

    /** Suffix appended to the input name (minus ".pdf") to form the output name. */
    private static final String LAYOUT_SUFFIX = "_layout.txt";

    /**
     * Derives the output file name from the PDF path: a trailing ".pdf"
     * (matched case-insensitively) is removed, then the layout suffix is
     * appended. Names without that extension are kept whole instead of being
     * cut by four characters.
     */
    private static String layoutFileName(String pdfFile) {
        int stemLength = pdfFile.length() - 4;
        // regionMatches returns false (no exception) when stemLength is negative.
        boolean hasPdfExtension = pdfFile.regionMatches(true, stemLength, ".pdf", 0, 4);
        String stem = hasPdfExtension ? pdfFile.substring(0, stemLength) : pdfFile;
        return stem + LAYOUT_SUFFIX;
    }

    /**
     * Dumps the text positions of every page of {@code file}.
     *
     * <p>While the pages are processed, {@code System.out} is redirected to
     * the layout file (see {@link #layoutFileName(String)}). If that file
     * cannot be opened, the stack trace is printed and the dump goes to the
     * current {@code System.out} instead. Each page is wrapped in
     * {@code <page i>} / {@code </page i>} markers, with {@code i} starting
     * at 0.
     *
     * <p>{@code System.out} is always restored and the layout file is always
     * closed, even when processing fails. Failures are reported on the
     * original {@code System.out}, not in the layout file.
     *
     * @param file path of the PDF to read
     * @throws IOException if closing the document fails
     */
    public void printTextLocations(String file) throws IOException
    {
        String pdfFile = file;

        PDDocument document = null;
        PrintStream old = System.out;
        PrintStream out = null;

        try {
            document = PDDocument.load(pdfFile, true);
            if (document.isEncrypted()) {
                try {
                    // Try the empty password.
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println( "Error: Document is encrypted with a password." );
                    System.exit(1);
                }
            }

            try {
                out = new PrintStream(layoutFileName(pdfFile));
                System.setOut(out);
            } catch (FileNotFoundException e) {
                // Fall back to whatever System.out currently is.
                e.printStackTrace();
            }

            PrintTextLocatins2 printer = new PrintTextLocatins2();

            List allPages = document.getDocumentCatalog().getAllPages();
            for (int i = 0; i < allPages.size(); i++) {
                System.out.println("<page " + i + ">" );

                PDPage page = (PDPage) allPages.get(i);
                PDStream contents = page.getContents();
                if (contents != null) {
                    printer.processStream(page, page.findResources(), contents.getStream());
                }

                System.out.println("</page " + i + ">" );
            }
        } catch (Exception e) {
            // Report on the original stream so the message is visible to the
            // user rather than buried in the layout file.
            old.println(e.toString());
        } finally {
            // Undo the redirection first so later output never hits a closed stream.
            System.setOut(old);
            if (out != null) {
                out.close();
            }
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Entry point. Uses the first command-line argument as the PDF path when
     * one is given; otherwise falls back to the built-in sample path.
     *
     * @param args optional: path of the PDF to process
     */
    public static void main(String[] args) {
        String pdfPath = args.length > 0
                ? args[0]
                : "E:\\eclipse\\workspace\\pdf\\单剂量冰片及单剂量复方制剂中冰片的药物动力学比较研究.pdf";

        CLayoutTest pb_t = new CLayoutTest();
        try {
            pb_t.printTextLocations(pdfPath);
        } catch (Exception e) {
            System.out.println(e.toString());
        }
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: