您的位置:首页 > 其它

获取EMF文件内全部文字, 并按照左上到右下的顺序排序

2015-07-28 17:11 681 查看
因为工作要求, 需要对EMF文件文字内容做分析.....SO, 如下代码出现了

懒得加注释了, 反正对外接口属性就那么几个, 根据英文猜吧, 很容易的

说明一下:

  这个东西结果会对所有文字内容按照左上到右下的顺序排序(EMF内数据顺序是根据画图顺序来的, 所以不一定是什么顺序, 但是数据分析就要得到行列关系)

但是图片没有行列概念, 所以我简单借鉴了一下纯粹横排版模式: 对于2个文字元素, 只要其中一个的显示范围的垂直中线(中心点的Y坐标)落在另一个的显示范围内, 就认为它们在同一行

2015-10-19:

  1.修改了几个排序时的BUG, 增加了一个对显示区域的处理, 最大限度地减少对排版的影响

  2.修改了获取SmallTextOut的处理方式

{
EMF文件分析单元
读取EMF内文字元素并排版

最后修改时间 2015-10-19

by: 刘志林
E-Mail: lzl_17948876@hotmail.com
}

unit Comm.EMFInfo;

interface

uses
System.Types, System.Generics.Collections,
Vcl.Graphics;

type
{ One text element extracted from an EMF record. }
TEMFStrInfo = record
DisplayRect: TRect; { display area }
Text: string; { displayed text }
LineKey: string; { line marker: elements judged to be on the same line share one key }
end;
PEMFStrInfo = ^TEMFStrInfo;

{ Collects the text elements of one or more metafiles. Append adds the
  elements of a metafile; the records are heap-allocated and released by
  Clear / Destroy. }
TEMFStrInfoList = Class
private
FList: TList<PEMFStrInfo>; { all collected elements, in the order Append stored them }
FDic: TDictionary<string, UInt32>; { element text -> index in FList (a later duplicate text overwrites the earlier index) }
FMaxHeight: Integer; { running sum of the AHeight values produced by Append }
FJSONStrs: string; { when a positional lookup fails, this text is used for a loose search }

function GetItem(Index: UInt32): TEMFStrInfo;
function GetCount: UInt32;
function GetJSONStrs: string;
public
constructor Create;
destructor Destroy; override;

{ Reads the text elements of AEMF, sorts them and stores them; AHeight
  receives the largest DisplayRect.Bottom found in this metafile. }
procedure Append(AEMF: TMetafile; var AHeight: Integer);
{ Releases every stored element and resets MaxHeight and JSONStr. }
procedure Clear;
property Count: UInt32 read GetCount;
property Items[Index: UInt32]: TEMFStrInfo read GetItem;
{ Looks an element up by its exact text. }
function TryGetInfo(AInfoName: string; var AInfo: TEMFStrInfo; var AIndex: UInt32): Boolean;
{ Applies the regular expressions in ALeavePattern one after another, each to
  the first match of the previous one, starting from the collected JSON text. }
function StrAnalyze(ALeavePattern: array of string; var AResult: string): Boolean;
{ The collected text as a JSON array of per-line arrays. }
property JSONStr: string read GetJSONStrs;
property MaxHeight: Integer read FMaxHeight;
end;

implementation

uses
System.SysUtils, System.Classes, System.Generics.Defaults,
System.RegularExpressions,
Winapi.Windows,
Vcl.Printers,
QJSON;

const
// Bit flags tested against the fuOptions field of an EMR_SMALLTEXTOUT record
// to find out which of the record layouts declared in this unit applies.

// if set use ANSI version else UNICODE
SMALLTEXT_TYPE_ANSI = $200;
// if set use EMR_SMALLTEXTOUT else use EMR_SMALLTEXTOUTCLIP
SMALLTEXT_TYPE_WITHOUT_CLIP = $100;

// Structures
// Record layouts used to read EMR_SMALLTEXTOUT records. All variants share
// the same leading fields (EMR_SMALLTEXTOUT_HEAD); the variants with a clip
// rectangle insert rclClip before the string, and the A/W variants differ
// only in the character type of the trailing string.
type
// Common leading fields; read first to inspect fuOptions and pick a variant.
EMR_SMALLTEXTOUT_HEAD = RECORD
emr: emr;
ptlReference: TPoint;
nChars: DWORD;
fuOptions: DWORD; // this record type
// may contain SMALLTEXT_TYPE_WITHOUT_CLIP
// and/or SMALLTEXT_TYPE_ANSI
// also holds fuOptions like in the ExtTextOut function
iGraphicsMode: DWORD; // See iMode parameter of SetGraphicsMode
exScale: Single; { X and Y scales from Page units to .01mm units }
eyScale: Single; { if graphics mode is GM_COMPATIBLE. }
END;

PEMRSmallTextOutHead = ^EMR_SMALLTEXTOUT_HEAD;

// ANSI string, with clip rectangle.
EMR_SMALLTEXTOUTCLIPA = RECORD
emr: emr;
ptlReference: TPoint; // might be in negative numbers, so take abs
nChars: DWORD;
fuOptions: DWORD; // this record type
// != SMALLTEXT_TYPE_WITHOUT_CLIP
// == SMALLTEXT_TYPE_ANSI
// also holds fuOptions like in the ExtTextOut function
iGraphicsMode: DWORD; // See iMode parameter of SetGraphicsMode
exScale: Single; { X and Y scales from Page units to .01mm units }
eyScale: Single; { if graphics mode is GM_COMPATIBLE. }
rclClip: TRect;
cString: Array [0 .. 0] of AnsiChar;
{ This is followed by the string array }
END;

PEMRSmallTextOutClipA = ^EMR_SMALLTEXTOUTCLIPA;

// Wide (UTF-16) string, with clip rectangle.
EMR_SMALLTEXTOUTCLIPW = RECORD
emr: emr;
ptlReference: TPoint;
nChars: DWORD;
fuOptions: DWORD; // this record type
// != SMALLTEXT_TYPE_WITHOUT_CLIP
// != SMALLTEXT_TYPE_ANSI
// also holds fuOptions like in the ExtTextOut function
iGraphicsMode: DWORD; // See iMode parameter of SetGraphicsMode
exScale: Single; { X and Y scales from Page units to .01mm units }
eyScale: Single; { if graphics mode is GM_COMPATIBLE. }
rclClip: TRect;
cString: Array [0 .. 0] of WideChar;
{ This is followed by the string array }
END;

PEMRSmallTextOutClipW = ^EMR_SMALLTEXTOUTCLIPW;

// ANSI string, no clip rectangle.
EMR_SMALLTEXTOUTA = RECORD
emr: emr;
ptlReference: TPoint;
nChars: DWORD;
fuOptions: DWORD; // this record type
// == SMALLTEXT_TYPE_WITHOUT_CLIP
// == SMALLTEXT_TYPE_ANSI
// also holds fuOptions like in the ExtTextOut function
iGraphicsMode: DWORD; // See iMode parameter of SetGraphicsMode
exScale: Single; { X and Y scales from Page units to .01mm units }
eyScale: Single; { if graphics mode is GM_COMPATIBLE. }
cString: Array [0 .. 0] of AnsiChar;
{ This is followed by the string array }
END;

PEMRSmallTextOutA = ^EMR_SMALLTEXTOUTA;

// Wide (UTF-16) string, no clip rectangle.
EMR_SMALLTEXTOUTW = RECORD
emr: emr;
ptlReference: TPoint;
nChars: DWORD;
fuOptions: DWORD; // this record type
// == SMALLTEXT_TYPE_WITHOUT_CLIP
// != SMALLTEXT_TYPE_ANSI
// also holds fuOptions like in the ExtTextOut function
iGraphicsMode: DWORD; // See iMode parameter of SetGraphicsMode
exScale: Single; { X and Y scales from Page units to .01mm units }
eyScale: Single; { if graphics mode is GM_COMPATIBLE. }
cString: Array [0 .. 0] of WideChar;
{ This is followed by the string array }
END;

PEMRSmallTextOutW = ^EMR_SMALLTEXTOUTW;

var
// Off-screen bitmap whose canvas handle is passed to GetTextExtentPoint32 to
// measure extracted text. Created in the unit's initialization section and
// released in its finalization section.
FReferenceDC: VCL.Graphics.TBitmap;

{ EnumEnhMetaFile callback that collects the text carried by EMR_EXTTEXTOUTA,
  EMR_EXTTEXTOUTW and EMR_SMALLTEXTOUT records.

  DC, lpHTable, nObj: supplied by the enumeration, not used here.
  EMFR:   the record being enumerated.
  lpData: the TList<PEMFStrInfo> that receives the collected elements; each
          added element is allocated with New and owned by that list's user.
          Declared as LPARAM (pointer sized) so the reference is not
          truncated on 64-bit targets; on 32-bit it is the same size as the
          Integer used before.
  Result: always 1, so the enumeration continues to the last record.

  An element is only collected when its trimmed text is not empty and its
  display rectangle (shrunk to the measured text extent) passes
  _IsEffeetiveRect. }
function EnumTextProc(DC: HDC; lpHTable: PHANDLETABLE; EMFR: PENHMETARECORD;
  nObj: Integer; lpData: LPARAM): Integer; stdcall;

  { True when the rectangle is non-empty, lies right of the origin and is
    more than 4 units wide and high. }
  function _IsEffeetiveRect(const ARect: TRect): Boolean;
  begin
    Result := (not ARect.IsEmpty) and (ARect.Right > 0) and (ARect.Left > 0)
      and (ARect.Bottom - ARect.Top > 4) and (ARect.Right - ARect.Left > 4);
  end;

  { Limits the rectangle's width and height to ASize, keeping its top-left. }
  procedure _ShrinkRect(var ARect: TRect; ASize: TSize);
  var
    v: Integer;
  begin
    v := ARect.Left + ASize.cx;
    if ARect.Right > v then
      ARect.Right := v;
    v := ARect.Top + ASize.cy;
    if ARect.Bottom > v then
      ARect.Bottom := v;
  end;

  { Converts ACount ANSI characters at AChars to a trimmed string. The text
    stops at the first #0, exactly as a PAnsiChar conversion would. }
  function _AnsiToText(AChars: PAnsiChar; ACount: DWORD): string;
  var
    nBuf: AnsiString;
  begin
    SetString(nBuf, AChars, ACount);
    Result := Trim(string(PAnsiChar(nBuf)));
  end;

  { Converts ACount UTF-16 characters at AChars to a trimmed string. The text
    stops at the first #0, exactly as a PWideChar conversion would. }
  function _WideToText(AChars: PWideChar; ACount: DWORD): string;
  var
    nBuf: string;
  begin
    SetString(nBuf, AChars, ACount);
    Result := Trim(string(PWideChar(nBuf)));
  end;

  { Measures AText, shrinks ARect to the measured extent and, when both text
    and rectangle are usable, appends a new element to AList. }
  procedure _Collect(AList: TList<PEMFStrInfo>; const AText: string;
    const ARect: TRect);
  var
    nSize: TSize;
    nRect: TRect;
    nItem: PEMFStrInfo;
  begin
    if AText = '' then
      Exit;

    nSize.cx := 0;
    nSize.cy := 0;
    Winapi.Windows.GetTextExtentPoint32(FReferenceDC.Canvas.Handle,
      AText, Length(AText), nSize);

    nRect := ARect;
    nRect.NormalizeRect;
    _ShrinkRect(nRect, nSize);
    if not _IsEffeetiveRect(nRect) then
      Exit;

    New(nItem);
    try
      nItem^.Text := AText;
      nItem^.DisplayRect := nRect;
      nItem^.LineKey := '';
      AList.Add(nItem);
    except
      Dispose(nItem); { do not leak the record if it could not be stored }
      raise;
    end;
  end;

var
  nElements: TList<PEMFStrInfo>;
  nEMRTO: PEMRExtTextOut;
  nHead: PEMRSmallTextOutHead;
  nIsAnsi, nNoClip: Boolean;
  nText: string;
  nRect: TRect;
begin
  Result := 1;
  nElements := Pointer(lpData);

  if EMFR.iType = EMR_EXTTEXTOUTA then
  begin
    nEMRTO := PEMRExtTextOut(EMFR);
    { offString is a byte offset from the start of the record }
    nText := _AnsiToText(PAnsiChar(PByte(EMFR) + nEMRTO.EMRText.offString),
      nEMRTO.EMRText.nChars);
    _Collect(nElements, nText, nEMRTO.rclBounds);
  end
  else if EMFR.iType = EMR_EXTTEXTOUTW then
  begin
    nEMRTO := PEMRExtTextOut(EMFR);
    nText := _WideToText(PWideChar(PByte(EMFR) + nEMRTO.EMRText.offString),
      nEMRTO.EMRText.nChars);
    _Collect(nElements, nText, nEMRTO.rclBounds);
  end
  else if EMFR.iType = EMR_SMALLTEXTOUT then
  begin
    nHead := PEMRSmallTextOutHead(EMFR);
    nIsAnsi := (nHead.fuOptions and SMALLTEXT_TYPE_ANSI) = SMALLTEXT_TYPE_ANSI;
    nNoClip := (nHead.fuOptions and SMALLTEXT_TYPE_WITHOUT_CLIP) =
      SMALLTEXT_TYPE_WITHOUT_CLIP;

    if nNoClip then
    begin
      { no clip rectangle in the record: start at the reference point and let
        _Collect shrink the open-ended rectangle to the text extent }
      nRect := Rect(nHead.ptlReference.X, nHead.ptlReference.Y, MAXWORD, MAXWORD);
      if nIsAnsi then
        nText := _AnsiToText(@PEMRSmallTextOutA(nHead)^.cString[0], nHead.nChars)
      else
        nText := _WideToText(@PEMRSmallTextOutW(nHead)^.cString[0], nHead.nChars);
    end
    else
    begin
      { clip rectangle present: use it, anchored at the reference point }
      if nIsAnsi then
      begin
        nRect := PEMRSmallTextOutClipA(nHead)^.rclClip;
        nText := _AnsiToText(@PEMRSmallTextOutClipA(nHead)^.cString[0], nHead.nChars);
      end
      else
      begin
        nRect := PEMRSmallTextOutClipW(nHead)^.rclClip;
        nText := _WideToText(@PEMRSmallTextOutClipW(nHead)^.cString[0], nHead.nChars);
      end;
      nRect.TopLeft := nHead.ptlReference;
    end;

    _Collect(nElements, nText, nRect);
  end;
end;

type
{ Comparer used to sort extracted text elements. Besides returning the
  ordering, Compare also writes a shared LineKey into both elements when it
  judges them to be on the same line. }
TEMFStrInfoCompare = class(TComparer<PEMFStrInfo>)
public
function Compare(const Left, Right: PEMFStrInfo): Integer; override;
end;

{ TEMFStrInfoCompare }

{ Orders two text elements top-to-bottom, then left-to-right.

  Two elements count as being on the same line when the vertical centre of
  either one lies strictly inside the other's display rectangle. In that case
  both receive the same LineKey (an existing key of Left wins, then an
  existing key of Right, otherwise a fresh GUID) and the result is decided by
  the centre X, with the centre Y as tie-breaker. Otherwise the result tells
  whether Left's centre lies above (-1) or below (1) Right's rectangle. }
function TEMFStrInfoCompare.Compare(const Left, Right: PEMFStrInfo): Integer;

  { -1 when AY is at or above the rectangle's top, 1 when it is at or below
    its bottom, 0 when it is strictly inside. }
  function _VerticalSide(AY: Integer; const AArea: TRect): Integer;
  begin
    if AY <= AArea.Top then
      Exit(-1);
    if AY >= AArea.Bottom then
      Exit(1);
    Result := 0;
  end;

var
  nLeftMid, nRightMid: TPoint;
  nLeftVsRight, nRightVsLeft: Integer;
begin
  nLeftMid := Left^.DisplayRect.CenterPoint;
  nRightMid := Right^.DisplayRect.CenterPoint;

  nLeftVsRight := _VerticalSide(nLeftMid.Y, Right^.DisplayRect);
  nRightVsLeft := _VerticalSide(nRightMid.Y, Left^.DisplayRect);

  { neither centre lies inside the other rectangle: different lines }
  if (nLeftVsRight <> 0) and (nRightVsLeft <> 0) then
    Exit(nLeftVsRight);

  { same line: make both elements carry the same line key }
  if Left^.LineKey <> '' then
    Right^.LineKey := Left^.LineKey
  else if Right^.LineKey <> '' then
    Left^.LineKey := Right^.LineKey
  else
  begin
    Left^.LineKey := TGUID.NewGuid.ToString;
    Right^.LineKey := Left^.LineKey;
  end;

  { within a line the horizontal centre decides, then the vertical centre }
  if nLeftMid.X <> nRightMid.X then
  begin
    if nLeftMid.X < nRightMid.X then
      Result := -1
    else
      Result := 1;
  end
  else if nLeftMid.Y <> nRightMid.Y then
  begin
    if nLeftMid.Y < nRightMid.Y then
      Result := -1
    else
      Result := 1;
  end
  else
    Result := 0;
end;

{ TEMFStrInfoList }

{ Extracts the text elements of AEMF, sorts them top-left to bottom-right and
  appends them to this list.

  AEMF:    the metafile to read.
  AHeight: receives the largest DisplayRect.Bottom among the elements of this
           metafile (0 when it has none).

  Each element's rectangle is moved down by the current MaxHeight before it is
  stored; afterwards MaxHeight grows by AHeight. The element text is
  registered in the lookup dictionary and added to the JSON text, grouped by
  line. }
procedure TEMFStrInfoList.Append(AEMF: TMetafile; var AHeight: Integer);
var
  nList: TList<PEMFStrInfo>;
  nCompare: TEMFStrInfoCompare;
  nPI: PEMFStrInfo;
  i, nMoved, nIndex: Integer;
  nTmpLineKey, nTmpJSONStr: string;
  nJ, nJLine: TQJson;
begin
  nList := TList<PEMFStrInfo>.Create;
  nMoved := 0; { number of leading nList items whose ownership moved to FList }
  try
    { read the metafile's text elements into the temporary list }
    EnumEnhMetafile(0, AEMF.Handle, @EnumTextProc, Pointer(nList), Rect(0, 0, 0, 0));

    nCompare := TEMFStrInfoCompare.Create;
    try
      try
        nList.Sort(nCompare);
      finally
        nCompare.Free;
      end;
    except
      { Deliberate best effort: a failure while sorting is ignored and the
        elements are stored in whatever order the list is left in. }
    end;

    { compute the height, register the element texts, build the JSON lines }
    AHeight := 0;
    nJ := TQJson.Create;
    try
      nJ.DataType := jdtArray;
      nJLine := nil;
      nTmpLineKey := '';
      for i := 0 to nList.Count - 1 do
      begin
        nPI := nList[i];
        if nPI^.LineKey = '' then
          nPI^.LineKey := TGUID.NewGuid.ToString; { element alone on its line }

        { a different line key starts a new JSON line }
        if not SameText(nTmpLineKey, nPI^.LineKey) then
          nJLine := nil;
        nTmpLineKey := nPI^.LineKey;

        if nPI^.DisplayRect.Bottom > AHeight then
          AHeight := nPI^.DisplayRect.Bottom;

        OffsetRect(nPI^.DisplayRect, 0, FMaxHeight);
        nIndex := FList.Add(nPI);
        Inc(nMoved); { from here on FList owns the record }
        FDic.AddOrSetValue(nPI^.Text, UInt32(nIndex));

        if nJLine = nil then
          nJLine := nJ.AddArray('');
        nJLine.Add.AsString := nPI^.Text;
      end;

      { strip the outer brackets so the fragments can be concatenated }
      nTmpJSONStr := nJ.Encode(False);
      nTmpJSONStr := Copy(nTmpJSONStr, 2, Length(nTmpJSONStr) - 2);
      { a metafile without text contributes nothing; appending it anyway
        would leave a dangling comma and make JSONStr invalid }
      if nTmpJSONStr <> '' then
      begin
        if FJSONStrs = '' then
          FJSONStrs := nTmpJSONStr
        else
          FJSONStrs := FJSONStrs + ',' + nTmpJSONStr;
      end;
    finally
      nJ.Free;
    end;
    FMaxHeight := FMaxHeight + AHeight;
  finally
    { release records that never reached FList (only possible after an error) }
    for i := nMoved to nList.Count - 1 do
      Dispose(nList[i]);
    nList.Free;
  end;
end;

{ Releases every stored element and resets the list to its initial state:
  no elements, empty lookup dictionary, MaxHeight 0 and empty JSON text. }
procedure TEMFStrInfoList.Clear;
var
  nItem: PEMFStrInfo;
begin
  FMaxHeight := 0;
  FJsonStrs := '';

  { the records were allocated with New, so each one is disposed explicitly }
  for nItem in FList do
    Dispose(nItem);

  FList.Clear;
  FDic.Clear;
end;

{ Creates an empty list: no elements, empty lookup dictionary, MaxHeight 0
  and empty JSON text. }
constructor TEMFStrInfoList.Create;
begin
  FMaxHeight := 0;
  FJsonStrs := '';
  FDic := TDictionary<string, UInt32>.Create;
  FList := TList<PEMFStrInfo>.Create;
end;

{ Releases every stored element and the internal containers.

  A destructor also runs when the constructor raised part-way through, in
  which case FList may still be nil; the element loop is therefore guarded
  (Free itself is nil-safe). }
destructor TEMFStrInfoList.Destroy;
var
  nItem: PEMFStrInfo;
begin
  if FList <> nil then
    for nItem in FList do
      Dispose(nItem); { records were allocated with New }
  FList.Free;
  FDic.Free;
  inherited;
end;

{ Getter of Count: the number of stored elements. }
function TEMFStrInfoList.GetCount: UInt32;
begin
  Exit(FList.Count);
end;

{ Getter of Items: returns a copy of the element stored at Index. }
function TEMFStrInfoList.GetItem(Index: UInt32): TEMFStrInfo;
var
  nStored: PEMFStrInfo;
begin
  nStored := FList[Index];
  Result := nStored^;
end;

{ Getter of JSONStr: the collected per-line fragments wrapped in the outer
  array brackets. }
function TEMFStrInfoList.GetJSONStrs: string;
begin
  Result := Concat('[', FJSONStrs, ']');
end;

{ Narrows the collected JSON text down with a chain of regular expressions.

  ALeavePattern: patterns applied in order; each one is matched (multi-line
                 mode) against the first match of the previous pattern, the
                 first against the collected JSON text without its outer
                 brackets.
  AResult:       receives the first match of the last pattern, or '' as soon
                 as any pattern does not match.
  Result:        True when AResult is not empty.

  Raises Exception (with the original message embedded) when the regular
  expression engine raises, e.g. for an invalid pattern. }
function TEMFStrInfoList.StrAnalyze(ALeavePattern: array of string; var AResult: string): Boolean;

  { First match of APattern in AData, or '' when there is none. }
  function _RegExAnalyze(AData, APattern: string): string;
  var
    nMatch: TMatch;
  begin
    { A string Result is not cleared on entry, so it must be set explicitly;
      otherwise a failed match could hand back a stale value. }
    Result := '';
    nMatch := TRegEx.Match(AData, APattern, [roMultiLine]);
    if nMatch.Success then
      Result := nMatch.Value;
  end;

var
  i: Integer;
  nTmpData: string;
begin
  AResult := '';
  try
    nTmpData := FJSONStrs;
    for i := Low(ALeavePattern) to High(ALeavePattern) do
    begin
      nTmpData := _RegExAnalyze(nTmpData, ALeavePattern[i]);
      if nTmpData = '' then
        Break; { nothing left to narrow down }
    end;
    AResult := nTmpData;
  except
    on E: Exception do
      raise Exception.CreateFmt('正则分析失败[%s]', [E.Message]);
  end;
  Result := AResult <> '';
end;

{ Looks an element up by its exact text.

  AInfoName: the element text to search for.
  AInfo:     receives a copy of the element when found; untouched otherwise.
  AIndex:    receives whatever the dictionary lookup writes: the element's
             index in the list when found.
  Result:    True when an element with that text exists. }
function TEMFStrInfoList.TryGetInfo(AInfoName: string; var AInfo: TEMFStrInfo; var AIndex: UInt32): Boolean;
begin
  if not FDic.TryGetValue(AInfoName, AIndex) then
    Exit(False);

  AInfo := FList[AIndex]^;
  Result := True;
end;

initialization
  { 24-bit, 2048 x 2048 off-screen bitmap used as the device context for
    measuring extracted text }
  FReferenceDC := VCL.Graphics.TBitmap.Create;
  FReferenceDC.PixelFormat := pf24bit;
  FReferenceDC.Width := 2048;
  FReferenceDC.Height := 2048;

finalization
  FreeAndNil(FReferenceDC);

end.
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: