Commit a50af389 authored by SVN-Git Migration's avatar SVN-Git Migration

Imported Upstream version 20101017+dfsg

parent 6d7bb120
......@@ -25,10 +25,12 @@ pack: distclean MANIFEST
$(PYTHON) setup.py sdist
register: distclean MANIFEST
$(PYTHON) setup.py sdist upload register
MANIFEST:
$(GIT) ls-tree --name-only -r HEAD > MANIFEST
WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
publish:
$(CP) docs/*.html $(WEBDIR)
$(CP) docs/*.html docs/*.png $(WEBDIR)
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPSRC=cmaprsrc
......@@ -50,6 +52,3 @@ test: cmap
cd samples && $(MAKE) test CMP=cmp
test_clean:
-cd samples && $(MAKE) clean
MANIFEST:
$(GIT) ls-tree --name-only -r HEAD > MANIFEST
Metadata-Version: 1.0
Name: pdfminer
Version: 20100829
Version: 20101017
Summary: PDF parser and analyzer
Home-page: http://www.unixuser.org/~euske/python/pdfminer/index.html
Author: Yusuke Shinyama
......
......@@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sun Aug 29 06:43:25 UTC 2010
Last Modified: Sun Oct 17 05:13:01 UTC 2010
<!-- hhmts end -->
</div>
......@@ -185,7 +185,7 @@ Examples:
$ <strong>pdf2txt.py -o output.html samples/naacl06-shinyama.pdf</strong>
(extract text as an HTML file whose filename is output.html)
$ <strong>pdf2txt.py -c euc-jp -D tb-rl -o output.html samples/jo.pdf</strong>
$ <strong>pdf2txt.py -c euc-jp -o output.html samples/jo.pdf</strong>
(extract a Japanese HTML file in vertical writing, CMap is required)
$ <strong>pdf2txt.py -P mypassword -o output.txt secret.pdf</strong>
......@@ -222,10 +222,6 @@ Tags used here are defined in the PDF specification (See &sect;10.7 "<em>Tagged
<dd> Specifies the output directory for image extraction.
Currently only JPEG images are supported.
<p>
<dt> <code>-D <em>writing_mode</em></code>
<dd> Specifies the writing mode of text outputs.
Currently <code>lr-tb</code>, <code>tb-rl</code> and <code>auto</code> is supported.
<p>
<dt> <code>-M <em>char_margin</em></code>
<dt> <code>-L <em>line_margin</em></code>
<dt> <code>-W <em>word_margin</em></code>
......@@ -389,6 +385,8 @@ Also, check out <a href="http://denis.papathanasiou.org/?p=343">a more complete
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2010/10/17: A couple of bugfixes and a minor improvement. Thanks to standardabweichung and Alastair Irving.
<li> 2010/09/07: A minor bugfix. Thanks to Alexander Garden.
<li> 2010/08/29: A couple of bugfixes. Thanks to Sahan Malagi, pk, and Humberto Pereira.
<li> 2010/07/06: Minor bugfixes. Thanks to Federico Brega.
<li> 2010/06/13: Bugfixes and improvements on CMap data compression. Thanks to Jakub Wilk.
......
%TGIF 4.1.45-QPL
%TGIF 4.2.2
state(0,37,100.000,0,0,0,16,1,9,1,1,1,0,0,2,1,1,'Helvetica-Bold',1,69120,0,0,1,10,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0).
%
% @(#)$Header$
......@@ -30,6 +30,8 @@ script_frac("0.6").
fg_bg_colors('black','white').
dont_reencode("FFDingbests:ZapfDingbats").
objshadow_info('#c0c0c0',2,2).
rotate_pivot(0,0,0,0).
spline_tightness(1).
page(1,"",1,'').
oval('black','',350,380,450,430,2,2,1,88,0,0,0,0,0,'2',0,[
]).
......@@ -167,19 +169,19 @@ poly('black','',2,[
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]).
text('black',400,158,1,1,1,68,15,115,12,3,2,0,0,0,2,68,15,0,0,"",0,0,0,0,170,'',[
minilines(68,15,0,0,1,0,0,[
mini_line(68,12,3,0,0,0,[
str_block(0,68,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,68,12,3,0,0,0,0,0,0,0,
"page object")])
text('black',400,158,1,1,1,84,15,115,12,3,2,0,0,0,2,84,15,0,0,"",0,0,0,0,170,'',[
minilines(84,15,0,0,1,0,0,[
mini_line(84,12,3,0,0,0,[
str_block(0,84,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,84,12,3,0,-1,0,0,0,0,0,
"page contents")])
])
])]).
text('black',400,258,1,1,1,115,15,119,12,3,2,0,0,0,2,115,15,0,0,"",0,0,0,0,270,'',[
minilines(115,15,0,0,1,0,0,[
mini_line(115,12,3,0,0,0,[
str_block(0,115,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,115,12,3,0,-1,0,0,0,0,0,
"rendering sequence")])
text('black',400,258,1,1,1,129,15,119,12,3,2,0,0,0,2,129,15,0,0,"",0,0,0,0,270,'',[
minilines(129,15,0,0,1,0,0,[
mini_line(129,12,3,0,0,0,[
str_block(0,129,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,129,12,3,0,-1,0,0,0,0,0,
"rendering instructions")])
])
])]).
docs/objrel.png

1.96 KB | W: | H:

docs/objrel.png

1.99 KB | W: | H:

docs/objrel.png
docs/objrel.png
docs/objrel.png
docs/objrel.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -16,11 +16,51 @@ blockquote { background: #eeeeee; }
This document explains how to use PDFMiner as a library
from other applications.
<ul>
<li> <a href="#overview">Overview</a>
<li> <a href="#basic">Basic Usage</a>
<li> <a href="#layout">Layout Analysis</a>
<li> <a href="#toc">TOC Extraction</a>
<li> <a href="#more">more</a>
</ul>
<a name="overview">
<hr noshade>
<h2>Overview</h2>
<p>
<strong>PDF is evil.</strong> Although it is called a PDF
"document", it's nothing like Word or HTML. PDF is more like a
picture representation. PDF contents are just a bunch of
instructions that tell how to place the stuff at each exact
position on a display or paper. In most cases, it has no logical
structure such as sentences or paragraphs and it cannot adapt
itself when the paper size changes. PDFMiner attempts to
reconstruct some of those structures by guessing from its
positioning, but there's nothing guaranteed to work. Ugly, I
know. Again, PDF is evil.
<p>
Because a PDF file has such a big and complex structure,
parsing a PDF file as a whole is time and memory consuming. However,
not every part is needed for most PDF processing tasks. Therefore
PDFMiner takes a strategy of lazy parsing, which is to parse the
stuff only when it's necessary. To parse PDF files, you need to use at
least two classes: <code>PDFParser</code> and <code>PDFDocument</code>.
These two objects are associated with each other.
<code>PDFParser</code> fetches data from a file,
and <code>PDFDocument</code> stores it. You'll also need
<code>PDFPageInterpreter</code> to process the page contents
and <code>PDFDevice</code> to translate it to whatever you need.
<code>PDFResourceManager</code> is used to store
shared resources such as fonts or images.
<p>
Figure 1 shows the relationship between the classes in PDFMiner.
<div align=center>
<img src="objrel.png"><br>
<small>Figure 1. Relationships between PDFMiner classes</small>
</div>
<a name="basic">
<hr noshade>
<h2>Basic Usage</h2>
......@@ -57,25 +97,11 @@ for page in doc.get_pages():
interpreter.process_page(page)
</pre></blockquote>
<p>
In PDFMiner, there are several Python classes involved in parsing a PDF file,
as shown in Figure 1.
<div align=center>
<img src="objrel.png"><br>
<small>Figure 1. Relationships between PDFMiner objects</small>
</div>
<a name="layout">
<hr noshade>
<h2>Accessing Layout Objects</h2>
<p>
PDF documents are more like graphics, rather than text documents.
In most cases, it presents no logical structure such as sentences or paragraphs.
PDFMiner attempts to reconstruct some of them by performing
basic layout analysis.
<p>
Here is a typical way to do it:
Here is a typical way to use the layout analysis function:
<blockquote><pre>
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
......@@ -172,11 +198,20 @@ for (level,title,dest,a,se) in outlines:
</pre></blockquote>
<p>
In some PDF documents, destinations are referred to as page numbers.
In other PDF documents, destinations are referred to as page numbers plus
the location within the page. Since PDF does not provide a way to
point to graphical objects in a page, normally these in-page destinations
are specified by physical coordinates.
Some PDF documents use page numbers as destinations, while others
use page numbers and the physical location within the page. Since
PDF does not have a logical structure, and it does not provide a
way to refer to any in-page object from the outside, there's no
way to tell exactly which part of the text these destinations are
referring to.
<a name="more">
<hr noshade>
<h2>More</h2>
<p>
You can extend the <code>PDFPageInterpreter</code> and <code>PDFDevice</code> classes
in order to process page contents differently or to obtain other information.
<hr noshade>
<address>Yusuke Shinyama</address>
......
#!/usr/bin/env python
__version__ = '20100829'
__version__ = '20101017'
if __name__ == '__main__': print __version__
......@@ -121,7 +121,7 @@ class FileCMap(CMap):
return '<CMap: %s>' % self.attrs.get('CMapName')
def is_vertical(self):
return self.attrs.get('WMode', 0)
return self.attrs.get('WMode', 0) != 0
def set_attr(self, k, v):
self.attrs[k] = v
......
......@@ -5,7 +5,8 @@ from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup
from layout import LTFigure, LTImage, LTChar, LTTextLine
from layout import LTTextBox, LTTextBoxVertical, LTTextGroup
from utils import apply_matrix_pt, mult_matrix
from utils import enc, bbox2str, create_bmp
......@@ -170,11 +171,11 @@ class TextConverter(PDFConverter):
def receive_layout(self, ltpage):
def render(item):
if isinstance(item, LTText):
self.write(item.text)
elif isinstance(item, LTContainer):
if isinstance(item, LTContainer):
for child in item:
render(child)
elif isinstance(item, LTText):
self.write(item.get_text())
if isinstance(item, LTTextBox):
self.write('\n')
if self.showpageno:
......@@ -231,20 +232,21 @@ class HTMLConverter(PDFConverter):
elif isinstance(item, LTChar):
self.write_text(item.text, item.x0, item.y1, item.get_size())
if self.debug:
self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTPolygon):
self.write_rect('black', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTTextLine):
self.write_rect('magenta', 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, item.y1, item.width, item.height)
self.write_rect('cyan', 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
if self.debug:
self.write_text(str(item.index+1), item.x0, item.y1, 20)
elif isinstance(item, LTFigure):
self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
self.write_rect('yellow', 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
elif isinstance(item, LTImage):
......@@ -315,17 +317,16 @@ class XMLConverter(PDFConverter):
render(child)
self.outfp.write('</textline>\n')
elif isinstance(item, LTTextBox):
self.outfp.write('<textbox id="%d" bbox="%s">\n' % (item.index, bbox2str(item.bbox)))
wmode = ''
if isinstance(item, LTTextBoxVertical):
wmode = ' wmode="vertical"'
self.outfp.write('<textbox id="%d" bbox="%s"%s>\n' % (item.index, bbox2str(item.bbox), wmode))
for child in item:
render(child)
self.outfp.write('</textbox>\n')
elif isinstance(item, LTChar):
vertical = ''
if item.is_vertical():
vertical = 'vertical="true" '
self.outfp.write('<text font="%s" %sbbox="%s" size="%.3f">' %
(enc(item.font.fontname), vertical,
bbox2str(item.bbox), item.get_size()))
self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
(enc(item.font.fontname), bbox2str(item.bbox), item.get_size()))
self.write(item.text)
self.outfp.write('</text>\n')
elif isinstance(item, LTText):
......
This diff is collapsed.
......@@ -5,9 +5,11 @@ try:
except ImportError:
from StringIO import StringIO
from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
from encodingdb import EncodingDB
from encodingdb import EncodingDB, name2unicode
from struct import pack, unpack
from psparser import LIT, STRICT
from psparser import PSStackParser
from psparser import PSSyntaxError, PSEOF
from psparser import LIT, KWD, STRICT
from psparser import PSLiteral, literal_name
from pdftypes import PDFException, resolve1
from pdftypes import int_value, float_value, num_value
......@@ -70,6 +72,46 @@ class FontMetricsDB(object):
return FONT_METRICS[fontname]
## Type1FontHeaderParser
##
class Type1FontHeaderParser(PSStackParser):
    '''Collects encoding entries from the header of a Type 1 font.

    Every `put` keyword whose two operands are an integer code and a
    literal name is recorded as a (code, name) result.  get_encoding()
    then converts the recorded names with name2unicode() and returns
    the resulting code -> unicode dictionary.
    '''

    KEYWORD_BEGIN = KWD('begin')
    KEYWORD_END = KWD('end')
    KEYWORD_DEF = KWD('def')
    KEYWORD_PUT = KWD('put')
    KEYWORD_DICT = KWD('dict')
    KEYWORD_ARRAY = KWD('array')
    KEYWORD_READONLY = KWD('readonly')
    KEYWORD_FOR = KWD('for')

    def __init__(self, data):
        '''data is handed unchanged to PSStackParser.__init__.'''
        PSStackParser.__init__(self, data)
        # code -> unicode mapping, filled in by get_encoding().
        self._cid2unicode = {}
        return

    def get_encoding(self):
        '''Reads results until PSEOF and returns the code -> unicode dict.

        Names for which name2unicode() raises KeyError are skipped,
        so the returned mapping may be partial.
        '''
        while True:
            try:
                (cid, name) = self.nextobject()
            except PSEOF:
                break
            try:
                self._cid2unicode[cid] = name2unicode(name)
            except KeyError:
                # No unicode equivalent known for this name; leave it out.
                pass
        return self._cid2unicode

    def do_keyword(self, pos, token):
        '''Handles a keyword token; only `put` has any effect.

        NOTE(review): presumably invoked by PSStackParser for keywords
        it does not process itself -- confirm against psparser.
        '''
        if token is self.KEYWORD_PUT:
            # Operands are popped as (position, object) pairs.
            ((_, key), (_, value)) = self.pop(2)
            if (isinstance(key, int) and
                    isinstance(value, PSLiteral)):
                self.add_results((key, literal_name(value)))
        return
## CFFFont
## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification")
......@@ -360,7 +402,7 @@ class TrueTypeFont(object):
# create unicode map
unicode_map = FileUnicodeMap()
for (char,gid) in char2gid.iteritems():
unicode_map.add_cid2code(gid, char)
unicode_map.add_cid2unichr(gid, char)
return unicode_map
......@@ -445,9 +487,9 @@ class PDFSimpleFont(PDFFont):
if isinstance(encoding, dict):
name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
diff = list_value(encoding.get('Differences', None))
self.encoding = EncodingDB.get_encoding(name, diff)
self.cid2unicode = EncodingDB.get_encoding(name, diff)
else:
self.encoding = EncodingDB.get_encoding(literal_name(encoding))
self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
self.unicode_map = None
if 'ToUnicode' in spec:
strm = stream_value(spec['ToUnicode'])
......@@ -463,7 +505,7 @@ class PDFSimpleFont(PDFFont):
except KeyError:
pass
try:
return self.encoding[cid]
return self.cid2unicode[cid]
except KeyError:
raise PDFUnicodeNotDefined(None, cid)
......@@ -486,6 +528,13 @@ class PDFType1Font(PDFSimpleFont):
widths = list_value(spec.get('Widths', [0]*256))
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
PDFSimpleFont.__init__(self, descriptor, widths, spec)
if 'Encoding' not in spec and 'FontFile' in descriptor:
# try to recover the missing encoding info from the font file.
self.fontfile = stream_value(descriptor.get('FontFile'))
length1 = int_value(self.fontfile['Length1'])
data = self.fontfile.get_data()[:length1]
parser = Type1FontHeaderParser(StringIO(data))
self.cid2unicode = parser.get_encoding()
return
def __repr__(self):
......
......@@ -30,6 +30,7 @@ from utils import decode_text, ObjIdRange
class PDFSyntaxError(PDFException): pass
class PDFNoValidXRef(PDFSyntaxError): pass
class PDFNoOutlines(PDFException): pass
class PDFDestinationNotFound(PDFException): pass
class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
......@@ -517,7 +518,7 @@ class PDFDocument(object):
def get_outlines(self):
if 'Outlines' not in self.catalog:
raise PDFNoOutlines('No /Outlines defined!')
raise PDFNoOutlines
def search(entry, level):
entry = dict_value(entry)
if 'Title' in entry:
......@@ -558,6 +559,20 @@ class PDFDocument(object):
raise KeyError((cat,key))
return lookup(d0)
def get_dest(self, name):
    '''Returns the object registered for the named destination.

    The lookup_name('Dests', ...) result is preferred; when that raises
    KeyError, the catalog's own 'Dests' dictionary is consulted.
    Raises PDFDestinationNotFound(name) if neither yields an entry.
    '''
    try:
        # PDF-1.2 or later
        return self.lookup_name('Dests', name)
    except KeyError:
        pass
    # PDF-1.1 or prior
    if 'Dests' not in self.catalog:
        raise PDFDestinationNotFound(name)
    legacy_dests = dict_value(self.catalog['Dests'])
    if name not in legacy_dests:
        raise PDFDestinationNotFound(name)
    return legacy_dests[name]
## PDFParser
##
......
......@@ -193,19 +193,6 @@ class PDFStream(PDFObject):
if isinstance(filters, list): return filters
return [ filters ]
def decomp(self,data):
buf = data
# some FlateDecode streams have garbage (newlines, etc) appended to the
# end. remove chars from the end to try and decompress the buffer
while 8 <= len(buf):
try:
# will get errors if the document is encrypted.
dco = zlib.decompressobj()
return dco.decompress(buf)
except zlib.error:
buf = buf[:-1]
raise Exception, "zlib.error while decompressing data"
def decode(self):
assert self.data is None and self.rawdata != None
data = self.rawdata
......@@ -220,7 +207,10 @@ class PDFStream(PDFObject):
for f in filters:
if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted.
data = self.decomp(data)
try:
data = zlib.decompress(data)
except zlib.error:
data = ''
elif f in LITERALS_LZW_DECODE:
data = lzwdecode(data)
elif f in LITERALS_ASCII85_DECODE:
......
......@@ -99,8 +99,8 @@ PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
KEYWORD_BRACE_BEGIN = KWD('{')
KEYWORD_BRACE_END = KWD('}')
KEYWORD_PROC_BEGIN = KWD('{')
KEYWORD_PROC_END = KWD('}')
KEYWORD_ARRAY_BEGIN = KWD('[')
KEYWORD_ARRAY_END = KWD(']')
KEYWORD_DICT_BEGIN = KWD('<<')
......@@ -542,6 +542,15 @@ class PSStackParser(PSBaseParser):
self.push((pos, d))
except PSTypeError:
if STRICT: raise
elif token == KEYWORD_PROC_BEGIN:
# begin proc
self.start_type(pos, 'p')
elif token == KEYWORD_PROC_END:
# end proc
try:
self.push(self.end_type('p'))
except PSTypeError:
if STRICT: raise
else:
if 2 <= self.debug:
print >>stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
......
#!/usr/bin/env python
from sys import maxint as INF
from struct import pack, unpack
......@@ -28,6 +29,17 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
## Utility functions
##
# get_bound
def get_bound(pts):
    '''Compute a minimal rectangle that covers all the points.

    pts is an iterable of (x, y) pairs.  Returns (x0, y0, x1, y1) with
    the smallest and largest x and y found.  For an empty iterable the
    sentinel rectangle (INF, INF, -INF, -INF) is returned.
    '''
    # Materialize once so that one-shot iterators can be scanned twice.
    pts = list(pts)
    if not pts:
        return (INF, INF, -INF, -INF)
    # Take min/max over the coordinates themselves rather than folding
    # from +/-INF: INF is a finite integer, so seeding with it would
    # clamp any coordinate lying beyond that range.
    xs = [x for (x, _) in pts]
    ys = [y for (_, y) in pts]
    return (min(xs), min(ys), max(xs), max(ys))
# pick
def pick(seq, func, maxobj=None):
'''Picks the object obj where func(obj) has the highest value.'''
......@@ -86,7 +98,7 @@ def nunpack(s, default=0):
elif l == 4:
return unpack('>L', s)[0]
else:
return TypeError('invalid length: %d' % l)
raise TypeError('invalid length: %d' % l)
# decode_text
PDFDocEncoding = ''.join( unichr(x) for x in (
......
......@@ -48,6 +48,9 @@ XMLS_NONFREE= \
nonfree/naacl06-shinyama.xml \
nonfree/nlp2004slides.xml
all:
$(MAKE) test CMP=cmp
test: htmls texts xmls
clean:
......
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -3,53 +3,61 @@
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<span style="position:absolute; border: blue 1px solid; left:100px; top:119px; width:61px; height:27px;"></span>
<span style="position:absolute; border: cyan 1px solid; left:100px; top:119px; width:61px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:100px; top:119px; width:61px; height:27px;"></span>
<span style="position:absolute; left:100px; top:119px; font-size:27px;">H</span>
<span style="position:absolute; left:117px; top:119px; font-size:27px;">e</span>
<span style="position:absolute; left:130px; top:119px; font-size:27px;">l</span>
<span style="position:absolute; left:136px; top:119px; font-size:27px;">l</span>
<span style="position:absolute; left:141px; top:119px; font-size:27px;">o</span>
<span style="position:absolute; left:154px; top:119px; font-size:27px;"> </span>
<span style="position:absolute; border: blue 1px solid; left:261px; top:119px; width:62px; height:27px;"></span>
<span style="position:absolute; border: cyan 1px solid; left:261px; top:119px; width:62px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:261px; top:119px; width:62px; height:27px;"></span>
<span style="position:absolute; left:261px; top:119px; font-size:27px;">W</span>
<span style="position:absolute; left:283px; top:119px; font-size:27px;">o</span>
<span style="position:absolute; left:297px; top:119px; font-size:27px;">r</span>
<span style="position:absolute; left:305px; top:119px; font-size:27px;">l</span>
<span style="position:absolute; left:310px; top:119px; font-size:27px;">d</span>
<span style="position:absolute; border: blue 1px solid; left:100px; top:219px; width:61px; height:27px;"></span>
<span style="position:absolute; border: cyan 1px solid; left:100px; top:219px; width:61px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:100px; top:219px; width:61px; height:27px;"></span>
<span style="position:absolute; left:100px; top:219px; font-size:27px;">H</span>
<span style="position:absolute; left:117px; top:219px; font-size:27px;">e</span>
<span style="position:absolute; left:130px; top:219px; font-size:27px;">l</span>
<span style="position:absolute; left:136px; top:219px; font-size:27px;">l</span>
<span style="position:absolute; left:141px; top:219px; font-size:27px;">o</span>
<span style="position:absolute; left:154px; top:219px; font-size:27px;"> </span>
<span style="position:absolute; border: blue 1px solid; left:261px; top:219px; width:62px; height:27px;"></span>
<span style="position:absolute; border: cyan 1px solid; left:261px; top:219px; width:62px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:261px; top:219px; width:62px; height:27px;"></span>
<span style="position:absolute; left:261px; top:219px; font-size:27px;">W</span>
<span style="position:absolute; left:284px; top:219px; font-size:27px;">o</span>
<span style="position:absolute; left:297px; top:219px; font-size:27px;">r</span>
<span style="position:absolute; left:305px; top:219px; font-size:27px;">l</span>
<span style="position:absolute; left:310px; top:219px; font-size:27px;">d</span>
<span style="position:absolute; border: blue 1px solid; left:100px; top:319px; width:111px; height:27px;"></span>
<span style="position:absolute; border: cyan 1px solid; left:100px; top:319px; width:111px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:100px; top:319px; width:111px; height:27px;"></span>
<span style="position:absolute; left:100px; top:319px; font-size:27px;">H</span>
<span style="position:absolute; left:127px; top:319px; font-size:27px;">e</span>
<span style="position:absolute; left:150px; top:319px; font-size:27px;">l</span>
<span style="position:absolute; left:166px; top:319px; font-size:27px;">l</span>
<span style="position:absolute; left:181px; top:319px; font-size:27px;">o</span>
<span style="position:absolute; left:204px; top:319px; font-size:27px;"> </span>
<span style="position:absolute; border: blue 1px solid; left:321px; top:319px; width:102px; height:27px;"></span>
<span style="position:absolute; border: cyan 1px solid; left:321px; top:319px; width:102px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:321px; top:319px; width:102px; height:27px;"></span>
<span style="position:absolute; left:321px; top:319px; font-size:27px;">W</span>
<span style="position:absolute; left:354px; top:319px; font-size:27px;">o</span>
<span style="position:absolute; left:377px; top:319px; font-size:27px;">r</span>
<span style="position:absolute; left:395px; top:319px; font-size:27px;">l</span>
<span style="position:absolute; left:410px; top:319px; font-size:27px;">d</span>
<span style="position:absolute; border: blue 1px solid; left:100px; top:419px; width:111px; height:27px;"></span>
<span style="position:absolute; border: cyan 1px solid; left:100px; top:419px; width:111px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:100px; top:419px; width:111px; height:27px;"></span>
<span style="position:absolute; left:100px; top:419px; font-size:27px;">H</span>
<span style="position:absolute; left:127px; top:419px; font-size:27px;">e</span>
<span style="position:absolute; left:150px; top:419px; font-size:27px;">l</span>
<span style="position:absolute; left:165px; top:419px; font-size:27px;">l</span>
<span style="position:absolute; left:181px; top:419px; font-size:27px;">o</span>
<span style="position:absolute; left:204px; top:419px; font-size:27px;"> </span>
<span style="position:absolute; border: blue 1px solid; left:321px; top:419px; width:102px; height:27px;"></span>
<span style="position:absolute; border: cyan 1px solid; left:321px; top:419px; width:102px; height:27px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:321px; top:419px; width:102px; height:27px;"></span>
<span style="position:absolute; left:321px; top:419px; font-size:27px;">W</span>
<span style="position:absolute; left:353px; top:419px; font-size:27px;">o</span>
<span style="position:absolute; left:377px; top:419px; font-size:27px;">r</span>
......
......@@ -3,7 +3,8 @@
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<span style="position:absolute; border: blue 1px solid; left:0px; top:72px; width:344px; height:554px;"></span>
<span style="position:absolute; border: cyan 1px solid; left:0px; top:72px; width:218px; height:79px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:0px; top:72px; width:218px; height:79px;"></span>
<span style="position:absolute; left:0px; top:96px; font-size:55px;">H</span>
<span style="position:absolute; left:34px; top:96px; font-size:55px;">e</span>
<span style="position:absolute; left:61px; top:96px; font-size:55px;">l</span>
......@@ -14,6 +15,8 @@
<span style="position:absolute; left:170px; top:72px; font-size:55px;">l</span>
<span style="position:absolute; left:181px; top:72px; font-size:55px;">l</span>
<span style="position:absolute; left:192px; top:72px; font-size:55px;">o</span>
<span style="position:absolute; border: cyan 1px solid; left:194px; top:136px; width:48px; height:490px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:194px; top:136px; width:48px; height:490px;"></span>
<span style="position:absolute; left:194px; top:136px; font-size:48px;"></span>
<span style="position:absolute; left:194px; top:184px; font-size:48px;"></span>
<span style="position:absolute; left:194px; top:232px; font-size:48px;"></span>
......@@ -24,12 +27,15 @@
<span style="position:absolute; left:194px; top:448px; font-size:48px;"></span>
<span style="position:absolute; left:194px; top:496px; font-size:48px;"></span>
<span style="position:absolute; left:194px; top:544px; font-size:48px;"></span>
<span style="position:absolute; left:218px; top:599px; font-size:27px;">W</span>
<span style="position:absolute; border: cyan 1px solid; left:241px; top:575px; width:102px; height:51px;"></span>
<span style="position:absolute; border: magenta 1px solid; left:281px; top:575px; width:62px; height:27px;"></span>
<span style="position:absolute; left:281px; top:575px; font-size:27px;">W</span>
<span style="position:absolute; left:304px; top:575px; font-size:27px;">o</span>
<span style="position:absolute; left:317px; top:575px; font-size:27px;">r</span>
<span style="position:absolute; left:325px; top:575px; font-size:27px;">l</span>
<span style="position:absolute; left:330px; top:575px; font-size:27px;">d</span>
<span style="position:absolute; left:218px; top:599px; font-size:27px;">W</span>
<span style="position:absolute; border: magenta 1px solid; left:241px; top:599px; width:40px; height:27px;"></span>
<span style="position:absolute; left:241px; top:599px; font-size:27px;">o</span>
<span style="position:absolute; left:254px; top:599px; font-size:27px;">r</span>
<span style="position:absolute; left:262px; top:599px; font-size:27px;">l</span>
......
HelloHello
World
あいうえおあいうえお W
World
orld
\ No newline at end of file
<?xml version="1.0" encoding="utf-8" ?>
<pages>
<page id="1" bbox="0.000,0.000,612.000,792.000" rotate="0">
<textbox id="0" bbox="0.000,215.032,344.016,769.552">
<textbox id="0" bbox="0.000,690.064,218.688,769.552">
<textline bbox="0.000,690.064,218.688,769.552">
<text font="Helvetica" bbox="0.000,690.064,34.656,745.552" size="55.488">H</text>
<text font="Helvetica" bbox="34.656,690.064,61.344,745.552" size="55.488">e</text>
......@@ -16,56 +16,26 @@
<text>
</text>
</textline>
<textline bbox="194.688,657.760,242.688,705.760">
<text font="unknown" vertical="true" bbox="194.688,657.760,242.688,705.760" size="48.000"></text>
<text>
</text>
</textline>
<textline bbox="194.688,609.760,242.688,657.760">
<text font="unknown" vertical="true" bbox="194.688,609.760,242.688,657.760" size="48.000"></text>
<text>
</text>
</textline>
<textline bbox="194.688,561.760,242.688,609.760">
<text font="unknown" vertical="true" bbox="194.688,561.760,242.688,609.760" size="48.000"></text>
<text>
</text>
</textline>
<textline bbox="194.688,513.760,242.688,561.760">
<text font="unknown" vertical="true" bbox="194.688,513.760,242.688,561.760" size="48.000"></text>
<text>
</text>
</textline>
<textline bbox="194.688,465.760,242.688,513.760">
<text font="unknown" vertical="true" bbox="194.688,465.760,242.688,513.760" size="48.000"></text>
<text>
</text>
</textline>
<textline bbox="194.688,441.760,242.688,489.760">
<text font="unknown" vertical="true" bbox="194.688,441.760,242.688,489.760" size="48.000"></text>
<text>
</text>
</textline>
<textline bbox="194.688,393.760,242.688,441.760">
<text font="unknown" vertical="true" bbox="194.688,393.760,242.688,441.760" size="48.000"></text>
<text>
</text>
</textline>
<textline bbox="194.688,345.760,242.688,393.760">
<text font="unknown" vertical="true" bbox="194.688,345.760,242.688,393.760" size="48.000"></text>