Commit a4c89e61 authored by SVN-Git Migration's avatar SVN-Git Migration

Imported Upstream version 20100619p1+dfsg

parent fbc2adf9
MANIFEST
Makefile
README.txt
cmaprsrc/README.txt
cmaprsrc/cid2code_Adobe_CNS1.txt
cmaprsrc/cid2code_Adobe_GB1.txt
cmaprsrc/cid2code_Adobe_Japan1.txt
cmaprsrc/cid2code_Adobe_Korea1.txt
docs/cid.obj
docs/cid.png
docs/index.html
docs/layout.obj
docs/layout.png
docs/miningpdf.html
docs/objrel.obj
docs/objrel.png
docs/programming.html
pdfminer/Makefile
pdfminer/__init__.py
pdfminer/arcfour.py
pdfminer/ascii85.py
pdfminer/cmap/Makefile
pdfminer/cmap/__init__.py
pdfminer/cmapdb.py
pdfminer/converter.py
pdfminer/encodingdb.py
pdfminer/fontmetrics.py
pdfminer/glyphlist.py
pdfminer/latin_enc.py
pdfminer/layout.py
pdfminer/lzw.py
pdfminer/pdfcolor.py
pdfminer/pdfdevice.py
pdfminer/pdffont.py
pdfminer/pdfinterp.py
pdfminer/pdfparser.py
pdfminer/pdftypes.py
pdfminer/psparser.py
pdfminer/rijndael.py
pdfminer/runlength.py
pdfminer/utils.py
samples/Makefile
samples/README
samples/jo.html.ref
samples/jo.pdf
samples/jo.tex
samples/jo.txt.ref
samples/jo.xml.ref
samples/nonfree/dmca.html.ref
samples/nonfree/dmca.pdf
samples/nonfree/dmca.txt.ref
samples/nonfree/dmca.xml.ref
samples/nonfree/f1040nr.html.ref
samples/nonfree/f1040nr.pdf
samples/nonfree/f1040nr.txt.ref
samples/nonfree/f1040nr.xml.ref
samples/nonfree/i1040nr.html.ref
samples/nonfree/i1040nr.pdf
samples/nonfree/i1040nr.txt.ref
samples/nonfree/i1040nr.xml.ref
samples/nonfree/kampo.html.ref
samples/nonfree/kampo.pdf
samples/nonfree/kampo.txt.ref
samples/nonfree/kampo.xml.ref
samples/nonfree/naacl06-shinyama.html.ref
samples/nonfree/naacl06-shinyama.pdf
samples/nonfree/naacl06-shinyama.txt.ref
samples/nonfree/naacl06-shinyama.xml.ref
samples/nonfree/nlp2004slides.html.ref
samples/nonfree/nlp2004slides.pdf
samples/nonfree/nlp2004slides.txt.ref
samples/nonfree/nlp2004slides.xml.ref
samples/simple1.html.ref
samples/simple1.pdf
samples/simple1.txt.ref
samples/simple1.xml.ref
samples/simple2.html.ref
samples/simple2.pdf
samples/simple2.txt.ref
samples/simple2.xml.ref
setup.py
tools/Makefile
tools/conv_afm.py
tools/conv_cmap.py
tools/dumppdf.py
tools/latin2ascii.py
tools/pdf2html.cgi
tools/pdf2txt.py
tools/prof.py
......@@ -4,7 +4,6 @@
PACKAGE=pdfminer
PREFIX=/usr/local
SVN=svn
PYTHON=python
RM=rm -f
CP=cp -f
......@@ -16,17 +15,15 @@ install:
clean:
-$(PYTHON) setup.py clean
-$(RM) -r build dist
-$(RM) -r build dist MANIFEST
-cd $(PACKAGE) && $(MAKE) clean
-cd tools && $(MAKE) clean
distclean: clean test_clean cmap_clean
commit: distclean
$(SVN) commit
pack: distclean
pack: distclean MANIFEST
$(PYTHON) setup.py sdist
register: distclean
register: distclean MANIFEST
$(PYTHON) setup.py sdist upload register
WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
......@@ -36,20 +33,28 @@ publish:
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPSRC=cmaprsrc
CMAPDST=pdfminer/cmap
cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py
cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
cmap_clean:
cd $(CMAPDST) && make cmap_clean
$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py:
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
$(CMAPDST)/TO_UNICODE_Adobe_GB1.py:
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
test: cmap
cd samples && $(MAKE) test CMP=cmp
test_clean:
-cd samples && $(MAKE) clean
SED=sed
FIND=find . '(' -name .git -o -name .svn -o -name CVS -o -name dev -o -name dist -o -name build ')' -prune -false -o
SORT=sort
TOUCH=touch
MANIFEST:
$(TOUCH) MANIFEST
$(FIND) -type f '!' -name '.*' | $(SED) 's:./::' | $(SORT) > MANIFEST
Metadata-Version: 1.0
Name: pdfminer
Version: 20100424
Version: 20100619p1
Summary: PDF parser and analyzer
Home-page: http://www.unixuser.org/~euske/python/pdfminer/index.html
Author: Yusuke Shinyama
......@@ -14,7 +14,7 @@ Description: PDFMiner is a tool for extracting information from PDF documents.
It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible
PDF parser that can be used for other purposes instead of text analysis.
Keywords: pdf parser,pdf converter,text mining
Keywords: pdf parser,pdf converter,layout analysis,text mining
Platform: UNKNOWN
Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
......
README.txt for cmaprsrc
This directory contains Adobe CMap resources. CMaps are required
to decode text data written in Chinese, Japanese or Korean language.
to decode text data written in CJK (Chinese, Japanese, Korean) languages.
CMap resources are now available freely from Adobe web site:
http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
......
%TGIF 4.1.45-QPL
state(0,37,100.000,0,0,0,16,1,9,1,1,2,0,1,0,1,1,'NewCenturySchlbk-Bold',1,103680,0,0,1,10,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0).
%
% @(#)$Header$
% %W%
%
unit("1 pixel/pixel").
color_info(19,65535,0,[
"magenta", 65535, 0, 65535, 65535, 0, 65535, 1,
"red", 65535, 0, 0, 65535, 0, 0, 1,
"green", 0, 65535, 0, 0, 65535, 0, 1,
"blue", 0, 0, 65535, 0, 0, 65535, 1,
"yellow", 65535, 65535, 0, 65535, 65535, 0, 1,
"pink", 65535, 49344, 52171, 65535, 49344, 52171, 1,
"cyan", 0, 65535, 65535, 0, 65535, 65535, 1,
"CadetBlue", 24415, 40606, 41120, 24415, 40606, 41120, 1,
"white", 65535, 65535, 65535, 65535, 65535, 65535, 1,
"black", 0, 0, 0, 0, 0, 0, 1,
"DarkSlateGray", 12079, 20303, 20303, 12079, 20303, 20303, 1,
"#00000000c000", 0, 0, 49344, 0, 0, 49152, 1,
"#820782070000", 33410, 33410, 0, 33287, 33287, 0, 1,
"#3cf3fbee34d2", 15420, 64507, 13364, 15603, 64494, 13522, 1,
"#3cf3fbed34d3", 15420, 64507, 13364, 15603, 64493, 13523, 1,
"#ffffa6990000", 65535, 42662, 0, 65535, 42649, 0, 1,
"#ffff0000fffe", 65535, 0, 65535, 65535, 0, 65534, 1,
"#fffe0000fffe", 65535, 0, 65535, 65534, 0, 65534, 1,
"#fffe00000000", 65535, 0, 0, 65534, 0, 0, 1
]).
script_frac("0.6").
fg_bg_colors('black','white').
dont_reencode("FFDingbests:ZapfDingbats").
objshadow_info('#c0c0c0',2,2).
page(1,"",1,'').
text('black',90,95,1,1,1,66,20,0,15,5,0,0,0,0,2,66,20,0,0,"",0,0,0,0,110,'',[
minilines(66,20,0,0,1,0,0,[
mini_line(66,15,5,0,0,0,[
str_block(0,66,15,5,0,-1,0,0,0,[
str_seg('black','Courier-Bold',1,103680,66,15,5,0,-1,0,0,0,0,0,
"U+30FC")])
])
])]).
text('black',100,285,1,1,1,66,20,3,15,5,0,0,0,0,2,66,20,0,0,"",0,0,0,0,300,'',[
minilines(66,20,0,0,1,0,0,[
mini_line(66,15,5,0,0,0,[
str_block(0,66,15,5,0,-2,0,0,0,[
str_seg('black','Courier-Bold',1,103680,66,15,5,0,-2,0,0,0,0,0,
"U+5199")])
])
])]).
text('black',400,38,2,1,1,119,30,5,12,3,0,0,0,0,2,119,30,0,0,"",0,0,0,0,50,'',[
minilines(119,30,0,0,1,0,0,[
mini_line(83,12,3,0,0,0,[
str_block(0,83,12,3,0,-3,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,83,12,3,0,-3,0,0,0,0,0,
"Adobe-Japan1")])
]),
mini_line(119,12,3,0,0,0,[
str_block(0,119,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,119,12,3,0,-1,0,0,0,0,0,
"CID:660 (horizontal)")])
])
])]).
text('black',400,118,2,1,1,114,30,8,12,3,0,0,0,0,2,114,30,0,0,"",0,0,0,0,130,'',[
minilines(114,30,0,0,1,0,0,[
mini_line(83,12,3,0,0,0,[
str_block(0,83,12,3,0,-3,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,83,12,3,0,-3,0,0,0,0,0,
"Adobe-Japan1")])
]),
mini_line(114,12,3,0,0,0,[
str_block(0,114,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,114,12,3,0,-1,0,0,0,0,0,
"CID:7891 (vertical)")])
])
])]).
text('black',400,238,2,1,1,125,30,15,12,3,0,0,0,0,2,125,30,0,0,"",0,0,0,0,250,'',[
minilines(125,30,0,0,1,0,0,[
mini_line(83,12,3,0,0,0,[
str_block(0,83,12,3,0,-3,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,83,12,3,0,-3,0,0,0,0,0,
"Adobe-Japan1")])
]),
mini_line(125,12,3,0,0,0,[
str_block(0,125,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,125,12,3,0,-1,0,0,0,0,0,
"CID:2296 (Japanese)")])
])
])]).
text('black',400,318,2,1,1,115,30,16,12,3,0,0,0,0,2,115,30,0,0,"",0,0,0,0,330,'',[
minilines(115,30,0,0,1,0,0,[
mini_line(67,12,3,0,0,0,[
str_block(0,67,12,3,0,-3,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,67,12,3,0,-3,0,0,0,0,0,
"Adobe-GB1")])
]),
mini_line(115,12,3,0,0,0,[
str_block(0,115,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,115,12,3,0,-1,0,0,0,0,0,
"CID:3967 (Chinese)")])
])
])]).
text('black',200,84,2,1,1,116,38,20,16,3,0,0,0,0,2,116,38,0,0,"",0,0,0,0,100,'',[
minilines(116,38,0,0,1,0,0,[
mini_line(70,16,3,0,0,0,[
str_block(0,70,16,3,0,-1,0,0,0,[
str_seg('black','NewCenturySchlbk-Roman',0,97920,70,16,3,0,-1,0,0,0,0,0,
"Japanese")])
]),
mini_line(116,16,3,0,0,0,[
str_block(0,116,16,3,0,-1,0,0,0,[
str_seg('black','NewCenturySchlbk-Roman',0,97920,116,16,3,0,-1,0,0,0,0,0,
"long-vowel sign")])
])
])]).
oval('black','',30,70,280,140,0,1,1,49,0,0,0,0,0,'1',0,[
]).
oval('black','',30,260,280,330,0,1,1,51,0,0,0,0,0,'1',0,[
]).
text('black',200,274,2,1,1,85,38,53,16,3,0,0,0,0,2,85,38,0,0,"",0,0,0,0,290,'',[
minilines(85,38,0,0,1,0,0,[
mini_line(61,16,3,0,0,0,[
str_block(0,61,16,3,0,-1,0,0,0,[
str_seg('black','NewCenturySchlbk-Roman',0,97920,61,16,3,0,-1,0,0,0,0,0,
"Chinese")])
]),
mini_line(85,16,3,0,0,0,[
str_block(0,85,16,3,0,-1,0,0,0,[
str_seg('black','NewCenturySchlbk-Roman',0,97920,85,16,3,0,-1,0,0,0,0,0,
"letter \"sha\"")])
])
])]).
box('black','',330,30,560,80,0,1,1,57,0,0,0,0,0,'1',0,[
]).
box('black','',330,110,560,160,0,1,1,59,0,0,0,0,0,'1',0,[
]).
box('black','',330,230,560,280,0,1,1,60,0,0,0,0,0,'1',0,[
]).
box('black','',330,310,560,360,0,1,1,61,0,0,0,0,0,'1',0,[
]).
group([
poly('black','',4,[
506,246,501,235,541,235,536,246],0,2,1,68,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]),
poly('black','',5,[
519,238,516,252,529,252,524,275,516,272],0,2,1,69,0,0,0,0,0,0,0,'2',0,0,
"00","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]),
poly('black','',2,[
501,261,541,261],0,2,1,70,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]),
poly('black','',2,[
519,244,529,244],0,2,1,71,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
])
],
76,0,0,[
]).
group([
poly('black','',3,[
519,119,524,127,524,152],0,2,1,67,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
])
],
78,0,0,[
]).
group([
poly('black','',3,[
540,57,509,57,501,49],0,2,1,66,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
])
],
80,0,0,[
]).
group([
poly('black','',4,[
506,326,501,315,541,315,536,326],0,2,1,90,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]),
poly('black','',5,[
519,318,515,332,531,332,526,355,519,352],0,2,1,89,0,0,0,0,0,0,0,'2',0,0,
"00","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]),
poly('black','',2,[
501,341,526,341],0,2,1,88,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]),
poly('black','',2,[
519,324,529,324],0,2,1,87,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
])
],
134,0,0,[
]).
poly('black','',2,[
270,90,320,70],1,3,1,158,0,0,0,0,0,0,0,'3',0,0,
"0","",[
0,12,5,0,'12','5','0'],[0,12,5,0,'12','5','0'],[
]).
poly('black','',2,[
280,110,320,130],1,3,1,159,0,0,0,0,0,0,0,'3',0,0,
"0","",[
0,12,5,0,'12','5','0'],[0,12,5,0,'12','5','0'],[
]).
poly('black','',2,[
270,280,310,250],1,3,1,160,0,0,0,0,0,0,0,'3',0,0,
"0","",[
0,12,5,0,'12','5','0'],[0,12,5,0,'12','5','0'],[
]).
poly('black','',2,[
270,300,310,330],1,3,1,161,0,0,0,0,0,0,0,'3',0,0,
"0","",[
0,12,5,0,'12','5','0'],[0,12,5,0,'12','5','0'],[
]).
......@@ -19,7 +19,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sat Apr 24 04:30:10 UTC 2010
Last Modified: Sun Jun 13 04:20:47 UTC 2010
<!-- hhmts end -->
</div>
......@@ -29,7 +29,9 @@ Last Modified: Sat Apr 24 04:30:10 UTC 2010
<li> <a href="#install">Install</a>
&nbsp; <small>(<a href="#cmap">for CJK languages</a>)</small>
<li> <a href="#usage">How to Use</a>
&nbsp; <small>(<a href="#pdf2txt">pdf2txt.py</a>, <a href="#dumppdf">dumppdf.py</a>)</small>
&nbsp; <small>(<a href="#pdf2txt">pdf2txt.py</a>,
<a href="#dumppdf">dumppdf.py</a>,
<a href="programming.html">use as library</a>)</small>
<li> <a href="#techdocs">Technical Documents</a>
<li> <a href="#todos">TODOs</a>
<li> <a href="#changes">Changes</a>
......@@ -64,7 +66,8 @@ PDF parser that can be used for other purposes instead of text analysis.
<li> Reconstruct the original layout by grouping text chunks.
</ul>
<p>
On the performance, PDFMiner is about 20 times slower than
On the performance side,
PDFMiner is about 20 times slower than
other C/C++-based software such as XPdf.
<a name="source"></a>
......@@ -348,6 +351,13 @@ no stream header is displayed for the ease of saving it to a file.
<dd> Increases the debug level.
</dl>
<a name="library"></a>
<h3>Use as Library</h3>
<p>
PDFMiner can be used as a library by other Python programs.
<p>
For details, see the <a href="programming.html">Programming with PDFMiner</a> page.
<a name="techdocs"></a>
<hr noshade>
<h2>Technical Documents</h2>
......@@ -367,7 +377,7 @@ no stream header is displayed for the ease of saving it to a file.
<li> <A href="http://www.python.org/dev/peps/pep-0008/">PEP-8</a> and
<a href="http://www.python.org/dev/peps/pep-0257/">PEP-257</a> conformance.
<li> Better documentation.
<li> Better text extraction / layout analysis.
<li> Better text extraction / layout analysis. (writing mode detection, Type1 font file analysis, etc.)
<li> Robust error handling.
<li> Crypt stream filter support. (More sample documents are needed!)
<li> CCITTFax stream filter support.
......@@ -377,7 +387,8 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2010/04/24: Bugfixes and tiny improvements on TOC extraction. Thanks to Jose Maria.
<li> 2010/06/13: Bugfixes and improvements on CMap data compression. Thanks to Jakub Wilk.
<li> 2010/04/24: Bugfixes and improvements on TOC extraction. Thanks to Jose Maria.
<li> 2010/03/26: Bugfixes. Thanks to Brian Berry and Lubos Pintes.
<li> 2010/03/22: Improved layout analysis. Added regression tests.
<li> 2010/03/12: A couple of bugfixes. Thanks to Sean Manefield.
......
This diff is collapsed.
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<title>Mining PDF files</title>
<style type="text/css"><!--
blockquote { background: #eeeeee; }
--></style>
</head><body>
<h1>Mining PDF files</h1>
<p>
<p>
<a href="http://www.unixuser.org/~euske/python/pdfminer/index.html">Homepage</a>
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sat Nov 14 21:09:01 JST 2009
<!-- hhmts end -->
</div>
<h2>What is PDF?</h2>
<p>
<h3>What PDF is ...</h3>
<ul>
<li> A weird mixture of texts and binaries. (Yikes!)
<li> Generated sequentially, but needs random access to read.
</ul>
<h3>What PDF is not ...</h3>
<ul>
<li> Editable document format (like Word or HTML).
<li> Nice from an accessibility point of view.
</ul>
<h2>Structure of PDF</h2>
<p>
From a data structure's point of view, PDF is a total mess in
computer history. Originally, Adobe had a document format called
PostScript (which is more like a "graphics" format than a
text format). It has nice graphic representation and is able to
express commercial-quality typesetting. However, it has to be
generated for a specific printer, and its file size tends to get
bloated because almost everything is represented as text.
to create a less printer dependent format with a reduced data size
(that's why it was named "portable" document format). To some
degree, PDF can be seen as a "compressed" version of PostScript
with seekable index tables. Since its drawing model and concepts
(coordinates, color spaces, etc.) remain pretty much the same
as its predecessor's, Adobe decided to reuse the original PostScript
notation partially in PDF. However, this eclectic position ended
up with a disastrous situation.
<h3>Format Disaster</h3>
<p>
When designing a data format, there are two different strategies:
using text or using binary. They both have obvious merits and
demerits. The biggest merit of having textual representation is
that they are human readable and can be modified with any text
editor. One demerit of textual representation is its bloated size,
especially if you want to put something like pictures and
multimedia data like audio or video. Another demerit of textual
representation is that you need a program to serialize/deserialize
(parse) the data, which can be very complex and buggy. On the
other hand, binary representation normally doesn't require a
complex parser and takes much less space than texts. However,
they're not readable for humans. Now, Adobe decided to take the
good parts from both worlds by making PDF a partially text and
partially binary format, and as a result, PDF inherits the
drawbacks of both worlds without having much of their merits, i.e.
PDF is a human *unreadable* document format that still requires a
complex and error-prone parser and has a bloated file size.
<p>
Adobe has been probably aware of this problem from early on, and
they tried to fix this over the years. So they gradually dropped text
representations and inclined more toward binaries. For example,
in PDF specification 1.5, they introduce a new notation called
"object stream" (which is different from a "stream object" that
was already there in the specification).
However, by this time there were already tons of PDFs that had been
produced to the original standard, which every PDF viewer is still
required to support.
<h2>Problem of Text Extraction from PDF Documents</h2>
<p>
Many people tend to think that a PDF document is somewhat similar
to a Word or HTML document, which is not true. In fact, the primary
focus of PDF is printing and showing on a computer display, so
it is extremely versatile for showing the details of "looks"
of text typography, pictures and graphics. All the text in a PDF document is
just a bunch of string objects floating at various locations on a
blank slate. There is no text flow control and no contextual clue
about its content, except for a few special "tagged" PDF documents with
extra annotations that denote headlines or page boundaries, which
require specialized tools to create.
<p>
(OpenOffice, for example, has the ability to create tagged PDF
documents. But the degree of annotation varies depending
on the implementation, and in many cases it is not possible to
obtain the full layout information by using tags alone.)
<p>
Aside from tagged documents, PDF doesn't care about the order of text
strings rendered in a page. You can completely jumble up every
piece of string in a PDF and still make it look like a
perfect document on the surface. Even worse, PDF allows a word to
be split in the middle and drawn as multiple unrelated strings in
order to represent precise text positioning. For example, a
certain word processing software creates a PDF that splits a word
"You" into two separate strings "Y" and "ou" because of the subtle
kerning between the letters.
<p>
So there's a huge problem associated with extracting text properly
from PDF files. It requires kinds of analysis similar
to those used in optical character recognition (OCR).
<hr noshade>
<address>Yusuke Shinyama</address>
</body>
%TGIF 4.1.45-QPL
state(0,37,100.000,0,0,0,16,1,9,1,1,1,0,0,2,1,1,'Helvetica-Bold',1,69120,0,0,1,10,0,0,1,1,0,16,0,0,1,1,1,1,1050,1485,1,0,2880,0).
%
% @(#)$Header$
% %W%
%
unit("1 pixel/pixel").
color_info(19,65535,0,[
"magenta", 65535, 0, 65535, 65535, 0, 65535, 1,
"red", 65535, 0, 0, 65535, 0, 0, 1,
"green", 0, 65535, 0, 0, 65535, 0, 1,
"blue", 0, 0, 65535, 0, 0, 65535, 1,
"yellow", 65535, 65535, 0, 65535, 65535, 0, 1,
"pink", 65535, 49344, 52171, 65535, 49344, 52171, 1,
"cyan", 0, 65535, 65535, 0, 65535, 65535, 1,
"CadetBlue", 24415, 40606, 41120, 24415, 40606, 41120, 1,
"white", 65535, 65535, 65535, 65535, 65535, 65535, 1,
"black", 0, 0, 0, 0, 0, 0, 1,
"DarkSlateGray", 12079, 20303, 20303, 12079, 20303, 20303, 1,
"#00000000c000", 0, 0, 49344, 0, 0, 49152, 1,
"#820782070000", 33410, 33410, 0, 33287, 33287, 0, 1,
"#3cf3fbee34d2", 15420, 64507, 13364, 15603, 64494, 13522, 1,
"#3cf3fbed34d3", 15420, 64507, 13364, 15603, 64493, 13523, 1,
"#ffffa6990000", 65535, 42662, 0, 65535, 42649, 0, 1,
"#ffff0000fffe", 65535, 0, 65535, 65535, 0, 65534, 1,
"#fffe0000fffe", 65535, 0, 65535, 65534, 0, 65534, 1,
"#fffe00000000", 65535, 0, 0, 65534, 0, 0, 1
]).
script_frac("0.6").
fg_bg_colors('black','white').
dont_reencode("FFDingbests:ZapfDingbats").
objshadow_info('#c0c0c0',2,2).
page(1,"",1,'').
oval('black','',350,380,450,430,2,2,1,88,0,0,0,0,0,'2',0,[
]).
poly('black','',2,[
270,270,350,230],1,2,1,54,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]).
poly('black','',2,[
270,280,350,320],1,2,1,55,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]).
box('black','',350,100,450,150,2,2,1,2,0,0,0,0,0,'2',0,[
]).
text('black',400,118,1,1,1,84,15,3,12,3,0,0,0,0,2,84,15,0,0,"",0,0,0,0,130,'',[
minilines(84,15,0,0,1,0,0,[
mini_line(84,12,3,0,0,0,[
str_block(0,84,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,84,12,3,0,0,0,0,0,0,0,
"PDFDocument")])
])
])]).
box('black','',150,100,250,150,2,2,1,13,0,0,0,0,0,'2',0,[
]).
text('black',200,118,1,1,1,63,15,14,12,3,0,0,0,0,2,63,15,0,0,"",0,0,0,0,130,'',[
minilines(63,15,0,0,1,0,0,[
mini_line(63,12,3,0,0,0,[
str_block(0,63,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,63,12,3,0,0,0,0,0,0,0,
"PDFParser")])
])
])]).
box('black','',350,200,450,250,2,2,1,20,0,0,0,0,0,'2',0,[
]).
text('black',400,218,1,1,1,88,15,21,12,3,0,0,0,0,2,88,15,0,0,"",0,0,0,0,230,'',[
minilines(88,15,0,0,1,0,0,[
mini_line(88,12,3,0,0,0,[
str_block(0,88,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,88,12,3,0,0,0,0,0,0,0,
"PDFInterpreter")])
])
])]).
box('black','',350,300,450,350,2,2,1,23,0,0,0,0,0,'2',0,[
]).
text('black',400,318,1,1,1,65,15,24,12,3,0,0,0,0,2,65,15,0,0,"",0,0,0,0,330,'',[
minilines(65,15,0,0,1,0,0,[
mini_line(65,12,3,0,0,0,[
str_block(0,65,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,65,12,3,0,-1,0,0,0,0,0,
"PDFDevice")])
])
])]).
box('black','',180,250,280,300,2,2,1,29,0,0,0,0,0,'2',0,[
]).
text('black',230,268,1,1,1,131,15,30,12,3,2,0,0,0,2,131,15,0,0,"",0,0,0,0,280,'',[
minilines(131,15,0,0,1,0,0,[
mini_line(131,12,3,0,0,0,[
str_block(0,131,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,131,12,3,0,0,0,0,0,0,0,
"PDFResourceManager")])
])
])]).
poly('black','',2,[
250,140,350,140],1,2,1,45,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]).
poly('black','',2,[
350,110,250,110],1,2,1,46,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]).
poly('black','',2,[
400,150,400,200],1,2,1,47,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]).
poly('black','',2,[
400,250,400,300],1,2,1,56,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]).
poly('black','',2,[
400,350,400,380],0,2,1,65,0,0,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]).
text('black',400,388,3,1,1,44,41,71,12,3,0,-2,0,0,2,44,41,0,0,"",0,0,0,0,400,'',[
minilines(44,41,0,0,1,-2,0,[
mini_line(44,12,3,0,0,0,[
str_block(0,44,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,44,12,3,0,-1,0,0,0,0,0,
"Display")])
]),
mini_line(20,12,3,0,0,0,[
str_block(0,20,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,20,12,3,0,-1,0,0,0,0,0,
"File")])
]),
mini_line(23,12,3,0,0,0,[
str_block(0,23,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,23,12,3,0,-1,0,0,0,0,0,
"etc.")])
])
])]).
text('black',300,88,1,1,1,92,15,79,12,3,0,0,0,0,2,92,15,0,0,"",0,0,0,0,100,'',[
minilines(92,15,0,0,1,0,0,[
mini_line(92,12,3,0,0,0,[
str_block(0,92,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,92,12,3,0,-1,0,0,0,0,0,
"request objects")])
])
])]).
text('black',300,148,1,1,1,78,15,84,12,3,0,0,0,0,2,78,15,0,0,"",0,0,0,0,160,'',[
minilines(78,15,0,0,1,0,0,[
mini_line(78,12,3,0,0,0,[
str_block(0,78,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,78,12,3,0,-1,0,0,0,0,0,
"store objects")])
])
])]).
oval('black','',20,100,120,150,2,2,1,106,0,0,0,0,0,'2',0,[
]).
text('black',70,118,1,1,1,46,15,107,12,3,0,0,0,0,2,46,15,0,0,"",0,0,0,0,130,'',[
minilines(46,15,0,0,1,0,0,[
mini_line(46,12,3,0,0,0,[
str_block(0,46,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,46,12,3,0,-1,0,0,0,0,0,
"PDF file")])
])
])]).
poly('black','',2,[
120,120,150,120],0,2,1,114,0,2,0,0,0,0,0,'2',0,0,
"0","",[
0,10,4,0,'10','4','0'],[0,10,4,0,'10','4','0'],[
]).
text('black',400,158,1,1,1,68,15,115,12,3,2,0,0,0,2,68,15,0,0,"",0,0,0,0,170,'',[
minilines(68,15,0,0,1,0,0,[
mini_line(68,12,3,0,0,0,[
str_block(0,68,12,3,0,0,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,68,12,3,0,0,0,0,0,0,0,
"page object")])
])
])]).
text('black',400,258,1,1,1,115,15,119,12,3,2,0,0,0,2,115,15,0,0,"",0,0,0,0,270,'',[
minilines(115,15,0,0,1,0,0,[
mini_line(115,12,3,0,0,0,[
str_block(0,115,12,3,0,-1,0,0,0,[
str_seg('black','Helvetica-Bold',1,69120,115,12,3,0,-1,0,0,0,0,0,
"rendering sequence")])
])
])]).