Commit bdc59684 authored by Stefano Rivera's avatar Stefano Rivera

Imported Upstream version 20140328+dfsg

parent d8689de3
......@@ -7,6 +7,7 @@ PYTHON=python2
GIT=git
RM=rm -f
CP=cp -f
MKDIR=mkdir
all:
......@@ -26,7 +27,7 @@ sdist: distclean MANIFEST.in
register: distclean MANIFEST.in
$(PYTHON) setup.py sdist upload register
WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
WEBDIR=../euske.github.io/$(PACKAGE)
publish:
$(CP) docs/*.html docs/*.png docs/*.css $(WEBDIR)
......@@ -36,15 +37,21 @@ CMAPDST=pdfminer/cmap
cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
cmap_clean:
cd $(CMAPDST) && make cmap_clean
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz:
$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
-$(RM) -r $(CMAPDST)
$(CMAPDST):
$(MKDIR) $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST)
$(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST)
$(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST)
$(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
$(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
test: cmap
cd samples && $(MAKE) test
......
Metadata-Version: 1.0
Metadata-Version: 1.1
Name: pdfminer
Version: 20110515
Version: 20140328
Summary: PDF parser and analyzer
Home-page: http://www.unixuser.org/~euske/python/pdfminer/index.html
Home-page: http://euske.github.io/pdfminer/index.html
Author: Yusuke Shinyama
Author-email: yusuke at cs dot nyu dot edu
License: MIT/X
Description: PDFMiner is a tool for extracting information from PDF documents.
Unlike other PDF-related tools, it focuses entirely on getting
Unlike other PDF-related tools, it focuses entirely on getting
and analyzing text data. PDFMiner allows one to obtain
the exact location of texts in a page, as well as
the exact location of texts in a page, as well as
other information such as fonts or lines.
It includes a PDF converter that can transform PDF files
into other text formats (such as HTML). It has an extensible
......
See docs/index.html
......@@ -9,7 +9,7 @@
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sat May 14 16:33:16 UTC 2011
Last Modified: Fri Mar 28 09:17:06 UTC 2014
<!-- hhmts end -->
</div>
......@@ -139,7 +139,7 @@ In order to process CJK languages, you need an additional step to take
during installation:
<blockquote><pre>
# <strong>make cmap</strong>
python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt cp950 big5
python tools/conv_cmap.py pdfminer/cmap Adobe-CNS1 cmaprsrc/cid2code_Adobe_CNS1.txt
reading 'cmaprsrc/cid2code_Adobe_CNS1.txt'...
writing 'CNS1_H.py'...
...
......@@ -152,10 +152,11 @@ writing 'CNS1_H.py'...
On Windows machines which don't have <code>make</code> command,
paste the following commands on a command line prompt:
<blockquote><pre>
<strong>python tools\conv_cmap.py pdfminer\cmap Adobe-CNS1 cmaprsrc\cid2code_Adobe_CNS1.txt cp950 big5</strong>
<strong>python tools\conv_cmap.py pdfminer\cmap Adobe-GB1 cmaprsrc\cid2code_Adobe_GB1.txt cp936 gb2312</strong>
<strong>python tools\conv_cmap.py pdfminer\cmap Adobe-Japan1 cmaprsrc\cid2code_Adobe_Japan1.txt cp932 euc-jp</strong>
<strong>python tools\conv_cmap.py pdfminer\cmap Adobe-Korea1 cmaprsrc\cid2code_Adobe_Korea1.txt cp949 euc-kr</strong>
<strong>mkdir pdfminer\cmap</strong>
<strong>python tools\conv_cmap.py -c B5=cp950 -c UniCNS-UTF8=utf-8 pdfminer\cmap Adobe-CNS1 cmaprsrc\cid2code_Adobe_CNS1.txt</strong>
<strong>python tools\conv_cmap.py -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 pdfminer\cmap Adobe-GB1 cmaprsrc\cid2code_Adobe_GB1.txt</strong>
<strong>python tools\conv_cmap.py -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 pdfminer\cmap Adobe-Japan1 cmaprsrc\cid2code_Adobe_Japan1.txt</strong>
<strong>python tools\conv_cmap.py -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 pdfminer\cmap Adobe-Korea1 cmaprsrc\cid2code_Adobe_Korea1.txt</strong>
<strong>python setup.py install</strong>
</pre></blockquote>
......@@ -263,6 +264,12 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
<td style="border-top:1px blue solid" align=right>&uarr;</td>
</tr></table>
<p>
<dt> <code>-F <em>boxes_flow</em></code>
<dd> Specifies how much the horizontal and vertical positions of a text matter
when determining the text order. The value should be within the range of
-1.0 (only horizontal position matters) to +1.0 (only vertical position matters).
The default value is 0.5.
<p>
<dt> <code>-C</code>
<dd> Suppress object caching.
This will reduce memory consumption but will also slow down the process.
......@@ -285,6 +292,9 @@ including text contained in figures.
<li> <code>loose</code> : preserve the overall location of each text block.
</ul>
<p>
<dt> <code>-E <em>extractdir</em></code>
<dd> Specifies the extraction directory of embedded files.
<p>
<dt> <code>-s <em>scale</em></code>
<dd> Specifies the output scale. Can be used in HTML format only.
<p>
......@@ -352,6 +362,9 @@ no stream header is displayed for the ease of saving it to a file.
<dt> <code>-T</code>
<dd> Shows the table of contents.
<p>
<dt> <code>-E <em>directory</em></code>
<dd> Extracts embedded files from the PDF into the given directory.
<p>
<dt> <code>-P <em>password</em></code>
<dd> Provides the user password to access PDF contents.
<p>
......@@ -361,12 +374,32 @@ no stream header is displayed for the ease of saving it to a file.
<h2><a name="changes">Changes</a></h2>
<ul>
<li> 2010/05/15: Speed improvements for layout analysis.
<li> 2010/05/15: API changes. <code>LTText.get_text()</code> is added.
<li> 2010/04/20: API changes. LTPolygon class was renamed as LTCurve.
<li> 2010/04/20: LTLine now represents horizontal/vertical lines only. Thanks to Koji Nakagawa.
<li> 2010/03/07: Documentation improvements by Jakub Wilk. Memory usage patch by Jonathan Hunt.
<li> 2010/02/27: Bugfixes and layout analysis improvements. Thanks to fujimoto.report.
<li> 2014/03/28: Further bugfixes.
<li> 2014/03/24: Bugfixes and improvements for faulty PDFs.<br>
API changes:
<ul>
<li> <code>PDFDocument.initialize()</code> method is removed and no longer needed.
A password is given as an argument of a PDFDocument constructor.
</ul>
<li> 2013/11/13: Bugfixes and minor improvements.<br>
As of November 2013, a few changes have been made to the PDFMiner API
as it existed prior to October 2013. These are the result of code restructuring.
Here is a list of the changes:
<ul>
<li> <code>PDFDocument</code> class is moved to <code>pdfdocument.py</code>.
<li> <code>PDFDocument</code> class now takes a <code>PDFParser</code> object as an argument.
<li> <code>PDFDocument.set_parser()</code> and <code>PDFParser.set_document()</code> are removed.
<li> <code>PDFPage</code> class is moved to <code>pdfpage.py</code>.
<li> <code>process_pdf</code> function is implemented as <code>PDFPage.get_pages</code>.
</ul>
<li> 2013/10/22: Sudden resurgence of interest. API changes.
Incorporated a lot of patches and robust handling of broken PDFs.
<li> 2011/05/15: Speed improvements for layout analysis.
<li> 2011/05/15: API changes. <code>LTText.get_text()</code> is added.
<li> 2011/04/20: API changes. LTPolygon class was renamed as LTCurve.
<li> 2011/04/20: LTLine now represents horizontal/vertical lines only. Thanks to Koji Nakagawa.
<li> 2011/03/07: Documentation improvements by Jakub Wilk. Memory usage patch by Jonathan Hunt.
<li> 2011/02/27: Bugfixes and layout analysis improvements. Thanks to fujimoto.report.
<li> 2010/12/26: A couple of bugfixes and minor improvements. Thanks to Kevin Brubeck Unhammer and Daniel Gerber.
<li> 2010/10/17: A couple of bugfixes and minor improvements. Thanks to standardabweichung and Alastair Irving.
<li> 2010/09/07: A minor bugfix. Thanks to Alexander Garden.
......@@ -423,9 +456,7 @@ no stream header is displayed for the ease of saving it to a file.
<a href="http://www.python.org/dev/peps/pep-0257/">PEP-257</a> conformance.
<li> Better documentation.
<li> Better text extraction / layout analysis. (writing mode detection, Type1 font file analysis, etc.)
<li> Robust error handling.
<li> Crypt stream filter support. (More sample documents are needed!)
<li> CCITTFax stream filter support.
</ul>
<h2><a name="related">Related Projects</a></h2>
......@@ -441,7 +472,7 @@ no stream header is displayed for the ease of saving it to a file.
(This is so-called MIT/X License)
<p>
<small>
Copyright (c) 2004-2010 Yusuke Shinyama &lt;yusuke at cs dot nyu dot edu&gt;
Copyright (c) 2004-2013 Yusuke Shinyama &lt;yusuke at cs dot nyu dot edu&gt;
<p>
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
......
......@@ -9,7 +9,7 @@
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sat May 14 16:36:12 UTC 2011
Last Modified: Mon Mar 24 11:49:28 UTC 2014
<!-- hhmts end -->
</div>
......@@ -23,9 +23,9 @@ from other applications.
<ul>
<li> <a href="#overview">Overview</a>
<li> <a href="#basic">Basic Usage</a>
<li> <a href="#layout">Layout Analysis</a>
<li> <a href="#tocextract">TOC Extraction</a>
<li> <a href="#extend">Parser Extension</a>
<li> <a href="#layout">Performing Layout Analysis</a>
<li> <a href="#tocextract">Obtaining Table of Contents</a>
<li> <a href="#extend">Extending Functionality</a>
</ul>
<h2><a name="overview">Overview</a></h2>
......@@ -75,8 +75,12 @@ Figure 1 shows the relationship between the classes in PDFMiner.
<p>
A typical way to parse a PDF file is the following:
<blockquote><pre>
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
<span class="comment"># Open a PDF file.</span>
......@@ -84,15 +88,10 @@ fp = open('mypdf.pdf', 'rb')
<span class="comment"># Create a PDF parser object associated with the file object.</span>
parser = PDFParser(fp)
<span class="comment"># Create a PDF document object that stores the document structure.</span>
doc = PDFDocument()
<span class="comment"># Connect the parser and document objects.</span>
parser.set_document(doc)
doc.set_parser(parser)
<span class="comment"># Supply the password for initialization.</span>
<span class="comment"># (If no password is set, give an empty string.)</span>
doc.initialize(password)
document = PDFDocument(parser, password)
<span class="comment"># Check if the document allows text extraction. If not, abort.</span>
if not doc.is_extractable:
if not document.is_extractable:
raise PDFTextExtractionNotAllowed
<span class="comment"># Create a PDF resource manager object that stores shared resources.</span>
rsrcmgr = PDFResourceManager()
......@@ -101,11 +100,11 @@ device = PDFDevice(rsrcmgr)
<span class="comment"># Create a PDF interpreter object.</span>
interpreter = PDFPageInterpreter(rsrcmgr, device)
<span class="comment"># Process each page contained in the document.</span>
for page in doc.get_pages():
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
</pre></blockquote>
<h2><a name="layout">Accessing Layout Objects</a></h2>
<h2><a name="layout">Performing Layout Analysis</a></h2>
<p>
Here is a typical way to use the layout analysis function:
<blockquote><pre>
......@@ -117,15 +116,15 @@ laparams = LAParams()
<span class="comment"># Create a PDF page aggregator object.</span>
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in doc.get_pages():
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
<span class="comment"># receive the LTPage object for the page.</span>
layout = device.get_result()
</pre></blockquote>
The layout analyzer gives a "<code>LTPage</code>" object for each page
in the PDF document. The object contains child objects within the page,
forming a tree-like structure. Figure 2 shows the relationship between
A layout analyzer returns an <code>LTPage</code> object for each page
in the PDF document. This object contains child objects within the page,
forming a tree structure. Figure 2 shows the relationship between
these objects.
<div align=center>
......@@ -153,10 +152,10 @@ or vertically, depending on the text's writing mode.
<code>get_text()</code> method returns the text content.
<dt> <code>LTChar</code>
<dt> <code>LTAnon</code>
<dt> <code>LTAnno</code>
<dd> Represent an actual letter in the text as a Unicode string.
Note that, while a <code>LTChar</code> object has actual boundaries,
<code>LTAnon</code> objects does not, as these are "virtual" characters,
<code>LTAnno</code> objects do not, as these are "virtual" characters,
inserted by a layout analyzer according to the relationship between two characters
(e.g. a space).
......@@ -179,29 +178,28 @@ Could be used for separating text or figures.
Could be used for framing other pictures or figures.
<dt> <code>LTCurve</code>
<dd> Represents a generic bezier curve.
<dd> Represents a generic Bezier curve.
</dl>
<p>
Also, check out <a href="http://denis.papathanasiou.org/?p=343">a more complete example by Denis Papathanasiou</a>.
<h2><a name="tocextract">TOC Extraction</a></h2>
<h2><a name="tocextract">Obtaining Table of Contents</a></h2>
<p>
PDFMiner provides functions to access the document's table of contents
("Outlines").
<blockquote><pre>
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
<span class="comment"># Open a PDF document.</span>
fp = open('mypdf.pdf', 'rb')
parser = PDFParser(fp)
doc = PDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize(password)
document = PDFDocument(parser, password)
<span class="comment"># Get the outlines of the document.</span>
outlines = doc.get_outlines()
outlines = document.get_outlines()
for (level,title,dest,a,se) in outlines:
print (level, title)
</pre></blockquote>
......@@ -209,12 +207,12 @@ for (level,title,dest,a,se) in outlines:
<p>
Some PDF documents use page numbers as destinations, while others
use page numbers and the physical location within the page. Since
PDF does not have a logical strucutre, and it does not provide a
PDF does not have a logical structure, and it does not provide a
way to refer to any in-page object from the outside, there's no
way to tell exactly which part of text these destinations are
refering to.
referring to.
<h2><a name="extend">Parser Extension</a></h2>
<h2><a name="extend">Extending Functionality</a></h2>
<p>
You can extend <code>PDFPageInterpreter</code> and <code>PDFDevice</code> class
......
blockquote { background: #eeeeee; }
h1 { border-bottom: solid black 2px; }
h2 { border-bottom: solid black 1px; }
.comment { color: darkgreen; }
#!/usr/bin/env python2
__version__ = '20110515'
#!/usr/bin/env python
__version__ = '20140328'
if __name__ == '__main__': print __version__
if __name__ == '__main__':
print __version__
#!/usr/bin/env python2
#!/usr/bin/env python
""" Python implementation of Arcfour encryption algorithm.
......@@ -6,6 +6,7 @@ This code is in the public domain.
"""
## Arcfour
##
class Arcfour(object):
......
#!/usr/bin/env python2
#!/usr/bin/env python
""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
......@@ -9,6 +9,7 @@ This code is in the public domain.
import re
import struct
# ascii85decode(data)
def ascii85decode(data):
"""
......@@ -16,13 +17,13 @@ def ascii85decode(data):
letters, using 85 different types of characters (as 256**4 < 85**5).
When the length of the original bytes is not a multiple of 4, a special
rule is used for round up.
The Adobe's ASCII85 implementation is slightly different from
its original in handling the last characters.
The sample string is taken from:
http://en.wikipedia.org/w/index.php?title=Ascii85
>>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q')
'Man is distinguished'
>>> ascii85decode('E,9)oF*2M7/c~>')
......@@ -35,7 +36,7 @@ def ascii85decode(data):
n += 1
b = b*85+(ord(c)-33)
if n == 5:
out += struct.pack('>L',b)
out += struct.pack('>L', b)
n = b = 0
elif c == 'z':
assert n == 0
......@@ -44,13 +45,15 @@ def ascii85decode(data):
if n:
for _ in range(5-n):
b = b*85+84
out += struct.pack('>L',b)[:n-1]
out += struct.pack('>L', b)[:n-1]
break
return out
# asciihexdecode(data)
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
def asciihexdecode(data):
"""
ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
......@@ -60,7 +63,7 @@ def asciihexdecode(data):
EOD. Any other characters will cause an error. If the filter encounters
the EOD marker after reading an odd number of hexadecimal digits, it
will behave as if a 0 followed the last digit.
>>> asciihexdecode('61 62 2e6364 65')
'ab.cde'
>>> asciihexdecode('61 62 2e6364 657>')
......
This diff is collapsed.
# Makefile for pdfminer.cmap
all:
clean:
-rm *.pyc *.pyo
cmap_clean:
rm -f *.pickle.gz
#!/usr/bin/env python2
#!/usr/bin/env python
""" Adobe character mapping (CMap) support.
......@@ -12,22 +12,21 @@ More information is available on the Adobe website:
"""
import sys
import re
import os
import os.path
import gzip
import cPickle as pickle
import cmap
import struct
from psparser import PSStackParser
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
from psparser import PSLiteral, PSKeyword
from psparser import literal_name, keyword_name
from psparser import PSSyntaxError, PSEOF
from psparser import PSLiteral
from psparser import literal_name
from encodingdb import name2unicode
from utils import choplist, nunpack
class CMapError(Exception): pass
class CMapError(Exception):
pass
## CMap
......@@ -45,8 +44,9 @@ class CMap(object):
def use_cmap(self, cmap):
assert isinstance(cmap, CMap)
def copy(dst, src):
for (k,v) in src.iteritems():
for (k, v) in src.iteritems():
if isinstance(v, dict):
d = {}
dst[k] = d
......@@ -75,14 +75,14 @@ class CMap(object):
if code2cid is None:
code2cid = self.code2cid
code = ()
for (k,v) in sorted(code2cid.iteritems()):
for (k, v) in sorted(code2cid.iteritems()):
c = code+(k,)
if isinstance(v, int):
out.write('code %r = cid %d\n' % (c,v))
out.write('code %r = cid %d\n' % (c, v))
else:
self.dump(out=out, code2cid=v, code=c)
return
## IdentityCMap
##
......@@ -96,13 +96,12 @@ class IdentityCMap(object):
return self.vertical
def decode(self, code):
n = len(code)/2
n = len(code)//2
if n:
return struct.unpack('>%dH' % n, code)
else:
return ()
## UnicodeMap
##
......@@ -120,8 +119,8 @@ class UnicodeMap(object):
return self.cid2unichr[cid]
def dump(self, out=sys.stdout):
for (k,v) in sorted(self.cid2unichr.iteritems()):
out.write('cid %d = unicode %r\n' % (k,v))
for (k, v) in sorted(self.cid2unichr.iteritems()):
out.write('cid %d = unicode %r\n' % (k, v))
return
......@@ -154,7 +153,7 @@ class FileCMap(CMap):
else:
t = {}
d[c] = t
d =t
d = t
c = ord(code[-1])
d[c] = cid
return
......@@ -163,7 +162,7 @@ class FileCMap(CMap):
## FileUnicodeMap
##
class FileUnicodeMap(UnicodeMap):
def __init__(self):
UnicodeMap.__init__(self)
self.attrs = {}
......@@ -206,12 +205,12 @@ class PyCMap(CMap):
def is_vertical(self):
return self._is_vertical
## PyUnicodeMap
##
class PyUnicodeMap(UnicodeMap):
def __init__(self, name, module, vertical):
if vertical:
cid2unichr = module.CID2UNICHR_V
......@@ -232,16 +231,18 @@ class CMapDB(object):
debug = 0
_cmap_cache = {}
_umap_cache = {}
class CMapNotFound(CMapError): pass
class CMapNotFound(CMapError):
pass
@classmethod
def _load_data(klass, name):
filename = '%s.pickle.gz' % name
if klass.debug:
print >>sys.stderr, 'loading:', name
default_path = os.environ.get('CMAP_PATH', '/usr/share/pdfminer/')
for directory in (os.path.dirname(cmap.__file__), default_path):
cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
os.path.join(os.path.dirname(__file__), 'cmap'),)
for directory in cmap_paths:
path = os.path.join(directory, filename)
if os.path.exists(path):
gzfile = gzip.open(path)
......@@ -284,7 +285,8 @@ class CMapParser(PSStackParser):
def __init__(self, cmap, fp):
PSStackParser.__init__(self, fp)
self.cmap = cmap
self._in_cmap = False
# some ToUnicode maps don't have "begincmap" keyword.
self._in_cmap = True
return
def run(self):
......@@ -303,11 +305,12 @@ class CMapParser(PSStackParser):
elif name == 'endcmap':
self._in_cmap = False
return
if not self._in_cmap: return
if not self._in_cmap:
return
#
if name == 'def':
try:
((_,k),(_,v)) = self.pop(2)
((_, k), (_, v)) = self.pop(2)
self.cmap.set_attr(literal_name(k), v)
except PSSyntaxError:
pass
......@@ -315,7 +318,7 @@ class CMapParser(PSStackParser):
if name == 'usecmap':
try:
((_,cmapname),) = self.pop(1)
((_, cmapname),) = self.pop(1)
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
except PSSyntaxError:
pass
......@@ -334,13 +337,15 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endcidrange':
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,cid) in choplist(3, objs):
objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
not isinstance(cid, int) or len(s) != len(e)): continue
not isinstance(cid, int) or len(s) != len(e)):
continue
sprefix = s[:-4]
eprefix = e[:-4]
if sprefix != eprefix: continue
if sprefix != eprefix:
continue
svar = s[-4:]
evar = e[-4:]
s1 = nunpack(svar)
......@@ -348,7 +353,7 @@ class CMapParser(PSStackParser):
vlen = len(svar)
#assert s1 <= e1
for i in xrange(e1-s1+1):
x = sprefix+struct.pack('>L',s1+i)[-vlen:]
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i)
return
......@@ -356,8 +361,8 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endcidchar':
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str):
self.cmap.add_code2cid(code, nunpack(cid))
return
......@@ -366,10 +371,11 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endbfrange':
objs = [ obj for (_,obj) in self.popall() ]
for (s,e,code) in choplist(3, objs):
objs = [obj for (__, obj) in self.popall()]
for (s, e, code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
len(s) != len(e)): continue
len(s) != len(e)):
continue
s1 = nunpack(s)
e1 = nunpack(e)
#assert s1 <= e1
......@@ -382,7 +388,7 @@ class CMapParser(PSStackParser):
prefix = code[:-4]
vlen = len(var)
for i in xrange(e1-s1+1):
x = prefix+struct.pack('>L',base+i)[-vlen:]
x = prefix+struct.pack('>L', base+i)[-vlen:]
self.cmap.add_cid2unichr(s1+i, x)
return
......@@ -390,8 +396,8 @@ class CMapParser(PSStackParser):
self.popall()
return
if name == 'endbfchar':
objs = [ obj for (_,obj) in self.popall() ]
for (cid,code) in choplist(2, objs):
objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str):
self.cmap.add_cid2unichr(nunpack(cid), code)
return
......@@ -406,6 +412,7 @@ class CMapParser(PSStackParser):
self.push((pos, token))
return
# test
def main(argv):
args = argv[1:]
......@@ -418,4 +425,5 @@ def main(argv):
cmap.dump()
return
if __name__ == '__main__': sys.exit(main(sys.argv))
if __name__ == '__main__':
sys.exit(main(sys.argv))
This diff is collapsed.
#!/usr/bin/env python2
#!/usr/bin/env python
import re
from psparser import PSLiteral
......@@ -6,15 +6,18 @@ from glyphlist import glyphname2unicode
from latin_enc import ENCODING
STRIP_NAME = re.compile(r'[0-9]+')
## name2unicode
##
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
"""Converts Adobe glyph names to Unicode numbers."""
if name in glyphname2unicode:
return glyphname2unicode[name]
m = STRIP_NAME.search(name)
if not m: raise KeyError(name)
if not m:
raise KeyError(name)
return unichr(int(m.group(0)))
......@@ -26,19 +29,23 @@ class EncodingDB(object):
mac2unicode = {}
win2unicode = {}
pdf2unicode = {}
for (name,std,mac,win,pdf) in ENCODING:
for (name, std, mac, win, pdf) in ENCODING: