#!/usr/bin/env python -O
#
# pdf2html.cgi - Gateway script for converting PDF into HTML.
#
# Security consideration for public access:
#
#   Limit the process size and/or maximum cpu time.
#   The process should be chrooted.
#   A disk quota should be imposed on the user running the script.
#
# How to Setup:
#   $ mkdir $CGIDIR
#   $ mkdir $CGIDIR/var
#   $ python setup.py install_lib --install-dir=$CGIDIR
#   $ cp pdfminer/tools/pdf2html.cgi $CGIDIR
#

import cgi
import logging
import os
import os.path
import random
import re
import sys
import time
import traceback

# Debugging aid: keep the next line commented out at runtime.
#import cgitb; cgitb.enable()

import pdfminer
from pdfminer.converter import HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage


# quote HTML metacharacters
def q(x):
    """Return x with the HTML metacharacters &, >, < and " replaced by entities."""
    # '&' is handled first so that the entities produced by the later
    # substitutions are not escaped a second time.
    substitutions = (
        ('&', '&amp;'),
        ('>', '&gt;'),
        ('<', '&lt;'),
        ('"', '&quot;'),
    )
    for (char, entity) in substitutions:
        x = x.replace(char, entity)
    return x

# encode parameters as a URL
# Matches every character that must be percent-encoded in a parameter value.
# The '-' is placed last in the class so it is a literal hyphen; written as
# '.-=' it forms a range from '.' to '=' that lets '/', ':', ';' and '<'
# through unescaped.
Q = re.compile(r'[^a-zA-Z0-9_.=-]')
def url(base, **kw):
    """Return base followed by the keyword arguments encoded as a query string.

    Each value is encoded as UTF-8 and every byte outside the set
    [a-zA-Z0-9_.=-] is replaced by %XX.  Parameters are joined with '&'.
    Keys are emitted verbatim.

    The value is percent-encoded as-is, without HTML-escaping it first:
    escaping first would make '<' arrive at the server as the text '&lt;'.
    All HTML metacharacters are outside the safe set, so the encoded value
    contains none of them.
    """
    r = []
    for (k, v) in kw.items():
        # Decode the UTF-8 bytes as latin-1 so that every byte becomes
        # exactly one character whose ordinal is the byte value.
        octets = v.encode('utf-8', 'replace').decode('latin-1')
        v = Q.sub(lambda m: '%%%02X' % ord(m.group(0)), octets)
        r.append('%s=%s' % (k, v))
    return base + '&'.join(r)


##  convert
##
class FileSizeExceeded(ValueError):
    """Raised by convert() when the input is larger than maxfilesize bytes."""
    pass


def convert(infp, outfp, path, codec='utf-8',
            maxpages=0, maxfilesize=0, pagenos=None,
            html=True):
    """Save an uploaded PDF to path, then write its conversion to outfp.

    infp        -- readable binary file object holding the PDF; it is read
                   in 4096-byte chunks and closed once fully saved.
    outfp       -- file object handed to the pdfminer converter as output.
    path        -- filesystem path the upload is saved to and re-read from.
                   The file is not removed here; that is left to the caller.
    codec       -- output codec passed to the converter.
    maxpages    -- passed to PDFPage.get_pages() as maxpages.
    maxfilesize -- upper limit for the input in bytes; zero disables the check.
    pagenos     -- passed to PDFPage.get_pages() as the page selection.
    html        -- if true use HTMLConverter (layoutmode='exact'),
                   otherwise TextConverter.

    Raises FileSizeExceeded(maxfilesize) as soon as more than maxfilesize
    bytes have been read from infp.
    """
    # save the input file.
    # The with-block closes the saved file even when the size limit aborts
    # the copy, so the caller can delete it right away.
    nbytes = 0
    with open(path, 'wb') as src:
        while True:
            data = infp.read(4096)
            if not data:
                break
            nbytes += len(data)
            if maxfilesize and maxfilesize < nbytes:
                raise FileSizeExceeded(maxfilesize)
            src.write(data)
    infp.close()
    # perform conversion and
    # send the results over the network.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    if html:
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               layoutmode='exact')
    else:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The with-block releases the PDF handle even if a page fails to process.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages):
            interpreter.process_page(page)
    # Closed only on success, as before: the device is left untouched when
    # processing fails so the caller's error message is the last thing written.
    device.close()
    return


##  WebApp
##
class WebApp(object):
    """CGI front end that accepts a PDF upload and streams back HTML or text.

    Intended use is ``app = WebApp(); app.setup(); app.run()``: setup()
    selects the handler for the request and run() writes the complete
    response (headers and body) to outfp.
    """

    TITLE = 'pdf2html demo'
    MAXFILESIZE = 10000000             # set to zero if unlimited.
    MAXPAGES = 100                     # set to zero if unlimited.

    def __init__(self, infp=sys.stdin, outfp=sys.stdout, environ=os.environ,
                 codec='utf-8', apppath='/'):
        """Store the I/O streams and read the request details from environ.

        infp    -- stream the request body is read from.
        outfp   -- stream the response is written to.
        environ -- mapping of CGI variables (REMOTE_ADDR, PATH_INFO,
                   REQUEST_METHOD, SERVER_SOFTWARE, TEMP).
        codec   -- charset of the response and of the converted output.
        apppath -- PATH_INFO value that is served by this application.
        """
        self.infp = infp
        self.outfp = outfp
        self.environ = environ
        self.codec = codec
        self.apppath = apppath
        self.remote_addr = self.environ.get('REMOTE_ADDR')
        self.path_info = self.environ.get('PATH_INFO')
        self.method = self.environ.get('REQUEST_METHOD', 'GET').upper()
        self.server = self.environ.get('SERVER_SOFTWARE', '')
        # directory where uploads are stored while they are converted.
        self.tmpdir = self.environ.get('TEMP', './var/')
        self.content_type = 'text/html; charset=%s' % codec
        self.logger = logging.getLogger()

    def put(self, *args):
        """Write each argument to outfp.

        str objects are written as they are, unicode objects are encoded
        with self.codec (unencodable characters become XML character
        references).  Arguments of any other type are ignored.
        """
        for x in args:
            if isinstance(x, str):
                self.outfp.write(x)
            elif isinstance(x, unicode):
                self.outfp.write(x.encode(self.codec, 'xmlcharrefreplace'))
        return

    def _put_status(self, status):
        """Write the HTTP status line, which only cgi-httpd requires."""
        if self.server.startswith('cgi-httpd'):
            self.outfp.write('HTTP/1.0 %s\r\n' % status)

    def response_200(self):
        """Write the headers of a successful response using self.content_type."""
        self._put_status('200 OK')
        self.outfp.write('Content-type: %s\r\n' % self.content_type)
        self.outfp.write('Connection: close\r\n\r\n')
        return

    def response_404(self):
        """Write a complete "page does not exist" response."""
        self._put_status('404 Not Found')
        self.outfp.write('Content-type: text/html\r\n')
        self.outfp.write('Connection: close\r\n\r\n')
        self.outfp.write('<html><body>page does not exist</body></html>\n')
        return

    def response_301(self, url):
        """Write a redirect response pointing at url."""
        self._put_status('301 Moved')
        self.outfp.write('Location: %s\r\n\r\n' % url)
        return

    def coverpage(self):
        """Write the upload form."""
        self.put(
          '<html><head><title>%s</title></head><body>\n' % q(self.TITLE),
          '<h1>%s</h1><hr>\n' % q(self.TITLE),
          '<form method="POST" action="%s" enctype="multipart/form-data">\n' % q(self.apppath),
          '<p>Upload PDF File: <input name="f" type="file" value="">\n',
          '&nbsp; Page numbers (comma-separated):\n',
          '<input name="p" type="text" size="10" value="">\n',
          '<p>(Text extraction is limited to maximum %d pages.\n' % self.MAXPAGES,
          'Maximum file size for input is %d bytes.)\n' % self.MAXFILESIZE,
          '<p><input type="submit" name="c" value="Convert to HTML">\n',
          '<input type="submit" name="c" value="Convert to TEXT">\n',
          '<input type="reset" value="Reset">\n',
          '</form><hr>\n',
          '<p>Powered by <a href="http://www.unixuser.org/~euske/python/pdfminer/">PDFMiner</a>-%s\n' % pdfminer.__version__,
          '</body></html>\n',
          )
        return

    def setup(self):
        """Choose the handler for this request, store it as self.run, and
        return a status number.

        The handler is self.convert when the temporary directory exists and
        PATH_INFO equals apppath; otherwise it is self.response_404.
        """
        self.run = self.response_404
        status = 404
        if not os.path.isdir(self.tmpdir):
            self.logger.error('no tmpdir')
            # NOTE(review): 304 is "Not Modified", which looks unintended for
            # a missing directory; kept because callers may rely on the value.
            status = 304
        elif self.path_info == self.apppath:
            self.run = self.convert
            status = 200
        return status

    def convert(self):
        """Handle a request for the application path.

        Anything other than a POST carrying both the 'c' (command) and 'f'
        (file) fields with an actual uploaded file gets the upload form.
        Otherwise the upload is converted by the module-level convert() and
        streamed to outfp.  A conversion failure is reported in the response
        body and logged; the temporary file is removed in either case.
        """
        form = cgi.FieldStorage(fp=self.infp, environ=self.environ)
        item = None
        if self.method == 'POST' and 'c' in form and 'f' in form:
            item = form['f']
        if item is None or not (item.file and item.filename):
            self.response_200()
            self.coverpage()
            return
        cmd = form.getvalue('c')
        html = (cmd == 'Convert to HTML')
        # Every run of digits in the 'p' field is taken as one page number.
        pagenos = []
        if 'p' in form:
            pagenos = [int(m.group(0))
                       for m in re.finditer(r'\d+', form.getvalue('p'))]
        # Name of the temporary file: timestamp plus a per-request hash.
        h = abs(hash((random.random(), self.remote_addr, item.filename)))
        # %x needs an integer; time.time() returns a float.
        tmppath = os.path.join(self.tmpdir,
                               '%08x%08x.pdf' % (int(time.time()), h))
        self.logger.info('received: host=%s, name=%r, pagenos=%r, tmppath=%r',
                         self.remote_addr, item.filename, pagenos, tmppath)
        try:
            if not html:
                self.content_type = 'text/plain; charset=%s' % self.codec
            self.response_200()
            try:
                convert(item.file, self.outfp, tmppath, pagenos=pagenos, codec=self.codec,
                        maxpages=self.MAXPAGES, maxfilesize=self.MAXFILESIZE, html=html)
            except Exception as e:
                # The headers are already sent, so the failure can only be
                # reported inside the response body.
                self.put('<p>Sorry, an error has occurred: %s' % q(repr(e)))
                self.logger.error('convert: %r: path=%r: %s',
                                  e, tmppath, traceback.format_exc())
        finally:
            # Best-effort cleanup: the file may not exist if saving failed.
            try:
                os.remove(tmppath)
            except OSError:
                pass
        return


# main
if __name__ == '__main__':
    # Build the application from the process environment, let setup() pick
    # the request handler, then run it and exit with whatever it returns.
    app = WebApp()
    app.setup()
    sys.exit(app.run())