# -*- coding: utf-8 -*-
import logging
import re
from .pdfdevice import PDFTextDevice
from .pdffont import PDFUnicodeNotDefined
from .layout import LTContainer
from .layout import LTPage
from .layout import LTText
from .layout import LTLine
from .layout import LTRect
from .layout import LTCurve
from .layout import LTFigure
from .layout import LTImage
from .layout import LTChar
from .layout import LTTextLine
from .layout import LTTextBox
from .layout import LTTextBoxVertical
from .layout import LTTextGroup
from .utils import apply_matrix_pt
from .utils import mult_matrix
from .utils import enc
from .utils import bbox2str
from . import utils

import six  # Python 2+3 compatibility

log = logging.getLogger(__name__)


##  PDFLayoutAnalyzer
##
class PDFLayoutAnalyzer(PDFTextDevice):
    """Text device that assembles layout objects for each rendered page.

    ``begin_page`` opens an ``LTPage``; figures nest through an internal
    stack; ``end_page`` optionally runs layout analysis (when ``laparams``
    is given) and passes the finished page to ``receive_layout``.
    """

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        """Set up the device.

        :param rsrcmgr: forwarded to ``PDFTextDevice.__init__``.
        :param pageno: id given to the first page; incremented per page.
        :param laparams: passed to ``LTPage.analyze``; ``None`` skips analysis.
        """
        PDFTextDevice.__init__(self, rsrcmgr)
        self.pageno = pageno
        self.laparams = laparams
        # Containers that were current when a figure was opened.
        self._stack = []

    def begin_page(self, page, ctm):
        """Create the ``LTPage`` for *page*, sized from its transformed media box."""
        (left, bottom, right, top) = page.mediabox
        (left, bottom) = apply_matrix_pt(ctm, (left, bottom))
        (right, top) = apply_matrix_pt(ctm, (right, top))
        width = abs(left - right)
        height = abs(bottom - top)
        # The page box is re-based so that its origin is (0, 0).
        self.cur_item = LTPage(self.pageno, (0, 0, width, height))

    def end_page(self, page):
        """Finish the current page and deliver it via ``receive_layout``."""
        assert not self._stack, str(len(self._stack))
        finished = self.cur_item
        assert isinstance(finished, LTPage), str(type(finished))
        if self.laparams is not None:
            finished.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(finished)

    def begin_figure(self, name, bbox, matrix):
        """Open an ``LTFigure``; the current container is saved on the stack."""
        self._stack.append(self.cur_item)
        figure_matrix = mult_matrix(matrix, self.ctm)
        self.cur_item = LTFigure(name, bbox, figure_matrix)

    def end_figure(self, _):
        """Close the current figure and attach it to the container it was opened in."""
        figure = self.cur_item
        assert isinstance(figure, LTFigure), str(type(figure))
        parent = self._stack.pop()
        self.cur_item = parent
        parent.add(figure)

    def render_image(self, name, stream):
        """Add an ``LTImage`` covering the bounding box of the enclosing figure."""
        figure = self.cur_item
        assert isinstance(figure, LTFigure), str(type(figure))
        bbox = (figure.x0, figure.y0, figure.x1, figure.y1)
        figure.add(LTImage(name, stream, bbox))

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Add a painted path as ``LTLine``, ``LTRect`` or ``LTCurve``.

        The first element of every path segment is its operator letter; the
        concatenation of those letters selects the shape.  Axis-aligned
        two-point paths become lines, axis-aligned closed four-point paths
        become rectangles, and everything else becomes a curve.
        """
        ctm = self.ctm
        opcodes = ''.join(segment[0] for segment in path)
        if opcodes == 'ml':
            # horizontal/vertical line
            (_, ax, ay), (_, bx, by) = path[0], path[1]
            (ax, ay) = apply_matrix_pt(ctm, (ax, ay))
            (bx, by) = apply_matrix_pt(ctm, (bx, by))
            if ax == bx or ay == by:
                line = LTLine(gstate.linewidth, (ax, ay), (bx, by),
                              stroke, fill, evenodd,
                              gstate.scolor, gstate.ncolor)
                self.cur_item.add(line)
                return
            # A slanted segment falls through and is stored as a curve.
        if opcodes == 'mlllh':
            # rectangle
            corners = [apply_matrix_pt(ctm, (px, py))
                       for (_, px, py) in path[:4]]
            ((x0, y0), (x1, y1), (x2, y2), (x3, y3)) = corners
            first_edge_vertical = (x0 == x1 and y1 == y2 and
                                   x2 == x3 and y3 == y0)
            first_edge_horizontal = (y0 == y1 and x1 == x2 and
                                     y2 == y3 and x3 == x0)
            if first_edge_vertical or first_edge_horizontal:
                # Opposite corners 0 and 2 define the rectangle.
                rect = LTRect(gstate.linewidth, (x0, y0, x2, y2),
                              stroke, fill, evenodd,
                              gstate.scolor, gstate.ncolor)
                self.cur_item.add(rect)
                return
        # other shapes: operands after the operator letter are (x, y) pairs
        pts = [apply_matrix_pt(ctm, (segment[k], segment[k+1]))
               for segment in path
               for k in range(1, len(segment), 2)]
        curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
                        gstate.scolor, gstate.ncolor)
        self.cur_item.add(curve)

    def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate):
        """Add an ``LTChar`` for *cid* and return its ``adv`` attribute.

        When the font raises ``PDFUnicodeNotDefined`` the text comes from
        ``handle_undefined_char`` instead.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, six.text_type), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        width = font.char_width(cid)
        displacement = font.char_disp(cid)
        char = LTChar(matrix, font, fontsize, scaling, rise, text,
                      width, displacement, ncs, graphicstate)
        self.cur_item.add(char)
        return char.adv

    def handle_undefined_char(self, font, cid):
        """Log the unmapped *cid* and return the placeholder ``'(cid:N)'``."""
        log.info('undefined: %r, %r', font, cid)
        placeholder = '(cid:%d)' % cid
        return placeholder

    def receive_layout(self, ltpage):
        """Hook called by ``end_page`` with each finished page; no-op here."""


##  PDFPageAggregator
##
class PDFPageAggregator(PDFLayoutAnalyzer):
    """Layout analyzer that keeps the most recently finished page.

    The page passed to ``receive_layout`` is stored and handed back by
    ``get_result``; it is ``None`` until a page has been received.
    """

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        """Initialise the analyzer with no stored page."""
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno, laparams)
        self.result = None

    def receive_layout(self, ltpage):
        """Remember *ltpage*, replacing any previously stored page."""
        self.result = ltpage

    def get_result(self):
        """Return the last page received, or ``None`` if there is none yet."""
        latest = self.result
        return latest


##  PDFConverter
##
class PDFConverter(PDFLayoutAnalyzer):
    """Base class for converters that write pages to an output stream.

    Stores the output stream and codec and records in ``outfp_binary``
    whether the stream expects bytes (``True``) or text (``False``).
    """

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
        """Set up the converter.

        :param rsrcmgr: forwarded to ``PDFLayoutAnalyzer.__init__``.
        :param outfp: writable stream that receives the converted output.
        :param codec: encoding name stored for use by subclasses.
        :param pageno: id given to the first page.
        :param laparams: layout-analysis parameters, or ``None``.
        """
        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.outfp = outfp
        self.codec = codec
        self.outfp_binary = self._is_binary_stream(outfp)

    @staticmethod
    def _is_binary_stream(outfp):
        """Return True if *outfp* expects bytes, False if it expects text.

        Classification never writes visible data to the stream.
        """
        import io

        mode = getattr(outfp, 'mode', None)
        # Some streams expose a non-string ``mode`` (gzip.GzipFile uses an
        # int); ``'b' in mode`` would raise TypeError for those, so only a
        # string mode is trusted.
        if isinstance(mode, six.string_types):
            return 'b' in mode
        # BytesIO / StringIO are covered by these io base classes.
        if isinstance(outfp, (io.RawIOBase, io.BufferedIOBase)):
            return True
        if isinstance(outfp, io.TextIOBase):
            return False
        # Unknown stream type: probe with an EMPTY text string so nothing is
        # emitted into the caller's output (a non-empty probe would end up in
        # the converted document).  A stream that rejects text raises
        # TypeError even for an empty string.
        # NOTE(review): a custom bytes-only writer that ignores empty input
        # without type-checking would be classified as text here.
        try:
            outfp.write(u'')
        except TypeError:
            return True
        return False


##  TextConverter
##
class TextConverter(PDFConverter):
    """Converter that writes the plain text of each page to ``outfp``."""

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 showpageno=False, imagewriter=None):
        """Set up the converter.

        :param showpageno: when true, a ``Page N`` line precedes each page.
        :param imagewriter: object whose ``export_image`` is called for each
            image; ``None`` disables image handling entirely.
        Remaining parameters are forwarded to ``PDFConverter.__init__``.
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter

    def write_text(self, text):
        """Write *text* to the output stream, encoding it for binary streams."""
        # NOTE(review): presumably yields text (not bytes) on Python 3 -- the
        # ``.encode`` call below relies on that; confirm in utils.
        text = utils.compatible_encode_method(text, self.codec, 'ignore')
        if six.PY3 and self.outfp_binary:
            # Honour the configured codec instead of always emitting UTF-8;
            # fall back to UTF-8 when no codec is set.  Unencodable characters
            # are dropped, matching the 'ignore' policy used above.
            text = text.encode(self.codec or 'utf-8', 'ignore')
        self.outfp.write(text)

    def receive_layout(self, ltpage):
        """Write the text of *ltpage*, followed by a form feed."""
        def render(item):
            # Containers are walked recursively; leaf text is written as is.
            if isinstance(item, LTContainer):
                for child in item:
                    render(child)
            elif isinstance(item, LTText):
                self.write_text(item.get_text())
            # Independent of the branch above: a text box is terminated by a
            # newline, and images are exported when a writer is configured.
            if isinstance(item, LTTextBox):
                self.write_text('\n')
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    self.imagewriter.export_image(item)

        if self.showpageno:
            self.write_text('Page %s\n' % ltpage.pageid)
        render(ltpage)
        self.write_text('\f')

    # Some dummy functions to save memory/CPU when all that is wanted
    # is text.  This stops all the image and drawing output from being
    # recorded and taking up RAM.
    def render_image(self, name, stream):
        """Record the image only when an image writer is configured."""
        if self.imagewriter is None:
            return
        PDFConverter.render_image(self, name, stream)

    def paint_path(self, gstate, stroke, fill, evenodd, path):
        """Ignore drawing operations; they are not needed for text output."""
        return


##  HTMLConverter
##
class HTMLConverter(PDFConverter):
    """Converter that writes pages as absolutely positioned HTML.

    The HTML header is written on construction and the footer (a list of
    page links) on ``close``.  ``layoutmode == 'exact'`` places every
    character individually; any other mode emits flowing text inside one
    ``<div>`` per text box.
    """

    # Extra colours merged into the instance colour tables in debug mode.
    RECT_COLORS = {
        #'char': 'green',
        'figure': 'yellow',
        'textline': 'magenta',
        'textbox': 'cyan',
        'textgroup': 'red',
        'curve': 'black',
        'page': 'gray',
    }

    TEXT_COLORS = {
        'textbox': 'blue',
        'char': 'black',
    }

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
                 pagemargin=50, imagewriter=None, debug=0,
                 rect_colors=None, text_colors=None):
        """Set up the converter and write the HTML header.

        :param scale: factor applied to all positions and sizes.
        :param fontscale: additional factor applied to font sizes.
        :param layoutmode: ``'exact'``, ``'loose'`` or anything else for the
            normal flowing layout.
        :param showpageno: when true, a ``Page N`` anchor is written per page.
        :param pagemargin: vertical gap added before the first and after
            every page.
        :param imagewriter: object whose ``export_image`` returns the image
            source name; ``None`` suppresses ``<img>`` output.
        :param debug: when true, ``RECT_COLORS`` / ``TEXT_COLORS`` are merged
            into this instance's colour tables.
        :param rect_colors: mapping from element kind to border colour;
            defaults to ``{'curve': 'black', 'page': 'gray'}``.
        :param text_colors: mapping from element kind to text colour;
            defaults to ``{'char': 'black'}``.
        Remaining parameters are forwarded to ``PDFConverter.__init__``.
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.scale = scale
        self.fontscale = fontscale
        self.layoutmode = layoutmode
        self.showpageno = showpageno
        self.pagemargin = pagemargin
        self.imagewriter = imagewriter
        if rect_colors is None:
            rect_colors = {'curve': 'black', 'page': 'gray'}
        if text_colors is None:
            text_colors = {'char': 'black'}
        # Per-instance copies: debug mode updates these tables in place, which
        # must not leak into other instances or into the caller's dicts.
        self.rect_colors = dict(rect_colors)
        self.text_colors = dict(text_colors)
        if debug:
            self.rect_colors.update(self.RECT_COLORS)
            self.text_colors.update(self.TEXT_COLORS)
        # Running vertical offset of the current page within the document.
        self._yoffset = self.pagemargin
        # Currently open (fontname, fontsize) span, and the saved spans of
        # enclosing divs.
        self._font = None
        self._fontstack = []
        self.write_header()

    def write(self, text):
        """Write *text* to the output stream, encoded when a codec is set."""
        if self.codec:
            text = text.encode(self.codec)
        self.outfp.write(text)

    def write_header(self):
        """Write the opening HTML, including a charset meta tag if a codec is set."""
        self.write('<html><head>\n')
        if self.codec:
            self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
        else:
            self.write('<meta http-equiv="Content-Type" content="text/html">\n')
        self.write('</head><body>\n')

    def write_footer(self):
        """Write links to every page processed so far and close the document."""
        self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
                   ', '.join('<a href="#%s">%s</a>' % (i, i) for i in range(1, self.pageno)))
        self.write('</body></html>\n')

    def write_text(self, text):
        """Write *text* after passing it through ``enc``."""
        self.write(enc(text, None))

    def place_rect(self, color, borderwidth, x, y, w, h):
        """Write a bordered box if ``rect_colors`` has an entry for *color*."""
        color = self.rect_colors.get(color)
        if color is not None:
            self.write('<span style="position:absolute; border: %s %dpx solid; '
                       'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
                       (color, borderwidth,
                        x*self.scale, (self._yoffset-y)*self.scale,
                        w*self.scale, h*self.scale))

    def place_border(self, color, borderwidth, item):
        """Write a bordered box around the bounding box of *item*."""
        self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)

    def place_image(self, item, borderwidth, x, y, w, h):
        """Export *item* and write an ``<img>`` for it; no-op without an image writer."""
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
                       'width="%d" height="%d" />\n' %
                       (enc(name, None), borderwidth,
                        x*self.scale, (self._yoffset-y)*self.scale,
                        w*self.scale, h*self.scale))

    def place_text(self, color, text, x, y, size):
        """Write positioned text if ``text_colors`` has an entry for *color*."""
        color = self.text_colors.get(color)
        if color is not None:
            self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
                       (color, x*self.scale, (self._yoffset-y)*self.scale, size*self.scale*self.fontscale))
            self.write_text(text)
            self.write('</span>\n')

    def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False):
        """Open a positioned ``<div>`` and start a fresh font context.

        *color* and *writing_mode* are written into the style verbatim.
        """
        self._fontstack.append(self._font)
        self._font = None
        self.write('<div style="position:absolute; border: %s %dpx solid; writing-mode:%s; '
                   'left:%dpx; top:%dpx; width:%dpx; height:%dpx;">' %
                   (color, borderwidth, writing_mode,
                    x*self.scale, (self._yoffset-y)*self.scale,
                    w*self.scale, h*self.scale))

    def end_div(self, color):
        """Close any open font span and the ``<div>``; restore the outer font."""
        if self._font is not None:
            self.write('</span>')
        self._font = self._fontstack.pop()
        self.write('</div>')

    def put_text(self, text, fontname, fontsize):
        """Write flowing text, opening a new font span when the font changes."""
        font = (fontname, fontsize)
        if font != self._font:
            if self._font is not None:
                self.write('</span>')
            # ``None`` is passed as the codec, like every other ``enc`` call
            # in this file.
            # NOTE(review): presumably enc() returns encoded bytes when given
            # a codec, which would not format cleanly with %s -- confirm
            # against utils.enc.
            self.write('<span style="font-family: %s; font-size:%dpx">' %
                       (enc(fontname, None), fontsize * self.scale * self.fontscale))
            self._font = font
        self.write_text(text)

    def put_newline(self):
        """Write a line break."""
        self.write('<br>')

    def receive_layout(self, ltpage):
        """Write the HTML for *ltpage* and advance the vertical page offset."""
        def show_group(item):
            # Outline text groups recursively; other items are ignored.
            if isinstance(item, LTTextGroup):
                self.place_border('textgroup', 1, item)
                for child in item:
                    show_group(child)

        def render(item):
            if isinstance(item, LTPage):
                # Pages are stacked vertically: shift the offset by the page
                # height before placing anything on it.
                self._yoffset += item.y1
                self.place_border('page', 1, item)
                if self.showpageno:
                    self.write('<div style="position:absolute; top:%dpx;">' %
                               ((self._yoffset-item.y1)*self.scale))
                    self.write('<a name="%s">Page %s</a></div>\n' % (item.pageid, item.pageid))
                for child in item:
                    render(child)
                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)
            elif isinstance(item, LTCurve):
                self.place_border('curve', 1, item)
            elif isinstance(item, LTFigure):
                self.begin_div('figure', 1, item.x0, item.y1, item.width, item.height)
                for child in item:
                    render(child)
                self.end_div('figure')
            elif isinstance(item, LTImage):
                self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
            else:
                if self.layoutmode == 'exact':
                    # Exact mode: every character is positioned individually.
                    if isinstance(item, LTTextLine):
                        self.place_border('textline', 1, item)
                        for child in item:
                            render(child)
                    elif isinstance(item, LTTextBox):
                        self.place_border('textbox', 1, item)
                        self.place_text('textbox', str(item.index+1), item.x0, item.y1, 20)
                        for child in item:
                            render(child)
                    elif isinstance(item, LTChar):
                        self.place_border('char', 1, item)
                        self.place_text('char', item.get_text(), item.x0, item.y1, item.size)
                else:
                    # Flowing modes: text is emitted inside one div per box.
                    if isinstance(item, LTTextLine):
                        for child in item:
                            render(child)
                        if self.layoutmode != 'loose':
                            self.put_newline()
                    elif isinstance(item, LTTextBox):
                        self.begin_div('textbox', 1, item.x0, item.y1, item.width, item.height,
                                       item.get_writing_mode())
                        for child in item:
                            render(child)
                        self.end_div('textbox')
                    elif isinstance(item, LTChar):
                        self.put_text(item.get_text(), item.fontname, item.size)
                    elif isinstance(item, LTText):
                        self.write_text(item.get_text())

        render(ltpage)
        self._yoffset += self.pagemargin

    def close(self):
        """Write the HTML footer."""
        self.write_footer()


##  XMLConverter
##
class XMLConverter(PDFConverter):
    """Converter that writes the layout tree of each page as XML.

    The XML header and opening ``<pages>`` tag are written on construction;
    ``close`` writes the closing tag.
    """

    # Characters removed from text content when ``stripcontrol`` is set.
    CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')

    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
                 laparams=None, imagewriter=None, stripcontrol=False):
        """Set up the converter and write the XML header.

        :param imagewriter: object whose ``export_image`` returns the image
            source name; ``None`` omits the ``src`` attribute.
        :param stripcontrol: when true, characters matching ``CONTROL`` are
            removed from text content.
        Remaining parameters are forwarded to ``PDFConverter.__init__``.
        """
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.imagewriter = imagewriter
        self.stripcontrol = stripcontrol
        self.write_header()

    def write(self, text):
        """Write *text* to the output stream, encoded when a codec is set."""
        if self.codec:
            text = text.encode(self.codec)
        self.outfp.write(text)

    def write_header(self):
        """Write the XML declaration and the opening ``<pages>`` tag."""
        if self.codec:
            self.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
        else:
            self.write('<?xml version="1.0" ?>\n')
        self.write('<pages>\n')

    def write_footer(self):
        """Write the closing ``</pages>`` tag."""
        self.write('</pages>\n')

    def write_text(self, text):
        """Write text content, optionally stripped of control characters,
        after passing it through ``enc``."""
        if self.stripcontrol:
            text = self.CONTROL.sub(u'', text)
        self.write(enc(text, None))

    def receive_layout(self, ltpage):
        """Write the XML for *ltpage*.

        :raises AssertionError: for an item of an unhandled type.
        """
        def show_group(item):
            # The <layout> section: text boxes as references, groups nested.
            if isinstance(item, LTTextBox):
                self.write('<textbox id="%d" bbox="%s" />\n' %
                           (item.index, bbox2str(item.bbox)))
            elif isinstance(item, LTTextGroup):
                self.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    show_group(child)
                self.write('</textgroup>\n')

        def render(item):
            # The order of these checks matters: LTLine / LTRect are tested
            # before LTCurve and LTChar before LTText (presumably the former
            # are subclasses of the latter -- confirm in the layout module).
            if isinstance(item, LTPage):
                self.write('<page id="%s" bbox="%s" rotate="%d">\n' %
                           (item.pageid, bbox2str(item.bbox), item.rotate))
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write('<layout>\n')
                    for group in item.groups:
                        show_group(group)
                    self.write('</layout>\n')
                self.write('</page>\n')
            elif isinstance(item, LTLine):
                self.write('<line linewidth="%d" bbox="%s" />\n' %
                           (item.linewidth, bbox2str(item.bbox)))
            elif isinstance(item, LTRect):
                self.write('<rect linewidth="%d" bbox="%s" />\n' %
                           (item.linewidth, bbox2str(item.bbox)))
            elif isinstance(item, LTCurve):
                self.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
                           (item.linewidth, bbox2str(item.bbox), item.get_pts()))
            elif isinstance(item, LTFigure):
                self.write('<figure name="%s" bbox="%s">\n' %
                           (item.name, bbox2str(item.bbox)))
                for child in item:
                    render(child)
                self.write('</figure>\n')
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write('</textline>\n')
            elif isinstance(item, LTTextBox):
                wmode = ''
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                self.write('<textbox id="%d" bbox="%s"%s>\n' %
                           (item.index, bbox2str(item.bbox), wmode))
                for child in item:
                    render(child)
                self.write('</textbox>\n')
            elif isinstance(item, LTChar):
                self.write('<text font="%s" bbox="%s" colourspace="%s" ncolour="%s" size="%.3f">' %
                           (enc(item.fontname, None), bbox2str(item.bbox),
                            item.ncs.name, item.graphicstate.ncolor, item.size))
                self.write_text(item.get_text())
                self.write('</text>\n')
            elif isinstance(item, LTText):
                # Route the content through write_text, like the LTChar
                # branch, so it goes through enc() and honours stripcontrol
                # instead of being interpolated raw into the markup.
                self.write('<text>')
                self.write_text(item.get_text())
                self.write('</text>\n')
            elif isinstance(item, LTImage):
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write('<image src="%s" width="%d" height="%d" />\n' %
                               (enc(name, None), item.width, item.height))
                else:
                    self.write('<image width="%d" height="%d" />\n' %
                               (item.width, item.height))
            else:
                assert False, str(('Unhandled', item))

        render(ltpage)

    def close(self):
        """Write the XML footer."""
        self.write_footer()