pdf2jp2.c 4.34 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
/*
 * Copyright (c) 2014, Mathieu Malaterre <mathieu.malaterre@voxxl.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Extract all JP2 files contained within a PDF file.
 *
 * Technically you could simply use mutool, eg:
 *
 * $ mutool show -be -o obj58.jp2 Bug691816.pdf 58
 *
 * to extract a given JP2 file from within a PDF.
 * However, it sometimes happens that the PDF itself is corrupted. This tool is
 * a lame PDF parser which only extracts streams contained in a JPXDecode box.
 * It only works on Linux since it needs the memmem function.
 */

40 41 42 43 44
/*
 * Add support for other signatures:
 * 
 * obj<</Subtype/Image/Length 110494/Filter/JPXDecode/BitsPerComponent 8/ColorSpace/DeviceRGB/Width 712/Height 1052>>stream
 */
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
#define _GNU_SOURCE
#include <string.h>
#include <stdio.h>
#include <stddef.h>
#include <assert.h>

#define NUMJP2 32      /* maximum number of JPXDecode markers remembered */
#define BUFLEN 4096    /* size of the sliding search window */
#define LINELEN 512    /* size of the buffer used to read one dictionary line */
#define LOOKBEHIND 40  /* bytes to rewind when /Length precedes the marker */

static const char jpx_marker[] = "JPXDecode";

/*
 * Scan `f` up to end of file and record the file offset of every occurrence
 * of the JPXDecode marker.
 *
 * `f` must be positioned at the beginning of the file: offsets are counted
 * from the first byte read. At most `max_offsets` entries are written to
 * `offsets`; further markers are ignored after a warning on stderr.
 *
 * Returns the number of offsets stored, or -1 on a read error.
 */
static int scan_offsets(FILE *f, long *offsets, int max_offsets)
{
  const size_t nlen = sizeof(jpx_marker) - 1;
  /* Carrying nlen - 1 bytes is enough to catch a marker straddling two reads
   * and guarantees a marker is never reported twice. */
  const size_t keep = nlen - 1;
  char haystack[BUFLEN];
  size_t have = 0; /* bytes carried over from the previous window */
  long base = 0;   /* file offset of haystack[0] */
  int count = 0;

  for( ;; )
    {
    const size_t want = BUFLEN - have;
    const size_t nread = fread( haystack + have, 1, want, f );
    const size_t hlen = have + nread;
    const char *p = haystack;

    /* Report every marker in the window, not only the first one. */
    while( (size_t)(haystack + hlen - p) >= nlen )
      {
      const char *hit = memmem( p, (size_t)(haystack + hlen - p), jpx_marker, nlen );
      if( !hit ) break;
      if( count == max_offsets )
        {
        fprintf( stderr, "warning: more than %d JPXDecode markers, ignoring the rest\n",
          max_offsets );
        return count;
        }
      offsets[count++] = base + (long)(hit - haystack);
      p = hit + 1;
      }

    if( nread < want ) break; /* short read: end of file or error */

    /* Regions may overlap in general, hence memmove rather than memcpy. */
    memmove( haystack, haystack + hlen - keep, keep );
    base += (long)(hlen - keep);
    have = keep;
    }

  return ferror( f ) ? -1 : count;
}

/*
 * Read the dictionary line around the marker found at `offset` and parse the
 * stream length out of it.
 *
 * Two layouts are tried: the length following the marker on the same line,
 * then a "/Length" key located up to LOOKBEHIND bytes before the marker.
 * On success `*len` is set, `f` is left positioned just after the line that
 * was read, and 1 is returned. Returns 0 if no non-negative length was found.
 */
static int read_stream_length(FILE *f, long offset, int *len)
{
  char buffer[LINELEN];
  int value = 0;
  int found = 0;

  if( fseek( f, offset, SEEK_SET ) != 0 ) return 0;
  if( fgets( buffer, sizeof(buffer), f ) )
    {
    found = sscanf( buffer,
      "JPXDecode]/Length  %d/Width %*d/BitsPerComponent %*d/Height %*d", &value ) == 1;
    }

  if( !found )
    { /* try again harder */
    const char *key;
    /* Never seek before the start of the file. */
    const long start = offset > LOOKBEHIND ? offset - LOOKBEHIND : 0;
    if( fseek( f, start, SEEK_SET ) != 0 ) return 0;
    if( !fgets( buffer, sizeof(buffer), f ) ) return 0;
    key = strstr( buffer, "/Length" );
    if( !key ) return 0;
    found = sscanf( key, "/Length  %d/", &value ) == 1;
    }

  if( !found || value < 0 ) return 0;
  *len = value;
  return 1;
}

/*
 * Copy `len` bytes from the current position of `f` into a new file named
 * "<filename>.<index>.jp2".
 *
 * Returns 0 on success, -1 if the output name does not fit, the output file
 * cannot be created or written, or the input ends before `len` bytes.
 */
static int extract_stream(FILE *f, const char *filename, int index, int len)
{
  char jp2fn[512];
  FILE *jp2;
  int j, rc = 0;
  const int n = snprintf( jp2fn, sizeof(jp2fn), "%s.%d.jp2", filename, index );

  if( n < 0 || (size_t)n >= sizeof(jp2fn) )
    {
    fprintf( stderr, "%s: output file name too long\n", filename );
    return -1;
    }

  jp2 = fopen( jp2fn, "wb" );
  if( !jp2 )
    {
    perror( jp2fn );
    return -1;
    }

  for( j = 0; j < len; ++j )
    {
    const int v = fgetc( f );
    if( v == EOF )
      {
      fprintf( stderr, "%s: input ended after %d of %d bytes\n", jp2fn, j, len );
      rc = -1;
      break;
      }
    if( fputc( v, jp2 ) == EOF )
      {
      perror( jp2fn );
      rc = -1;
      break;
      }
    }
  /* TODO need to check we reached endstream */

  /* fclose flushes buffered data, so a failure here is a write failure. */
  if( fclose( jp2 ) != 0 )
    {
    perror( jp2fn );
    rc = -1;
    }
  return rc;
}

/*
 * Usage: pdf2jp2 file.pdf
 *
 * Locates every JPXDecode marker in the given file and writes each stream
 * whose length could be parsed to "<file>.<n>.jp2".
 *
 * Returns 0 on success, 1 on a usage error or any I/O failure.
 */
int main(int argc, char *argv[])
{
  long offsets[NUMJP2];
  const char *filename;
  FILE *f;
  int i, count, status = 0;

  if( argc < 2 )
    {
    fprintf( stderr, "usage: %s file.pdf\n", argc > 0 ? argv[0] : "pdf2jp2" );
    return 1;
    }
  filename = argv[1];

  f = fopen( filename, "rb" );
  if( !f )
    {
    perror( filename );
    return 1;
    }

  count = scan_offsets( f, offsets, NUMJP2 );
  if( count < 0 )
    {
    fprintf( stderr, "%s: read error\n", filename );
    fclose( f );
    return 1;
    }

  for( i = 0; i < count; ++i )
    {
    int len = 0;
    if( !read_stream_length( f, offsets[i], &len ) )
      {
      /* Not every marker introduces an extractable stream; skip it. */
      fprintf( stderr, "warning: no stream length found for marker at offset %ld\n",
        offsets[i] );
      continue;
      }
    if( extract_stream( f, filename, i, len ) != 0 )
      status = 1;
    }

  fclose( f );
  return status;
}