Skip to content
Commits on Source (6)
......@@ -7,5 +7,19 @@ target
*.iml
*.ipr
*.iws
*.bin
nbactions.xml
nb-configuration.xml
*.DS_Store
*.tmp-inception
*.snap
tika-deployment/tika-snap-app/parts/
tika-deployment/tika-snap-app/prime/
tika-deployment/tika-snap-app/snap/
tika-deployment/tika-snap-app/stage/
tika-deployment/tika-snap-app/test/
tika-deployment/tika-snap-server/parts/
tika-deployment/tika-snap-server/prime/
tika-deployment/tika-snap-server/snap/
tika-deployment/tika-snap-server/stage/
This diff is collapsed.
......@@ -342,3 +342,194 @@ JcWAy7md7XR9MiVgSQuw040wqSzcSA5M6RCFZ9gN+G0kP1CNZ5vDz+JktV4nJZzh
JF8xV9E4P/Msl8hqmOOocZ4LDJdw/nt1UWlUmattMLBVWdSeuu0=
=pYQ7
-----END PGP PUBLIC KEY BLOCK-----
pub 4096R/EF0CF38A 2017-05-16
uid [ultimate] Tim Allison (ASF signing key) <tallison@apache.org>
sig 3 EF0CF38A 2017-05-16 Tim Allison (ASF signing key) <tallison@apache.org>
sig DE7B39EC 2017-05-17 [User ID not found]
sig 13B86349 2017-05-18 Ismaël Mejía <iemejia@gmail.com>
sig 5ECBB314 2017-05-17 Rob Tompkins <chtompki@apache.org>
sig 00B6899D 2017-05-17 Christopher L Tubbs II (Christopher) <ctubbsii@gmail.com>
sig 26518FEE 2017-05-18 Dale LaBossiere (CODE SIGNING KEY) <dlaboss@apache.org>
sig 3 A400FD50 2017-05-16 Akira Ajisaka <aajisaka@apache.org>
sig 9FCC82D0 2017-05-17 Benedikt Ritter (CODE SIGNING KEY) <britter@apache.org>
sig 1AD84DFF 2017-05-26 Daniel Ruggeri (http://home.apache.org/~druggeri/) <druggeri@apache.org>
sig 9C1750C5 2017-05-26 Coty Sutherland <csutherl@fedoraproject.org>
sig 8BD1DCE8 2017-05-26 BigBlueHat <byoung@bigbluehat.com>
sig 47085518 2017-05-26 Marcus Christie (CODE SIGNING KEY) <machristie@apache.org>
sig 3 0FB52BC6 2017-05-24 Ashish Paliwal <apaliwal@apache.org>
sub 4096R/B0007A00 2017-05-16
sig EF0CF38A 2017-05-16 Tim Allison (ASF signing key) <tallison@apache.org>
-----BEGIN PGP PUBLIC KEY BLOCK-----
Version: GnuPG v2
mQINBFkbU+0BEADI5xZ5rYbBfbZb52mxVwzhNcVoOBOC3zc/AQ2QuBgLo5MNfWFv
C+ns8Ze/H63r/5CKXz8k2pHbdqggRKJopW37L3/L4IpG3i2331XUNnFJ4UtPkIHN
FKW1FRs+nSaWCjy4gpQ7qyFMWS1G6iI3HvuvpdBWYzbBc2XvmIRTBL45AJFp3Z+p
4jOc0nKVIxjeTkk8RVOrzn3OemNQX5LlyVwBXhAcDZeVij8qyjh+/cpBBP8Z8T8t
jyoCc0YEz9DD+p9feFdJ4s4MmGO8XNU9AjkrgVy9CtkrUk7rbE4JRfUP1RAQZgBs
uzCcZ110oggPimk4wIf3rKsHZwglz4jO3Tfw3asC5YSsuUL3DTaXzohBeMvY5d9y
QHBC850IRBGlYTeP03Luwkdh+NQ++Hb4wk6wO9CUTREHxXRhQeOrk/1FUEZJrr5d
RknXX542dNQrmMO9Tv+I7AFljuNui2VO0nnZXypO1/PH2TsFeg+VoP2eNdGqOaKT
y9gv87g/fBDcQrqYBRdh4Sk1pGwd4wurfLWLie9GOeZSw6yfmY4xi+3qkzzy/A+8
YzXxRuSaSPN2S1wrQgtt0ldkVd8Ls1k4VTKZOu85dNk/yAoXv7gQSi7L3+SOgmur
NfeCae4tNWDGLOWZcQ+qhESTZIjZADtPgJSesdU91OWmy9ZEbx7QQfkolwARAQAB
tDNUaW0gQWxsaXNvbiAoQVNGIHNpZ25pbmcga2V5KSA8dGFsbGlzb25AYXBhY2hl
Lm9yZz6JAjkEEwEIACMFAlkbU+0CGwMHCwkIBwMCAQYVCAIJCgsEFgIDAQIeAQIX
gAAKCRDkAy3E7wzzigaOD/wP+TsPCzYASlMaZARXD6rNUSKx1CCasJFKBUL5vbhs
8X3LBp9KTVkuSsURQkTnT5swOOWWuDvCASxmbtbjZJfs1b7/lDnDl9ggeU9wGjQ7
/9tyzRvhf1a56qiIQ6Tc0F91rdeBssIIe95jgw4IhoCxp121RWTXgUaSVfUuQru8
aQbpdRt5qWCs3s3x3X1UWh4L2i7f7+3JUEiq/1LIVMZZRbkm6MaoBuKm9GXJJX29
zbg243vA7cGZJNgdZSWz3B5qmmjwEovsKqnY/oV8f2Kom0Ggm1fGiWJUnVc5N4FF
qmXooraY04dW374osKK4MZopp4OSxBhmEGMsfE/YX0jK2tsjTCHZlMDQZBfj/5J/
XJusRSXYkQBKIWbIJQz5sT2FgfCrAoI1VQAmAZS7IPr2s7px9xoIDAZ9w2XA6k0j
tjJXXZ6bMJiV6PzCHXnQ5BEKTOE59+7TDxrlUrSYaDO5Aix3qEiyMhZZdZGv+2J2
vXoThynV5N1GL3WDlctrg/r/BfbDW6y1OnhqZCdWy23wa6XV/IcNlr/YpwWNeYwC
M/THN3qdwRj4o0FRbZIytQT+f0J1aPiKrTvZA30S3ocm99vwdI+zAW6iuEqZlVbB
nPWFf4tJS69CjQKjHJ5WDjjs78vj+mIfoz/KUOln8ilwnl9Nq70hWGl1LIRgGA6C
WIkBMwQQAQgAHRYhBIc4OGrKFRr/qK4z3S1tQQDeeznsBQJZG6h+AAoJEC1tQQDe
eznsj4oIAJDeoDhPzqFCXoaFrDXMl6NYwz4JJxUsacEclvwXTwUT2xgt4HIsp/mg
7JjTjyl7jZt05yKl9tciKhhXANpLErFWDdAZf6ogb5n8rnxwmqQ0V/kEHt+4NqiT
ijPJ2USmvbuD6X8n2pq9q5G3HzVflHOi9QWzySa9Zbtbbx8oECL/b0T1Z4xyJY/i
/Q51lpRpw5o8bh9vfFwAbKK0GBaXsqJFlkXEv5iRdjCOAwK6rUnrQKFrGj4AUVKi
cHEomUA9jmTAR9otI1FWF/1e3R6AMj7127GOqi/hDsxnq8mBfwmbvqS/c1vF/EQT
TERu7apr2uTGgfymLn3QPD+Cpgb9x3WJAhwEEAECAAYFAlkdeUkACgkQCp2vZxO4
Y0lMMQ//TY92TfHGNrSHzC2Al+zMbyTJpasOIKLG57ZqiG0/kNacUPeURo/nrIso
OmjsoAkgw6eIglrPmIrcpT8LzPOGI2FSkYqXf3rsWkbBTLYGbx/cGXwjTZLxVOR2
uDPKaDN4pSt2IviiKqdYRt0vC8DUedCj77NeY3oG3R/dwBStYI6iSTh4Fpd1KjkV
V4RS/Q1UX44iRyZECAYjO50YrQcF3Pe0Ki/UWVGpozC5chi1ilHlwyNLkqzT0di6
S8wVp2RG0T+B6+UQejmAeSQPJopBtcXWz1RCu3CiT5uSG7fdQO7HTLWtdC4cvi01
idReRbywE4sqe1UMm0d7LfTfi+60GPN5Okmx5aHuXJEIg61R+k8Y8S1mn/dyc5HP
V/nvCyoDItaNNMUOBgQDDWGgndOX4BSqlIUG0JYi97rvhdxtyKo20Bzn//CF7bea
ABbi2LuVMl5IuNtw5XwaYJr5nF/aVQUSwXUghps2bfaHpCDIdjF/hLJ6d/YT1PX0
Cpzxs91RSkWRePO+FYQ4gkKJpcT+kaEd09xfwjDXoi1loRJM20IDmR4Skp9lWCVl
9qcL+ynJ6KWEKnRdDpIz+n0FBnzg/FEqJKQnjUyFPhLS1e8O3stMOiQax4N4U1tT
JID2KtzXxmd3TvwOVAyGBH9abYybp7PEWRtX8WlGe5VliQ3zMtCJAhwEEAEIAAYF
AlkclMoACgkQP6rSzV7LsxRrLw//Rnl2dHj1e+2F2cckT8YQWv29Pp/2M3LV5cC5
i35igmow5d+SSFZUqYCQTa+WCAMaGt27ImL1Bkw73dPf7eTAvOcZEoLQ384W4g4b
5CYAysvoCY1nEr4PnZ9MYLW6W1F8OKCrD1elB5HEaDrfzbaqGB3dxU5DsZHYeGzT
1C/MqV31SDwztiZ7wwAmT2mgEHwFesqKNeolP45knh6YMaNjrdGw08ecaiEUe1Xl
iL1CKeFO8OMPxEErsQdlFNK9AG0/boqTjDqYMZtztPy3WMCEKhyDXOwdWwlyY0t3
PpMwH3VWEGngyFtnZgMEUwsMDGGQKTugmTt7Apjpb0Ytv9HPwIZNMkG1BdaVITcB
imZATH+zYEVQIPBZ69ZOib/0eyVwUBtIDjdAAmxXdT4H1RcyCIV3yz1AbWU/KBa1
Pxri+bFGnbpL9dia4mG1ODbM2Ffdhl0YDn/p5UPWXTTrd/tL6SCna2VG/8i75yJZ
apxd2lIRBIPNiFZpPYRjSVd5WnSinMHzNudjXjAWDNPVe/B5pcjCgzxTbhD0Qh/3
6RGq0mduHLnoDtndqQXS0N73Unx3kO5/ED1mULSMFXtC/NVDiZrODpFx3bVTIlt6
pnUQyD4QuinKYrVYCmIdI5IZkqd4yCO8x7IfZ/jxCWaF6NOU6M8oVjh6qWDmzkoi
NOflf5eJAhwEEAEKAAYFAlkbqTsACgkQbwza5wC2iZ0CbQ/8CAAfDCFUCkOPadT4
U+ZzNXRUY4OVFi6dRipjYk01G7UJEkMSKBe7agEYFU31sx2NZYsn4GPTDAChRHzO
w2WQPx6pjHyY7Skj4B54p5cv8RZJZp+F3hAoRJ6/P1/zqEN4PdtPhbQvGrDk8S/u
Y24MH0EfACaPhCRnDsrXwToEOlN2YYk651ZWLb04X3rSMb6C4aSU79HBbKTrScHc
t6sWKqISNztQzzFUXIDZWW97iuWw5+QNTAJ9bPxTOTDz4eb8hfpjRTu1FDn23yP4
CDW9O0mSc7ad5R9Vrji+EDMvTHYORW/hAJ8XvxlPlmOgbuCN9wQlKtYwjZ4L+by4
3bKqSpG19soURgfjmYubD0Prkl7wQx1N1uh0S/T1R5EjlQ4pro3vWdlRliPQs3/j
ue3x3eFXIRqHlvTVbtgTE+VuhKTEBb8eLtvQdjhN4GpWIqY1VUllb8sDCbyOQiVl
v4eXBHssS/6onNPGIFjyKDjxseXjhsrjs+Ovo53NnU+rkoTomfT0KHWdgn3h/Inz
31OXCqNzsgfrW2pDLCQn/GG8Kb5nOOfMaWmG2CsjmHyNXwAyRnzS0bVmSLbdI4wd
R/8093GvSgGB8NpPIRMTYcwPNZMNz64+Uk0A2SfuH+u7Exsk44OpMPtZzrTbP1bP
4W/ftBGkeiIHmFCltNDrji/yviCJAhwEEAEKAAYFAlkeK/MACgkQoqsIHyZRj+7v
dA//RxcgtmpcaUOZaEUZgay7x8oqVntf4BKuj5BmWmjOFbShA3V5erqm2o9OlC1V
B5pG0JTA7EnQ26XlVSHe3TyfcJyuHubHKoYjZBUoOT0Wj+9CwmXZyfuoDmq6zTpK
Fn9jQXnJniF58soxW0vtlUMzgi2uuU7VOKjG5P27aoFaqSLPc3U7apSwZkU15JjV
6Q2P2PC25cRLgA9WRLCOwptkzqlGi1RihXSmyX5i2ROTEwbhcCCw/C6U7xu+Vh3T
RkL58PGpmMGngXCDZNJzlqeqh9qIxIVI/lNONEh62ig35AhmZp8aZ/zDOX8+s7PH
fHCIAsKM+LdTGtkFxdSXbo4n4Ov5v/QWzdGcWTl4j08gG/ikToR3Jc/BYqPbPHcl
Ib2AFAFSmQysdV5hsHQNojWSc6NBbpfdJmg4rgL0VPou5aUazm+Izv8aLcyNlDmf
cP93o19hVbu25LT5ZEBtBI1dFfjSwXTUWSFonkm9LY7i+wy4/o/sJf9Yp39jAWb9
ZwumVjQJOSSM9AQFw6Aia3JyRvUI4E/4FBJi9MUuIi3b4ojlqO7cvQs8x3ngdr9C
0WUgGTHq3mWvb+zXEJxM4jx2sPzHMGvvufx4WIqICovvXtrlGn8cwCt3iTSp5yQG
w2WDNgw3N/pAHc4FU9dftj0uSwf5NRpWNUPZKvyzQECc4haJAiIEEwEKAAwFAlkb
kPcFgweGH4AACgkQwe27nKQA/VBeQhAAmimPj9Vb6zMvpVDopj2v7rdKbLH3wijg
ntvhRCQn7RApw9I3sm6p3QXc9SXiRthHEef4zJyR3DFt38okqjWuN1UZGGPVJuTp
Fwpig1IheVzHXucltpa9WtMJDLS38MVg4jo5b/a/dG96rNZMi5nc0SR7yg7L6FsL
jzhMeeJ9MoIVtNQHFQ2+rShF+8JrQFji1RrXqMIxS/vglWYdt2DSfLHgQ7MRstyW
sbAi0mLdh91anPX9tCAnHTc3X0Uomp/pM8BVHEINJTOTqyfHLf7fSna98rB+kkKN
eUjGN6cPGa0oPCSWrTquC8AhxFCwgpd+8x3fuTMwrPkqFC3pvReXrUwTvjJsJotn
QfQVZ3I2OcVCavPN7Topwwg/sryw0qJ2TvTciXzAIC6rgQSkajYO2vrY+UOAdsNv
W9lYhovSL70N8WTmDDB6Nk5U91oZ4Oq03OoL8Nxoqk+r6Oj0qbWO4QMefcEoAuSh
PyROrKfrl2dPbqsYQGeCBjeUjvm3ipaMpp3zjw3m8NsbmlI95NCdcW1eKG+LSrTE
N7g5c46b2XHLOilrwDogwPCvx8+NnFh+YdqMS8Cc/+gTvi2JTyg1g0wSATGzCQO0
VkWHH5/n3Ds4U3mnGUL2TqC2xx9QPH+u537j4b+AnJe8xQGfsBqLvBsYMGyBYYwu
JSBpR+cFkRSJAjMEEAEKAB0WIQTNVGQxXwuYx35ujs2dqtwcn8yC0AUCWRxKegAK
CRCdqtwcn8yC0FhuD/9l+pvZaEsZtn/5E8E6EUFJzQ+lLb0SG2bE4gRa48eLfGdd
6hbz/wOYm3u/PBlL3hd7Yj5eMo40/WUdHbOk7e/x4yBe5RH+i/ZwMhGfZsodYQF7
b90z/EAbClZ0ni7qg1MFbL9ZQYPLXJAP0mzCBKHjz9FefpHgO1wjY99hMmZakJjh
gpg6axCr6AJRKZBWSWusw9BwGQQtdgeva+1SBS4ZNkN/jXnyBbqbtt1Q9ZQJX7IN
RUG93SyWWNQ0FHEs9q5jHbFcQm6jpgxPfNEDWluZt+Z6AGLUEdlBgg1saou/NPUA
xfrLG9TYCl8F9LdiZlvXk5EkNFDmsYHc5iRFPnXPPLm3kDoVf5AXEmBfwJVOqeOu
o1BmEeJkf3GTsVaVww3In3UniqLhdYzgQGuZaJNsq5nzIcmcx2APjEjC8ih9r6FD
xBeS2s2PdmlcCBhpjaftBfbpfWh9tL8pi10ZWKbk/ZfSGLobjLxFpIz0jt4ya9EN
fEIREOWPug/d2qCAliPwl0+ckXU2JVp6Oo6b192OYxmjpUH5AzrbSvoE5MNeP55J
jvxxwL1ue3J6oiZ7b9d/OURoQhJDEHK2Fz4HSHUxSgS56bhWuu0z0SZF6mmTTpMY
R8et+0WFBBp3jOMAeEyzYp0gAjlBkQGIgMg0Zr7QjZ/VlYnelddQpDjXNowJ4IkB
HAQQAQIABgUCWSib9gAKCRCZXjUiGthN/4oUB/oDXVmi9aIT4409stS9VdKCxSaP
DUs9V00n6QUSoPREiIL9nCh8W66J08vB14WlCP36unhWgctLFDD+UMzyV0nLxCIp
qGkYLuai370JvbQcFLgp+cJeC7igaIA5HOGCEhhmlxMLfVMePZqWszFccgk2lxDp
9LVCJSImKIC8kRjSaORLqHQrugvWPkQGOCJR0TuS6CTkVBejZalV/iJcwql5dZAu
Mf0QZEMu2Pxf701SzyMSMn+9J0OCNZ+xgN26af3ZPrhQKLP8c3q1Gg+tkhd70l89
XkOkevkg0r6x576SoL74OEzrQep795AjnFGqNYtFhkjtJGBkwWZoyGTKwUpziQEc
BBABCAAGBQJZKDiUAAoJEAPClpScF1DFYdoH/jv0kHnPk971+MLmYDHjaLJzkwC6
m42nHIFTQruVJfYr8JBPZBhRb+mfeNTbeJfuzxUqBJnsp8ZqhW6JLTm5A4l6im6o
6w1wE0hfKqIdSvZDtqZ/qNIqEcuRh8t9lLDgvQcpwwIfxbWgqqEcSnPnZ0JkZVD5
KQXIjYJngJns4e3RA7sfFPXlGAW5jPM3DwEub6/nYvj3IlSVpBWd5jzJB9y1VcDs
Jt782/PBFiPwuKoSWGLrjPII9+zzs7RiZzvML+ajDgaAYDH8qq2lBP6K5QKgnIff
mzTHg4JMYAt4DkcidQJsGE/FBCrZY7V1NJpm24MFDQLYTu1VZf73ScrMrSSJAhwE
EAECAAYFAlkoorYACgkQIXRkoovR3OhwnBAAhbnzAYPjoabyyUvOABzvT7KEAyBe
TJaQ7bpWyzDpULvSef4HECEhiC5VVGq+vU6xLhjuVG0vwV+ipx4jIxtFjRhDh07m
AT0q07FnGQEDLOGQCPcbSy2L57bXEh8HRyaTMU0tzz+hKrG9ZdDxsLnPk1rxbBa5
y4FwUzdA1odZmCAYyHvBMpcCbXrFnNvcncFsmxCs3PfE1EkxoTRJ6soqO18xELRV
Jo4J5BBDoul2u4u1nuNVtHm2nGHiu3jmrk49YcBaw+OTKZ9Dx2tQbQMlld/4CGr7
H09l7NUcsDIbS06V6icCACJ86KtTGZRwalQKm7ScBf9tpmAnt+0mJIYyhykZWdzi
swW5TOzxghfg/wa0bAiJ6LRwoh4pJ6qSk6XLPdf7Kgje+ivD32Jv9uG/AtG9mNu1
8jmzaNcrdvqzIsTqGTJOISvTX3dT+5IgdjTgBp65e40uFMeQ3aT3c1EMAqU806zq
QOGgESRid9F6gtxXwXpdmE0Sp8s7MFsOubsdbCxuvupXIeFZ2TQqsqJMyWLiDYWZ
vWTbK4nmryNj+M8HsPRY3mqVIOqw0SUTIGg9MvyyxKCspI6R9NhLw/TCqGbRfizN
HX6ZhLhtkESVym1tGpl4aWmEy8mYl5fccntUrBiIMJv1XbXFfG/dEnfQSRYXuktn
mjIqJaicTCgVoBCJAhwEEAEKAAYFAlkojR4ACgkQnEn0IUcIVRg5Pg//R0OdbG/9
pZmGIxJeRExWdurVLUxrzwI3YCOJ0U/9an6QeLm13J6o3UTqPwaLwMZsX/9GUmcr
YHRiDmOkL0RLPYbRTaEsEnOmaxrse2jCXhFl1A0pFUfpeMs8iatRc28DOFztLeyo
rvUiidkScwvhBnRV8N0S54dAIUwXnLD5ApoE0xqPNTzKmVsl1/vdL7TAnWL19JVQ
nVN1UHPB4+8rDkTlierA7uTfzAe5VTIVYhglAPan/bRzwXnW025nHQttd4px5mO/
iEVWjOwg5/JTbJVDLLKkgKdI2TGRZH5xkV96rlvLuMs26pC3UfzGu7lBgf3Y2UP+
touSJqnXFyKWt0JBLcQwNzSOZR7Ryfp9bWjbFd+vF+749fgIsTZejcAAk0pf3W94
hgNEYUauO3GevDIXJB8eVTJq2DKKDEW/3+naU+Ezs16DUicWPGM/KquB4Bifn1Bp
qPWxC0GzBYFVCBEJIaQnlxRXRFuZ3QoSTO5i1K5nZdOhR77/mQuuyQz9bbEpTsGD
9NxlSIvKennUE2NiYHleXRbiREpPTRsG0spTYp/iGJ4shAlZrE0CMaCSv2mJELsI
thJ/xbIEaeZzqgbBtKPiuK95eAgbxXQFl0bDMXTtzXrjiH2/EHIRMxYuQQ7ByyyN
3Ve/wH3GxPS5xxRyNJhZ52NsPdvKnXSYJjaJAiIEEwEKAAwFAlkk9GkFgweGH4AA
CgkQA+K/Hg+1K8aPzg//e52C5JNm6BpMSQSrwpG1YV8J52dJZ+JizpI0iOYyLVAu
FSJ4q8oe+gMW4wG4QAbyuwKpEGC/bM/zJLs2UxgUF1tYkMWPx1FTGOOzCauNFSOo
/PgJZoVHbyNY0Ltxb1MIFPbSuSz9V/jkOmbFXDwibvQgLH4Iy0opyHbOAjuf+55T
7oQ+2CVtMjY+NFDx1Z2AfWXwOjsttmqGEeCA/NGtOJzNrARArM05kBBY7BQUNJvD
iluoYWAGCduseGnUL+aTxLdefZ4VU+8f7cbYELhaYFSkdSORroWYYcyvl6i8foVi
0G3DT3UIZl029oSbxyZRoa09X02oqPluyq3/KaGmDNGIYBchxF0Za7+x97IoYtVi
wUgh76UGnPU16GKBCjevnWYMW2beJ+0ry0PG3fMDOe+0q4uhWSEx89yr9vjNP2q2
KnW6hiRAvIOC0QUo/7A8hLcdQjb7xQd76VVinxNsUhVtpf3N6q8NudIrR6HLKNlf
W6RXR0U3Y6bW2AiQWyehN095GounpOK4XU7JvetP6+79wB7gPAcbKxIWfsh1N5Kc
VqbE0q5kac+C2t7gO7N5Ac7DkgxfZgSZ0/QojC8Hn15OswwSVmpPdjAXj1YkktYl
3htV4+uDr/3LYNrllkbvLkgnybJcF2dyP7myDdvsBVf1J7ygKt7IqE9iqS3JRTi5
Ag0EWRtT7QEQALcx2V96b1JAX5Bp+xjcU7YGOQK0bVx61AQTIW2uCwyjFHqJ5Tmc
/u8ndQA6Fw7ANzrpXEmMQfVzg/1tqpYBJa8xfDTOSnDHGvbXvqz3IthpatbJszqC
xL0qh/8pHHIJcW1FzF6+aL9BjuMTsOF/jIOmr08T29Tl7WH2lS5V9XDgFgooXumh
5Vm/dGNqqszWHdrrqLkDgdo+CjjicdADLoEhPK9f6PDTL1GZfTmIfES0jDmv1bG3
Zosj/XZQRbkrhkvKwXjOolTCaArqc5G5ZvBccAP3ZpyZvASM2KbdASapcHc4I8UW
TtsYBm8ecjntnxZHLfQxKNrj+EFOOzNelsVESjmNdu2Q4d70tD2Bjm3pfnRAGp/t
qwBi7P/9mYTYECH3LfyFR8/ZiSKl6+DSDeGh0eojhgLbIJi1/mZhvd8i5Ezr50ys
yHW7UGq+zyXkB86sDsHTh30N7y4gxfHxdKq0TjC1Vc7aNtA6Cpv3QKj9s0FCwvBE
awMtAG5TJXu5LjcH3aXIfnwkJqzntB3dAzh70YUaID19ySzPc4vUzH9rj6ccoN9s
zgmSTM8I2IcDcbIKbT2dmqr07ma/VrJr1rTTG6vJ4CaO/4qHHCi+ZmHGcaf7mR4b
kGZoXP3hCGSDvU+QNkC7DV5wstoMvzxnqAKr7+evN024H4iUM9Qt5sQTABEBAAGJ
Ah8EGAEIAAkFAlkbU+0CGwwACgkQ5AMtxO8M84rlthAAi9XH8Y7N3OmIv5MLB+MZ
C3BjZs08C0CcThsdPg1mzyEzOJp+FF1rHTMA3edHBuo6l72rUqdk7/Ddt/4p/oEE
rWSfwj+iLUoP0Zl4bNg5EH7wGfufQ4AAl7Vi3S8L0xliF1w9QNgEy5pu342EC8jJ
gcxXVwqpU8c+p+3M1xW6hWO/qODjhngHPRmnK2E8ULdsVL2eoz63ZXuNjNnac+fY
SYfAjdOuXYIOZFO9tX9b1Kll6XZPy4fvhSYKPzunxmaJ2S2Rlv7tAcKSg0Hu8VkU
h3hie2AHr2saJnSqlNx675Aw85F8D43/OT617V+/pzMDRnxlPHsBe8gqhYadSQCy
v36VNlXrHe1YNqiSNmI6QvmnYTdcXa80g9zqybBS2Uju22EMHP/7O/VyJnPH9AaV
LL1YUuOepblLnJYKVqF2mnPpDg2vwhmrqKMWsWHVxHMNalwXN4AfeFFuy5BrZ6JG
j2+9b4FHq2BFRpEJKSiUw7HTii20blgFr41A8M37VHifSWObGw2ynmIKJ++yRTIn
ZDUTTDfgOsmugADD2JBNbm6ydRtwtknsJPUlhmTovePrTi3QyoBvLMhK9Po3Eg5x
L0CyRJS02Y9AQHH8UVAOMihn/qUkdP5sQNYqjl4DyOXO/GfCiVmjLJEMGrTW4Ozu
sgtLYRPuFHFHzap0YZCqEGM=
=/R0q
-----END PGP PUBLIC KEY BLOCK-----
......@@ -370,3 +370,41 @@ JUnRAR (https://github.com/edmund-wagner/junrar/)
Sqlite (bundled in org.xerial's sqlite-jdbc)
This product bundles Sqlite, which is in the Public Domain. For details
see: https://www.sqlite.org/copyright.html
Sample DXF file testDXF.dxf (in tika-parsers/src/test/resources/test-documents)
Copyright 2012 Ho Thanh Tam, www.cadkit.net
Permission to use, copy, modify, and distribute this software and its
documentation for any purpose is hereby granted without fee, provided
that the above copyright notice, author statement appear in all copies
of this software and related documentation.
H2 Database in tika-eval
This software contains unmodified binary redistributions for
H2 database engine (http://www.h2database.com/),
which is dual licensed and available under the MPL 2.0
(Mozilla Public License) or under the EPL 1.0 (Eclipse Public License).
An original copy of the license agreement can be found at:
http://www.h2database.com/html/license.html
org.brotli.dec dependency of commons-compress (MIT License)
Copyright (c) 2009, 2010, 2013-2016 by the Brotli Authors.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
\ No newline at end of file
......@@ -26,6 +26,9 @@
<excludes>
<exclude>**/target/**</exclude>
<exclude>**/.*/**</exclude>
<exclude>**/opennlp/*.bin</exclude>
<exclude>**/recognition/*.bin</exclude>
<exclude>**/*.releaseBackup</exclude>
</excludes>
</fileSet>
</fileSets>
......
......@@ -25,7 +25,7 @@
<parent>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId>
<version>1.8</version>
<version>1.18</version>
<relativePath>tika-parent/pom.xml</relativePath>
</parent>
......@@ -34,16 +34,6 @@
<name>Apache Tika</name>
<url>http://tika.apache.org</url>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.8-rc2
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.8-rc2
</developerConnection>
<url>http://svn.apache.org/viewvc/tika/tags/1.8-rc2</url>
</scm>
<modules>
<module>tika-parent</module>
<module>tika-core</module>
......@@ -55,7 +45,12 @@
<module>tika-bundle</module>
<module>tika-server</module>
<module>tika-translate</module>
<module>tika-langdetect</module>
<module>tika-example</module>
<module>tika-java7</module>
<module>tika-eval</module>
<module>tika-dl</module>
<module>tika-nlp</module>
</modules>
<profiles>
......@@ -106,21 +101,16 @@
<include name="target/*-src.zip*" />
<include name="tika-app/target/tika-app-${project.version}.jar*" />
<include name="tika-server/target/tika-server-${project.version}.jar*" />
<include name="tika-eval/target/tika-eval-${project.version}.jar*" />
</fileset>
</copy>
<checksum algorithm="MD5" fileext=".md5">
<fileset dir="${basedir}/target/${project.version}">
<include name="*.zip" />
<include name="*.?ar" />
</fileset>
</checksum>
<checksum algorithm="SHA1" fileext=".sha">
<checksum algorithm="SHA-512" fileext=".sha512">
<fileset dir="${basedir}/target/${project.version}">
<include name="*.zip" />
<include name="*.?ar" />
</fileset>
</checksum>
<checksum file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" algorithm="SHA1" property="checksum" />
<checksum file="${basedir}/target/${project.version}/tika-${project.version}-src.zip" algorithm="SHA-512" property="checksum" />
<echo file="${basedir}/target/vote.txt">
From: ${username}@apache.org
To: dev@tika.apache.org
......@@ -131,9 +121,9 @@ A candidate for the Tika ${project.version} release is available at:
https://dist.apache.org/repos/dist/dev/tika/
The release candidate is a zip archive of the sources in:
http://svn.apache.org/repos/asf/tika/tags/${project.version}-rcN/
https://github.com/apache/tika/tree/${project.version}-rcN/
The SHA1 checksum of the archive is
The SHA-512 checksum of the archive is
${checksum}.
In addition, a staged maven repository is available here:
......@@ -176,17 +166,21 @@ least three +1 Tika PMC votes are cast.
</plugins>
</build>
</profile>
<profile>
<id>java7</id>
<activation>
<jdk>[1.7,]</jdk>
</activation>
<modules>
<module>tika-java7</module>
</modules>
</profile>
</profiles>
<build>
<plugins>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
<configuration>
<excludes>
<exclude>CHANGES.txt</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
<description>The Apache Tika™ toolkit detects and extracts metadata and structured text content from various documents
using existing parser libraries.
</description>
......
......@@ -25,7 +25,7 @@
<parent>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId>
<version>1.8</version>
<version>1.18</version>
<relativePath>../tika-parent/pom.xml</relativePath>
</parent>
......@@ -42,16 +42,11 @@
<groupId>${project.groupId}</groupId>
<artifactId>tika-parsers</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-langdetect</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
......@@ -69,33 +64,29 @@
<version>${project.version}</version>
</dependency>
<!-- logging -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jul-to-slf4j</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
<groupId>org.slf4j</groupId>
<artifactId>jul-to-slf4j</artifactId>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<artifactId>commons-io</artifactId>
<groupId>commons-io</groupId>
<version>2.1</version>
<scope>test</scope>
<version>${commons.io.version}</version>
</dependency>
</dependencies>
......@@ -162,6 +153,9 @@
<resource>META-INF/DEPENDENCIES</resource>
<file>target/classes/META-INF/DEPENDENCIES</file>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/cxf/bus-extensions.txt</resource>
</transformer>
</transformers>
</configuration>
</execution>
......@@ -178,6 +172,15 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
<configuration>
<excludes>
<exclude>src/test/resources/test-data/**</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
......@@ -256,11 +259,6 @@
<name>The Apache Software Foundation</name>
<url>http://www.apache.org</url>
</organization>
<scm>
<url>http://svn.apache.org/viewvc/tika/tags/1.8-rc2/tika-app</url>
<connection>scm:svn:http://svn.apache.org/repos/asf/tika/tags/1.8-rc2/tika-app</connection>
<developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/tags/1.8-rc2/tika-app</developerConnection>
</scm>
<issueManagement>
<system>JIRA</system>
<url>https://issues.apache.org/jira/browse/TIKA</url>
......
......@@ -896,229 +896,7 @@ juniversalchardet library (juniversalchardet)
use the text of this Exhibit A rather than the text found in the
Original Code Source Code for Your Modifications.]
AspectJ runtime library (aspectjrt)
Sqlite (included in the "provided" org.xerial's sqlite-jdbc)
Sqlite is in the Public Domain. For details
see: https://www.sqlite.org/copyright.html
Eclipse Public License - v 1.0
THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE
PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF
THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
1. DEFINITIONS
"Contribution" means:
a) in the case of the initial Contributor, the initial code and
documentation distributed under this Agreement, and
b) in the case of each subsequent Contributor:
i) changes to the Program, and
ii) additions to the Program;
where such changes and/or additions to the Program originate from and
are distributed by that particular Contributor. A Contribution
'originates' from a Contributor if it was added to the Program by
such Contributor itself or anyone acting on such Contributor's behalf.
Contributions do not include additions to the Program which: (i) are
separate modules of software distributed in conjunction with the
Program under their own license agreement, and (ii) are not derivative
works of the Program.
"Contributor" means any person or entity that distributes the Program.
"Licensed Patents " mean patent claims licensable by a Contributor which
are necessarily infringed by the use or sale of its Contribution alone or
when combined with the Program.
"Program" means the Contributions distributed in accordance with this
Agreement.
"Recipient" means anyone who receives the Program under this Agreement,
including all Contributors.
2. GRANT OF RIGHTS
a) Subject to the terms of this Agreement, each Contributor hereby grants
Recipient a non-exclusive, worldwide, royalty-free copyright license to
reproduce, prepare derivative works of, publicly display, publicly
perform, distribute and sublicense the Contribution of such
Contributor, if any, and such derivative works, in source code and
object code form.
b) Subject to the terms of this Agreement, each Contributor hereby grants
Recipient a non-exclusive, worldwide, royalty-free patent license under
Licensed Patents to make, use, sell, offer to sell, import and
otherwise transfer the Contribution of such Contributor, if any, in
source code and object code form. This patent license shall apply to
the combination of the Contribution and the Program if, at the time
the Contribution is added by the Contributor, such addition of the
Contribution causes such combination to be covered by the Licensed
Patents. The patent license shall not apply to any other combinations
which include the Contribution. No hardware per se is licensed hereunder.
c) Recipient understands that although each Contributor grants the
licenses to its Contributions set forth herein, no assurances are
provided by any Contributor that the Program does not infringe the
patent or other intellectual property rights of any other entity. Each
Contributor disclaims any liability to Recipient for claims brought by
any other entity based on infringement of intellectual property rights
or otherwise. As a condition to exercising the rights and licenses
granted hereunder, each Recipient hereby assumes sole responsibility
to secure any other intellectual property rights needed, if any. For
example, if a third party patent license is required to allow Recipient
to distribute the Program, it is Recipient's responsibility to acquire
that license before distributing the Program.
d) Each Contributor represents that to its knowledge it has sufficient
copyright rights in its Contribution, if any, to grant the copyright
license set forth in this Agreement.
3. REQUIREMENTS
A Contributor may choose to distribute the Program in object code form
under its own license agreement, provided that:
a) it complies with the terms and conditions of this Agreement; and
b) its license agreement:
i) effectively disclaims on behalf of all Contributors all warranties
and conditions, express and implied, including warranties or
conditions of title and non-infringement, and implied warranties
or conditions of merchantability and fitness for a particular
purpose;
ii) effectively excludes on behalf of all Contributors all liability
for damages, including direct, indirect, special, incidental and
consequential damages, such as lost profits;
iii) states that any provisions which differ from this Agreement are
offered by that Contributor alone and not by any other party; and
iv) states that source code for the Program is available from such
Contributor, and informs licensees how to obtain it in a
reasonable manner on or through a medium customarily used for
software exchange.
When the Program is made available in source code form:
a) it must be made available under this Agreement; and
b) a copy of this Agreement must be included with each copy of the
Program.
Contributors may not remove or alter any copyright notices contained
within the Program.
Each Contributor must identify itself as the originator of its
Contribution, if any, in a manner that reasonably allows subsequent
Recipients to identify the originator of the Contribution.
4. COMMERCIAL DISTRIBUTION
Commercial distributors of software may accept certain responsibilities
with respect to end users, business partners and the like. While this
license is intended to facilitate the commercial use of the Program,
the Contributor who includes the Program in a commercial product offering
should do so in a manner which does not create potential liability for
other Contributors. Therefore, if a Contributor includes the Program in
a commercial product offering, such Contributor ("Commercial Contributor")
hereby agrees to defend and indemnify every other Contributor
("Indemnified Contributor") against any losses, damages and costs
(collectively "Losses") arising from claims, lawsuits and other legal
actions brought by a third party against the Indemnified Contributor to
the extent caused by the acts or omissions of such Commercial Contributor
in connection with its distribution of the Program in a commercial
product offering. The obligations in this section do not apply to any
claims or Losses relating to any actual or alleged intellectual property
infringement. In order to qualify, an Indemnified Contributor must:
a) promptly notify the Commercial Contributor in writing of such claim,
and b) allow the Commercial Contributor to control, and cooperate with
the Commercial Contributor in, the defense and any related settlement
negotiations. The Indemnified Contributor may participate in any such
claim at its own expense.
For example, a Contributor might include the Program in a commercial
product offering, Product X. That Contributor is then a Commercial
Contributor. If that Commercial Contributor then makes performance claims,
or offers warranties related to Product X, those performance claims and
warranties are such Commercial Contributor's responsibility alone. Under
this section, the Commercial Contributor would have to defend claims
against the other Contributors related to those performance claims and
warranties, and if a court requires any other Contributor to pay any
damages as a result, the Commercial Contributor must pay those damages.
5. NO WARRANTY
EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED
ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR
CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A
PARTICULAR PURPOSE. Each Recipient is solely responsible for determining
the appropriateness of using and distributing the Program and assumes all
risks associated with its exercise of rights under this Agreement ,
including but not limited to the risks and costs of program errors,
compliance with applicable laws, damage to or loss of data, programs or
equipment, and unavailability or interruption of operations.
6. DISCLAIMER OF LIABILITY
EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR
ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING
WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR
DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED
HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
7. GENERAL
If any provision of this Agreement is invalid or unenforceable under
applicable law, it shall not affect the validity or enforceability of
the remainder of the terms of this Agreement, and without further action
by the parties hereto, such provision shall be reformed to the minimum
extent necessary to make such provision valid and enforceable.
If Recipient institutes patent litigation against any entity (including
a cross-claim or counterclaim in a lawsuit) alleging that the Program
itself (excluding combinations of the Program with other software or
hardware) infringes such Recipient's patent(s), then such Recipient's
rights granted under Section 2(b) shall terminate as of the date such
litigation is filed.
All Recipient's rights under this Agreement shall terminate if it fails
to comply with any of the material terms or conditions of this Agreement
and does not cure such failure in a reasonable period of time after
becoming aware of such noncompliance. If all Recipient's rights under
this Agreement terminate, Recipient agrees to cease use and distribution
of the Program as soon as reasonably practicable. However, Recipient's
obligations under this Agreement and any licenses granted by Recipient
relating to the Program shall continue and survive.
Everyone is permitted to copy and distribute copies of this Agreement,
but in order to avoid inconsistency the Agreement is copyrighted and may
only be modified in the following manner. The Agreement Steward reserves
the right to publish new versions (including revisions) of this Agreement
from time to time. No one other than the Agreement Steward has the right
to modify this Agreement. The Eclipse Foundation is the initial Agreement
Steward. The Eclipse Foundation may assign the responsibility to serve as
the Agreement Steward to a suitable separate entity. Each new version of
the Agreement will be given a distinguishing version number. The Program
(including Contributions) may always be distributed subject to the version
of the Agreement under which it was received. In addition, after a new
version of the Agreement is published, Contributor may elect to distribute
the Program (including its Contributions) under the new version. Except as
expressly stated in Sections 2(a) and 2(b) above, Recipient receives no
rights or licenses to the intellectual property of any Contributor under
this Agreement, whether expressly, by implication, estoppel or otherwise.
All rights in the Program not expressly granted under this Agreement
are reserved.
This Agreement is governed by the laws of the State of New York and the
intellectual property laws of the United States of America. No party to
this Agreement will bring a legal action under this Agreement more than
one year after the cause of action arose. Each party waives its rights to
a jury trial in any resulting litigation.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.batch;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.Parser;
/**
 * {@link ParserFactory} that produces an {@link AutoDetectParser} and, when a
 * digester has been set on this factory, wraps it in a {@link DigestingParser}.
 */
public class DigestingAutoDetectParserFactory extends ParserFactory {

    /** Digester handed to the wrapping parser; {@code null} means no wrapping. */
    private DigestingParser.Digester digester = null;

    /**
     * Creates a new parser from the given configuration.
     *
     * @param config configuration passed to the {@link AutoDetectParser} constructor
     * @return the plain auto-detect parser when no digester is set, otherwise
     *         that parser wrapped in a {@link DigestingParser}
     */
    @Override
    public Parser getParser(TikaConfig config) {
        Parser autoDetect = new AutoDetectParser(config);
        return (digester != null)
                ? new DigestingParser(autoDetect, digester)
                : autoDetect;
    }

    /**
     * Sets the digester used by parsers created after this call.
     *
     * @param digester the digester to wrap parsers with; {@code null} disables wrapping
     */
    public void setDigester(DigestingParser.Digester digester) {
        this.digester = digester;
    }
}
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.batch.builders;
import java.util.Locale;
import java.util.Map;
import org.apache.tika.batch.DigestingAutoDetectParserFactory;
import org.apache.tika.batch.ParserFactory;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.utils.BouncyCastleDigester;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.util.ClassLoaderUtil;
import org.apache.tika.util.XMLDOMUtil;
import org.w3c.dom.Node;
/**
 * Builds the {@link ParserFactory} named by the {@code class} attribute of a
 * configuration node and applies the optional {@code parseRecursively},
 * {@code digest} and {@code digestMarkLimit} attributes to it.
 */
public class AppParserFactoryBuilder implements IParserFactoryBuilder {

    /**
     * Instantiates and configures the parser factory.
     *
     * @param node         configuration element describing the parser factory
     * @param runtimeAttrs attributes supplied at runtime; combined with the node's
     *                     own attributes by {@code XMLDOMUtil.mapifyAttrs}
     *                     (NOTE(review): precedence between the two is decided
     *                     there, not here)
     * @return the configured factory
     * @throws RuntimeException         if {@code parseRecursively} is present but is
     *                                  neither "true" nor "false" (case-insensitive)
     * @throws IllegalArgumentException if the factory is a
     *                                  {@link DigestingAutoDetectParserFactory} and the
     *                                  digest attributes are missing or unusable
     */
    @Override
    public ParserFactory build(Node node, Map<String, String> runtimeAttrs) {
        Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttrs);
        String className = localAttrs.get("class");
        ParserFactory pf = ClassLoaderUtil.buildClass(ParserFactory.class, className);

        if (localAttrs.containsKey("parseRecursively")) {
            // Locale.ENGLISH keeps the comparison stable regardless of the default locale.
            String bString = localAttrs.get("parseRecursively").toLowerCase(Locale.ENGLISH);
            if (bString.equals("true")) {
                pf.setParseRecursively(true);
            } else if (bString.equals("false")) {
                pf.setParseRecursively(false);
            } else {
                throw new RuntimeException("parseRecursively must have value of \"true\" or \"false\": "+
                        bString);
            }
        }
        // Only the digesting factory knows what to do with a digester.
        if (pf instanceof DigestingAutoDetectParserFactory) {
            DigestingParser.Digester d = buildDigester(localAttrs);
            ((DigestingAutoDetectParserFactory)pf).setDigester(d);
        }
        return pf;
    }

    /**
     * Creates the digester described by the {@code digest} and
     * {@code digestMarkLimit} attributes. {@link CommonsDigester} is tried first;
     * if it rejects the arguments, {@link BouncyCastleDigester} is tried.
     *
     * @param localAttrs attributes of the parser element
     * @return the first digester that accepts the configuration
     * @throws IllegalArgumentException if {@code digestMarkLimit} is missing or not
     *                                  an int, or if both digester implementations
     *                                  reject the configuration
     */
    private DigestingParser.Digester buildDigester(Map<String, String> localAttrs) {
        String readLimitString = localAttrs.get("digestMarkLimit");
        if (readLimitString == null) {
            throw new IllegalArgumentException("Must specify \"digestMarkLimit\" for "+
                    "the DigestingAutoDetectParserFactory");
        }
        int readLimit;
        try {
            readLimit = Integer.parseInt(readLimitString);
        } catch (NumberFormatException e) {
            // Keep the parse failure as the cause so the original stack trace survives.
            throw new IllegalArgumentException("Parameter \"digestMarkLimit\" must be a parseable int: "+
                    readLimitString, e);
        }
        String digestString = localAttrs.get("digest");
        try {
            return new CommonsDigester(readLimit, digestString);
        } catch (IllegalArgumentException commonsException) {
            try {
                return new BouncyCastleDigester(readLimit, digestString);
            } catch (IllegalArgumentException bcException) {
                IllegalArgumentException combined = new IllegalArgumentException(
                        "Tried both CommonsDigester ("+commonsException.getMessage()+
                        ") and BouncyCastleDigester ("+bcException.getMessage()+")", bcException);
                // The cause slot holds the BouncyCastle failure; attach the Commons
                // failure as suppressed so neither stack trace is lost.
                combined.addSuppressed(commonsException);
                throw combined;
            }
        }
    }
}
......@@ -17,8 +17,13 @@
package org.apache.tika.cli;
import java.io.File;
import org.apache.commons.lang.SystemUtils;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
......@@ -39,6 +44,7 @@ class BatchCommandLineBuilder {
static Pattern JVM_OPTS_PATTERN = Pattern.compile("^(--?)J(.+)");
protected static String[] build(String[] args) throws IOException {
Map<String, String> processArgs = new LinkedHashMap<String, String>();
Map<String, String> jvmOpts = new LinkedHashMap<String,String>();
//take the args, and divide them into process args and options for
......@@ -51,11 +57,6 @@ class BatchCommandLineBuilder {
//maybe the user specified a different classpath?!
if (! jvmOpts.containsKey("-cp") && ! jvmOpts.containsKey("--classpath")) {
String cp = System.getProperty("java.class.path");
//need to test for " " on *nix, can't just add double quotes
//across platforms.
if (cp.contains(" ")){
cp = "\""+cp+"\"";
}
jvmOpts.put("-cp", cp);
}
......@@ -68,28 +69,48 @@ class BatchCommandLineBuilder {
}
//use the log4j config file inside the app /resources/log4j_batch_process.properties
if (! hasLog4j) {
jvmOpts.put("-Dlog4j.configuration=\"log4j_batch_process.properties\"", "");
jvmOpts.put("-Dlog4j.configuration=log4j_batch_process.properties", "");
}
//now build the full command line
List<String> fullCommand = new ArrayList<String>();
fullCommand.add("java");
boolean foundHeadlessOption = false;
for (Map.Entry<String, String> e : jvmOpts.entrySet()) {
fullCommand.add(e.getKey());
if (e.getValue().length() > 0) {
fullCommand.add(e.getValue());
fullCommand.add(commandLineSafe(e.getValue()));
}
if (e.getKey().contains("java.awt.headless")) {
foundHeadlessOption = true;
}
}
//run in headless mode unless the user asks for something else TIKA-2434
if (! foundHeadlessOption) {
fullCommand.add("-Djava.awt.headless=true");
}
fullCommand.add("org.apache.tika.batch.fs.FSBatchProcessCLI");
//now add the process commands
for (Map.Entry<String, String> e : processArgs.entrySet()) {
fullCommand.add(e.getKey());
if (e.getValue().length() > 0) {
fullCommand.add(e.getValue());
fullCommand.add(commandLineSafe(e.getValue()));
}
}
return fullCommand.toArray(new String[fullCommand.size()]);
}
/**
 * Makes an argument safe to pass on the command line of the child process.
 * The argument is wrapped in double quotes only when it contains a space and
 * the current OS is Windows; quotes cannot simply be added on every platform.
 *
 * @param arg the raw argument; may be {@code null}
 * @return the quoted argument when quoting applies, otherwise {@code arg}
 *         unchanged (including {@code null})
 */
protected static String commandLineSafe(String arg) {
    boolean needsQuotes = arg != null
            && arg.contains(" ")
            && SystemUtils.IS_OS_WINDOWS;
    return needsQuotes ? "\"" + arg + "\"" : arg;
}
/**
* Take the input args and separate them into args that belong on the commandline
......@@ -132,14 +153,15 @@ class BatchCommandLineBuilder {
//if there are only two args and they are both directories, treat the first
//as input and the second as output.
if (args.length == 2 && !args[0].startsWith("-") && ! args[1].startsWith("-")) {
File candInput = new File(args[0]);
File candOutput = new File(args[1]);
if (candOutput.isFile()) {
Path candInput = Paths.get(args[0]);
Path candOutput = Paths.get(args[1]);
if (Files.isRegularFile(candOutput)) {
throw new IllegalArgumentException("Can't specify an existing file as the "+
"second argument for the output directory of a batch process");
}
if (candInput.isDirectory()){
if (Files.isDirectory(candInput)) {
map.put("-inputDir", args[0]);
map.put("-outputDir", args[1]);
}
......@@ -157,35 +179,28 @@ class BatchCommandLineBuilder {
map.remove("-h");
map.remove("--html");
map.put("-basicHandlerType", "html");
map.put("-outputSuffix", "html");
} else if (map.containsKey("-x") || map.containsKey("--xml")) {
map.remove("-x");
map.remove("--xml");
map.put("-basicHandlerType", "xml");
map.put("-outputSuffix", "xml");
} else if (map.containsKey("-t") || map.containsKey("--text")) {
map.remove("-t");
map.remove("--text");
map.put("-basicHandlerType", "text");
map.put("-outputSuffix", "txt");
} else if (map.containsKey("-m") || map.containsKey("--metadata")) {
map.remove("-m");
map.remove("--metadata");
map.put("-basicHandlerType", "ignore");
map.put("-outputSuffix", "json");
} else if (map.containsKey("-T") || map.containsKey("--text-main")) {
map.remove("-T");
map.remove("--text-main");
map.put("-basicHandlerType", "body");
map.put("-outputSuffix", "txt");
}
if (map.containsKey("-J") || map.containsKey("--jsonRecursive")) {
map.remove("-J");
map.remove("--jsonRecursive");
map.put("-recursiveParserWrapper", "true");
//overwrite outputSuffix
map.put("-outputSuffix", "json");
}
if (map.containsKey("--inputDir") || map.containsKey("-i")) {
......
......@@ -61,20 +61,22 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
......@@ -85,6 +87,8 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Simple Swing GUI for Apache Tika. You can drag and drop files on top
* of the window to have them parsed.
......@@ -92,6 +96,9 @@ import org.xml.sax.helpers.AttributesImpl;
public class TikaGUI extends JFrame
implements ActionListener, HyperlinkListener {
//maximum length to allow for mark for reparse to get JSON
private static final int MAX_MARK = 20*1024*1024;//20MB
/**
* Serial version UID.
*/
......@@ -115,13 +122,16 @@ public class TikaGUI extends JFrame
final TikaConfig finalConfig = config;
SwingUtilities.invokeLater(new Runnable() {
public void run() {
new TikaGUI(new AutoDetectParser(finalConfig)).setVisible(true);
new TikaGUI(new DigestingParser(
new AutoDetectParser(finalConfig),
new CommonsDigester(MAX_MARK,
CommonsDigester.DigestAlgorithm.MD5,
CommonsDigester.DigestAlgorithm.SHA256)
)).setVisible(true);
}
});
}
//maximum length to allow for mark for reparse to get JSON
private final int MAX_MARK = 20*1024*1024;//20MB
/**
* Parsing context.
*/
......@@ -294,11 +304,8 @@ public class TikaGUI extends JFrame
public void openFile(File file) {
try {
Metadata metadata = new Metadata();
TikaInputStream stream = TikaInputStream.get(file, metadata);
try {
try (TikaInputStream stream = TikaInputStream.get(file, metadata)) {
handleStream(stream, metadata);
} finally {
stream.close();
}
} catch (Throwable t) {
handleError(file.getPath(), t);
......@@ -308,11 +315,8 @@ public class TikaGUI extends JFrame
public void openURL(URL url) {
try {
Metadata metadata = new Metadata();
TikaInputStream stream = TikaInputStream.get(url, metadata);
try {
try (TikaInputStream stream = TikaInputStream.get(url, metadata)) {
handleStream(stream, metadata);
} finally {
stream.close();
}
} catch (Throwable t) {
handleError(url.toString(), t);
......@@ -334,21 +338,34 @@ public class TikaGUI extends JFrame
getXmlContentHandler(xmlBuffer));
context.set(DocumentSelector.class, new ImageDocumentSelector());
input = TikaInputStream.get(new ProgressMonitorInputStream(
this, "Parsing stream", input));
if (input.markSupported()) {
input.mark(MAX_MARK);
int mark = -1;
if (input instanceof TikaInputStream) {
if (((TikaInputStream)input).hasFile()) {
mark = (int)((TikaInputStream)input).getLength();
}
}
if (mark == -1) {
mark = MAX_MARK;
}
input.mark(mark);
}
input = new ProgressMonitorInputStream(
this, "Parsing stream", input);
parser.parse(input, handler, md, context);
String[] names = md.names();
Arrays.sort(names);
for (String name : names) {
for (String val : md.getValues(name)) {
metadataBuffer.append(name);
metadataBuffer.append(": ");
metadataBuffer.append(md.get(name));
metadataBuffer.append(val);
metadataBuffer.append("\n");
}
}
String name = md.get(Metadata.RESOURCE_NAME_KEY);
if (name != null && name.length() > 0) {
......@@ -456,13 +473,9 @@ public class TikaGUI extends JFrame
if (e.getEventType() == EventType.ACTIVATED) {
try {
URL url = e.getURL();
InputStream stream = url.openStream();
try {
StringWriter writer = new StringWriter();
IOUtils.copy(stream, writer, IOUtils.UTF_8.name());
try (InputStream stream = url.openStream()) {
JEditorPane editor =
new JEditorPane("text/plain", writer.toString());
new JEditorPane("text/plain", IOUtils.toString(stream, UTF_8));
editor.setEditable(false);
editor.setBackground(Color.WHITE);
editor.setCaretPosition(0);
......@@ -475,8 +488,6 @@ public class TikaGUI extends JFrame
dialog.add(new JScrollPane(editor));
dialog.pack();
dialog.setVisible(true);
} finally {
stream.close();
}
} catch (IOException exception) {
exception.printStackTrace();
......
<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!-- NOTE: tika-batch is still an experimental feature.
The configuration file will likely change and be backward incompatible
with new versions of Tika. Please stay tuned.
-->
<tika-batch-config
maxAliveTimeSeconds="-1"
pauseOnEarlyTerminationMillis="10000"
timeoutThresholdMillis="300000"
timeoutCheckPulseMillis="1000"
maxQueueSize="10000"
numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 -->
<!-- options to allow on the commandline -->
<commandline>
<option opt="c" longOpt="tika-config" hasArg="true"
description="TikaConfig file"/>
<option opt="bc" longOpt="batch-config" hasArg="true"
description="xml batch config file"/>
<!-- We needed sorted for testing. We added random for performance.
Where crawling a directory is slow, it might be beneficial to
go randomly so that the parsers are triggered earlier. The
default is operating system's choice ("os") which means whatever order
the os returns files in .listFiles(). -->
<option opt="crawlOrder" hasArg="true"
description="how does the crawler sort the directories and files:
(random|sorted|os)"/>
<option opt="numConsumers" hasArg="true"
description="number of fileConsumers threads"/>
<option opt="maxFileSizeBytes" hasArg="true"
description="maximum file size to process; do not process files larger than this"/>
<option opt="maxQueueSize" hasArg="true"
description="maximum queue size for FileResources"/>
<option opt="fileList" hasArg="true"
description="file that contains a list of files (relative to inputDir) to process"/>
<option opt="fileListEncoding" hasArg="true"
description="encoding for fileList"/>
<option opt="inputDir" hasArg="true"
description="root directory for the files to be processed"/>
<option opt="startDir" hasArg="true"
description="directory (under inputDir) at which to start crawling"/>
<option opt="outputDir" hasArg="true"
description="output directory for output"/> <!-- do we want to make this mandatory -->
<option opt="recursiveParserWrapper"
description="use the RecursiveParserWrapper or not (default = false)"/>
<option opt="handleExisting" hasArg="true"
description="if an output file already exists, do you want to: overwrite, rename or skip"/>
<option opt="basicHandlerType" hasArg="true"
description="what type of content handler: xml, text, html, body"/>
<option opt="outputSuffix" hasArg="true"
description="suffix to add to the end of the output file name"/>
<option opt="timeoutThresholdMillis" hasArg="true"
description="how long to wait before determining that a consumer is stale"/>
<option opt="includeFilePat" hasArg="true"
description="regex that specifies which files to process"/>
<option opt="excludeFilePat" hasArg="true"
description="regex that specifies which files to avoid processing"/>
<option opt="reporterSleepMillis" hasArg="true"
description="millisecond between reports by the reporter"/>
<option opt="digest" hasArg="true"
description="which digest(s) to use, e.g. 'md5,sha512'"/>
<option opt="digestMarkLimit" hasArg="true"
description="max bytes to read for digest"/>
</commandline>
<!-- can specify inputDir="input", but the default config should not include this -->
<!-- can also specify startDir="input/someDir" to specify which child directory
to start processing -->
<crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
crawlOrder="random"
maxFilesToAdd="-1"
maxFilesToConsider="-1"
includeFilePat=""
excludeFilePat=""
maxFileSizeBytes="-1"
/>
<!--
This is an example of a crawler that reads a list of files to be processed from a
file. This assumes that the files in the list are relative to inputDir.
<crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
fileList="files.txt"
fileListEncoding="UTF-8"
maxFilesToAdd="-1"
maxFilesToConsider="-1"
includeFilePat="(?i).pdf$"
excludeFilePat="(?i).msg$"
maxFileSizeBytes="-1"
inputDir="input"
/>
-->
<!--
To wrap parser in RecursiveParserWrapper (tika-app's -J or tika-server's /rmeta),
add attribute recursiveParserWrapper="true" to consumers element.
To wrap parser with DigestingParser add attributes e.g.:
digest="md5,sha256" digestMarkLimit="10000000"
-->
<consumers builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
recursiveParserWrapper="false" consumersManagerMaxMillis="60000">
<parser builderClass="org.apache.tika.batch.builders.AppParserFactoryBuilder"
class="org.apache.tika.batch.DigestingAutoDetectParserFactory"
parseRecursively="true"
digest="md5" digestMarkLimit="1000000"/>
<contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
basicHandlerType="xml" writeLimit="-1"/>
<!-- can specify custom output file suffix with:
suffix=".mysuffix"
if no suffix is specified, BasicTikaFSConsumersBuilder does its best to guess -->
<!-- can specify compression with
compression="bzip2|gzip|zip" -->
<outputstream class="FSOutputStreamFactory" encoding="UTF-8"/>
</consumers>
<!-- reporter and interrupter are optional -->
<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000"
reporterStaleThresholdMillis="60000"/>
<interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
</tika-batch-config>
\ No newline at end of file
......@@ -16,63 +16,73 @@
*/
package org.apache.tika.cli;
import static junit.framework.TestCase.assertTrue;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.tika.io.IOUtils;
import org.apache.commons.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
public class TikaCLIBatchCommandLineTest {
File testInput = null;
File testFile = null;
Path testInput = null;
Path testFile = null;
String testInputPathForCommandLine;
String escapedInputPathForCommandLine;
@Before
public void init() {
testInput = new File("testInput");
if (!testInput.mkdirs()) {
testInput = Paths.get("testInput");
try {
Files.createDirectories(testInput);
} catch (IOException e) {
throw new RuntimeException("Failed to open test input directory");
}
testFile = new File("testFile.txt");
OutputStream os = null;
try {
os = new FileOutputStream(testFile);
IOUtils.write("test output", os, "UTF-8");
testFile = Paths.get("testFile.txt");
try (OutputStream os = Files.newOutputStream(testFile)) {
IOUtils.write("test output", os, UTF_8);
} catch (IOException e) {
throw new RuntimeException("Couldn't open testFile");
} finally {
IOUtils.closeQuietly(os);
}
testInputPathForCommandLine = testInput.toAbsolutePath().toString();
escapedInputPathForCommandLine = BatchCommandLineBuilder.commandLineSafe(testInputPathForCommandLine);
}
@After
public void tearDown() {
try {
FileUtils.deleteDirectory(testInput);
testFile.delete();
//TODO: refactor this to use our FileUtils.deleteDirectory(Path)
//when that is ready
FileUtils.deleteDirectory(testInput.toFile());
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
try {
Files.deleteIfExists(testFile);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
@Test
public void testJVMOpts() throws Exception {
String path = testInput.getAbsolutePath();
if (path.contains(" ")) {
path = "\"" + path + "\"";
}
String[] params = {"-JXmx1g", "-JDlog4j.configuration=batch_process_log4j.xml", "-inputDir",
path, "-outputDir", "testout-output"};
testInputPathForCommandLine, "-outputDir", "testout-output"};
String[] commandLine = BatchCommandLineBuilder.build(params);
......@@ -97,64 +107,52 @@ public class TikaCLIBatchCommandLineTest {
@Test
public void testBasicMappingOfArgs() throws Exception {
String path = testInput.getAbsolutePath();
if (path.contains(" ")) {
path = "\"" + path + "\"";
}
String[] params = {"-JXmx1g", "-JDlog4j.configuration=batch_process_log4j.xml",
"-bc", "batch-config.xml",
"-J", "-h", "-inputDir", path};
"-J", "-h", "-inputDir", testInputPathForCommandLine};
String[] commandLine = BatchCommandLineBuilder.build(params);
Map<String, String> attrs = mapify(commandLine);
assertEquals("true", attrs.get("-recursiveParserWrapper"));
assertEquals("html", attrs.get("-basicHandlerType"));
assertEquals("json", attrs.get("-outputSuffix"));
assertEquals("batch-config.xml", attrs.get("-bc"));
assertEquals(path, attrs.get("-inputDir"));
assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
}
@Test
public void testTwoDirsNoFlags() throws Exception {
String outputRoot = "outputRoot";
String path = testInput.getAbsolutePath();
if (path.contains(" ")) {
path = "\"" + path + "\"";
}
String[] params = {path, outputRoot};
String[] params = {testInputPathForCommandLine, outputRoot};
String[] commandLine = BatchCommandLineBuilder.build(params);
Map<String, String> attrs = mapify(commandLine);
assertEquals(path, attrs.get("-inputDir"));
assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
assertEquals(outputRoot, attrs.get("-outputDir"));
}
@Test
public void testTwoDirsVarious() throws Exception {
String outputRoot = "outputRoot";
String path = testInput.getAbsolutePath();
if (path.contains(" ")) {
path = "\"" + path + "\"";
}
String[] params = {"-i", path, "-o", outputRoot};
String[] params = {"-i", testInputPathForCommandLine, "-o", outputRoot};
String[] commandLine = BatchCommandLineBuilder.build(params);
Map<String, String> attrs = mapify(commandLine);
assertEquals(path, attrs.get("-inputDir"));
assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
assertEquals(outputRoot, attrs.get("-outputDir"));
params = new String[]{"--inputDir", path, "--outputDir", outputRoot};
params = new String[]{"--inputDir", testInputPathForCommandLine, "--outputDir", outputRoot};
commandLine = BatchCommandLineBuilder.build(params);
attrs = mapify(commandLine);
assertEquals(path, attrs.get("-inputDir"));
assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
assertEquals(outputRoot, attrs.get("-outputDir"));
params = new String[]{"-inputDir", path, "-outputDir", outputRoot};
params = new String[]{"-inputDir", testInputPathForCommandLine, "-outputDir", outputRoot};
commandLine = BatchCommandLineBuilder.build(params);
attrs = mapify(commandLine);
assertEquals(path, attrs.get("-inputDir"));
assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
assertEquals(outputRoot, attrs.get("-outputDir"));
}
......@@ -162,17 +160,12 @@ public class TikaCLIBatchCommandLineTest {
public void testConfig() throws Exception {
String outputRoot = "outputRoot";
String configPath = "c:/somewhere/someConfig.xml";
String path = testInput.getAbsolutePath();
if (path.contains(" ")) {
path = "\"" + path + "\"";
}
String[] params = {"--inputDir", path, "--outputDir", outputRoot,
String[] params = {"--inputDir", testInputPathForCommandLine, "--outputDir", outputRoot,
"--config="+configPath};
String[] commandLine = BatchCommandLineBuilder.build(params);
Map<String, String> attrs = mapify(commandLine);
assertEquals(path, attrs.get("-inputDir"));
assertEquals(escapedInputPathForCommandLine, attrs.get("-inputDir"));
assertEquals(outputRoot, attrs.get("-outputDir"));
assertEquals(configPath, attrs.get("-c"));
......@@ -182,15 +175,14 @@ public class TikaCLIBatchCommandLineTest {
public void testOneDirOneFileException() throws Exception {
boolean ex = false;
try {
String outputRoot = "outputRoot";
String path = testInput.getAbsolutePath();
String path = testFile.toAbsolutePath().toString();
if (path.contains(" ")) {
path = "\"" + path + "\"";
}
String[] params = {path, testFile.getAbsolutePath()};
String[] params = {testInputPathForCommandLine, path};
String[] commandLine = BatchCommandLineBuilder.build(params);
fail("Not allowed to have one dir and one file");
} catch (IllegalArgumentException e) {
ex = true;
}
......@@ -198,7 +190,7 @@ public class TikaCLIBatchCommandLineTest {
}
private Map<String, String> mapify(String[] args) {
Map<String, String> map = new LinkedHashMap<String, String>();
Map<String, String> map = new LinkedHashMap<>();
for (int i = 0; i < args.length; i++) {
if (args[i].startsWith("-")) {
String k = args[i];
......
......@@ -17,20 +17,21 @@
package org.apache.tika.cli;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.parser.RecursiveParserWrapper;
......@@ -40,97 +41,134 @@ import org.junit.Test;
public class TikaCLIBatchIntegrationTest {
private File testDataFile = new File("src/test/resources/test-data");
private File tempDir;
private Path testInputDir = Paths.get("src/test/resources/test-data");
private String testInputDirForCommandLine;
private Path tempOutputDir;
private String tempOutputDirForCommandLine;
private OutputStream out = null;
private OutputStream err = null;
private ByteArrayOutputStream outBuffer = null;
@Before
public void setup() throws Exception {
tempDir = File.createTempFile("tika-cli-test-batch-", "");
tempDir.delete();
tempDir.mkdir();
tempOutputDir = Files.createTempDirectory("tika-cli-test-batch-");
outBuffer = new ByteArrayOutputStream();
PrintStream outWriter = new PrintStream(outBuffer, true, IOUtils.UTF_8.name());
PrintStream outWriter = new PrintStream(outBuffer, true, UTF_8.name());
ByteArrayOutputStream errBuffer = new ByteArrayOutputStream();
PrintStream errWriter = new PrintStream(errBuffer, true, IOUtils.UTF_8.name());
PrintStream errWriter = new PrintStream(errBuffer, true, UTF_8.name());
out = System.out;
err = System.err;
System.setOut(outWriter);
System.setErr(errWriter);
testInputDirForCommandLine = testInputDir.toAbsolutePath().toString();
tempOutputDirForCommandLine = tempOutputDir.toAbsolutePath().toString();
}
@After
public void tearDown() throws Exception {
System.setOut(new PrintStream(out, true, IOUtils.UTF_8.name()));
System.setErr(new PrintStream(err, true, IOUtils.UTF_8.name()));
FileUtils.deleteDirectory(tempDir);
System.setOut(new PrintStream(out, true, UTF_8.name()));
System.setErr(new PrintStream(err, true, UTF_8.name()));
//TODO: refactor to use our deleteDirectory with straight path
FileUtils.deleteDirectory(tempOutputDir.toFile());
}
@Test
public void testSimplestBatchIntegration() throws Exception {
String[] params = {escape(testDataFile.getAbsolutePath()),
escape(tempDir.getAbsolutePath())};
String[] params = {testInputDirForCommandLine,
tempOutputDirForCommandLine};
TikaCLI.main(params);
assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile());
assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists());
assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml"));
assertFileExists(tempOutputDir.resolve("coffee.xls.xml"));
}
@Test
public void testBasicBatchIntegration() throws Exception {
String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
"-o", escape(tempDir.getAbsolutePath()),
String[] params = {"-i", testInputDirForCommandLine,
"-o", tempOutputDirForCommandLine,
"-numConsumers", "2"
};
TikaCLI.main(params);
assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile());
assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists());
assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml"));
assertFileExists(tempOutputDir.resolve("coffee.xls.xml"));
}
@Test
public void testJsonRecursiveBatchIntegration() throws Exception {
Reader reader = null;
try {
String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
"-o", escape(tempDir.getAbsolutePath()),
String[] params = {"-i", testInputDirForCommandLine,
"-o", tempOutputDirForCommandLine,
"-numConsumers", "10",
"-J", //recursive Json
"-t" //plain text in content
};
TikaCLI.main(params);
reader = new InputStreamReader(
new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8);
Path jsonFile = tempOutputDir.resolve("test_recursive_embedded.docx.json");
try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) {
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events"));
} finally {
IOUtils.closeQuietly(reader);
}
}
@Test
public void testProcessLogFileConfig() throws Exception {
String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
"-o", escape(tempDir.getAbsolutePath()),
String[] params = {"-i", testInputDirForCommandLine,
"-o", tempOutputDirForCommandLine,
"-numConsumers", "2",
"-JDlog4j.configuration=log4j_batch_process_test.properties"};
TikaCLI.main(params);
assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile());
assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists());
String sysOutString = new String(outBuffer.toByteArray(), IOUtils.UTF_8);
assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml"));
assertFileExists(tempOutputDir.resolve("coffee.xls.xml"));
String sysOutString = new String(outBuffer.toByteArray(), UTF_8);
assertTrue(sysOutString.contains("MY_CUSTOM_LOG_CONFIG"));
}
public static String escape(String path) {
if (path.indexOf(' ') > -1) {
return '"' + path + '"';
/**
 * Runs the batch CLI over the test input directory with recursive JSON output,
 * plain-text content and a {@code sha512} digest, then inspects the JSON written
 * for {@code test_recursive_embedded.docx}.
 *
 * @throws Exception if the CLI run or reading the JSON output fails
 */
@Test
public void testDigester() throws Exception {
// The commented-out block below is a disabled variant that asserted MD5 digests.
/*
try {
String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
"-o", escape(tempOutputDir.getAbsolutePath()),
"-numConsumers", "10",
"-J", //recursive Json
"-t" //plain text in content
};
TikaCLI.main(params);
reader = new InputStreamReader(
new FileInputStream(new File(tempOutputDir, "test_recursive_embedded.docx.json")), UTF_8);
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
assertEquals("59f626e09a8c16ab6dbc2800c685f772", metadataList.get(0).get("X-TIKA:digest:MD5"));
assertEquals("22e6e91f408d018417cd452d6de3dede", metadataList.get(5).get("X-TIKA:digest:MD5"));
} finally {
IOUtils.closeQuietly(reader);
}
return path;
*/
String[] params = {"-i", testInputDirForCommandLine,
"-o", tempOutputDirForCommandLine,
"-numConsumers", "10",
"-J", //recursive Json
"-t", //plain text in content
"-digest", "sha512"
};
TikaCLI.main(params);
// The batch run is expected to have written <input name>.json into the output directory.
Path jsonFile = tempOutputDir.resolve("test_recursive_embedded.docx.json");
try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) {
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
// 12 metadata objects are expected in the list for this file.
assertEquals(12, metadataList.size());
// Only the first entry's digest is checked, and only by its leading hex characters.
assertNotNull(metadataList.get(0).get("X-TIKA:digest:SHA512"));
assertTrue(metadataList.get(0).get("X-TIKA:digest:SHA512").startsWith("ee46d973ee1852c01858"));
}
}
/**
 * Fails the running test unless {@code path} refers to a regular file.
 *
 * @param path location that is expected to hold a regular file; its absolute
 *             form is included in the failure message
 */
private void assertFileExists(Path path) {
    String failureMessage = "File doesn't exist: " + path.toAbsolutePath();
    boolean regularFile = Files.isRegularFile(path);
    assertTrue(failureMessage, regularFile);
}
}
......@@ -16,6 +16,7 @@
*/
package org.apache.tika.cli;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
......@@ -26,9 +27,9 @@ import java.net.URI;
import org.apache.commons.io.FileUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
/**
......@@ -37,7 +38,6 @@ import org.junit.Test;
public class TikaCLITest {
/* Test members */
private File profile = null;
private ByteArrayOutputStream outContent = null;
private PrintStream stdout = null;
private File testDataFile = new File("src/test/resources/test-data");
......@@ -46,23 +46,10 @@ public class TikaCLITest {
@Before
public void setUp() throws Exception {
profile = new File("welsh.ngp");
outContent = new ByteArrayOutputStream();
resourcePrefix = testDataURI.toString();
stdout = System.out;
System.setOut(new PrintStream(outContent, true, IOUtils.UTF_8.name()));
}
/**
* Creates a welsh language profile
*
* @throws Exception
*/
@Test
public void testCreateProfile() throws Exception {
String[] params = {"--create-profile=welsh", "-eUTF-8", resourcePrefix + "welsh_corpus.txt"};
TikaCLI.main(params);
assertTrue(profile.exists());
System.setOut(new PrintStream(outContent, true, UTF_8.name()));
}
/**
......@@ -74,7 +61,7 @@ public class TikaCLITest {
public void testListParserDetail() throws Exception{
String[] params = {"--list-parser-detail"};
TikaCLI.main(params);
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("application/vnd.oasis.opendocument.text-web"));
assertTrue(outContent.toString(UTF_8.name()).contains("application/vnd.oasis.opendocument.text-web"));
}
/**
......@@ -99,7 +86,13 @@ public class TikaCLITest {
public void testXMLOutput() throws Exception{
String[] params = {"-x", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
assertTrue(outContent.toString(UTF_8.name()).contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
params = new String[]{"-x", "--digest=SHA256", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(UTF_8.name())
.contains("<meta name=\"X-TIKA:digest:SHA256\" content=\"e90779adbac09c4ee"));
}
/**
......@@ -113,7 +106,12 @@ public class TikaCLITest {
TikaCLI.main(params);
assertTrue(outContent.toString("UTF-8").contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
assertTrue("Expanded <title></title> element should be present",
outContent.toString(IOUtils.UTF_8.name()).contains("<title></title>"));
outContent.toString(UTF_8.name()).contains("<title></title>"));
params = new String[]{"-h", "--digest=SHA384", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString("UTF-8")
.contains("<meta name=\"X-TIKA:digest:SHA384\" content=\"c69ea023f5da95a026"));
}
/**
......@@ -125,7 +123,7 @@ public class TikaCLITest {
public void testTextOutput() throws Exception{
String[] params = {"-t", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("finished off the cake"));
assertTrue(outContent.toString(UTF_8.name()).contains("finished off the cake"));
}
/**
......@@ -136,7 +134,13 @@ public class TikaCLITest {
public void testMetadataOutput() throws Exception{
String[] params = {"-m", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
params = new String[]{"-m", "--digest=SHA512", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
assertTrue(outContent.toString(UTF_8.name())
.contains("X-TIKA:digest:SHA512: dd459d99bc19ff78fd31fbae46e0"));
}
/**
......@@ -146,9 +150,9 @@ public class TikaCLITest {
*/
@Test
public void testJsonMetadataOutput() throws Exception {
String[] params = {"--json", resourcePrefix + "testJsonMultipleInts.html"};
String[] params = {"--json", "--digest=MD2", resourcePrefix + "testJsonMultipleInts.html"};
TikaCLI.main(params);
String json = outContent.toString(IOUtils.UTF_8.name());
String json = outContent.toString(UTF_8.name());
//TIKA-1310
assertTrue(json.contains("\"fb:admins\":\"1,2,3,4\","));
......@@ -158,6 +162,7 @@ public class TikaCLITest {
int title = json.indexOf("\"title\"");
assertTrue(enc > -1 && fb > -1 && enc < fb);
assertTrue (fb > -1 && title > -1 && fb < title);
assertTrue(json.contains("\"X-TIKA:digest:MD2\":"));
}
/**
......@@ -169,7 +174,7 @@ public class TikaCLITest {
public void testJsonMetadataPrettyPrintOutput() throws Exception {
String[] params = {"--json", "-r", resourcePrefix + "testJsonMultipleInts.html"};
TikaCLI.main(params);
String json = outContent.toString(IOUtils.UTF_8.name());
String json = outContent.toString(UTF_8.name());
assertTrue(json.contains(" \"X-Parsed-By\": [\n" +
" \"org.apache.tika.parser.DefaultParser\",\n" +
......@@ -192,7 +197,7 @@ public class TikaCLITest {
public void testLanguageOutput() throws Exception{
String[] params = {"-l", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("en"));
assertTrue(outContent.toString(UTF_8.name()).contains("en"));
}
/**
......@@ -204,7 +209,7 @@ public class TikaCLITest {
public void testDetectOutput() throws Exception{
String[] params = {"-d", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
}
/**
......@@ -216,7 +221,7 @@ public class TikaCLITest {
public void testListMetModels() throws Exception{
String[] params = {"--list-met-models", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
}
/**
......@@ -228,7 +233,7 @@ public class TikaCLITest {
public void testListSupportedTypes() throws Exception{
String[] params = {"--list-supported-types", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("supertype: application/octet-stream"));
assertTrue(outContent.toString(UTF_8.name()).contains("supertype: application/octet-stream"));
}
/**
......@@ -236,8 +241,6 @@ public class TikaCLITest {
*/
@After
public void tearDown() throws Exception {
if(profile != null && profile.exists())
profile.delete();
System.setOut(stdout);
}
......@@ -279,6 +282,34 @@ public class TikaCLITest {
FileUtils.deleteDirectory(tempFile);
}
}
/**
 * Runs the CLI with {@code -z} and {@code --extract-dir} on {@code test-documents.tgz}
 * and passes the expected {@code test-documents.tar} plus the listing of the
 * extraction directory to {@code assertExtracted} (TIKA-2564).
 *
 * @throws Exception if the CLI run or the temporary-directory handling fails
 */
@Test
public void testExtractTgz() throws Exception {
    //TIKA-2564
    // createTempFile only reserves a unique name; swap the file for a directory and
    // fail immediately if that does not work rather than extracting into a bogus location.
    File tempFile = File.createTempFile("tika-test-", "");
    assertTrue("Could not turn " + tempFile + " into a directory",
            tempFile.delete() && tempFile.mkdir());

    try {
        String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/test-documents.tgz"};

        TikaCLI.main(params);

        // Names found in the extraction directory, joined with " : ".
        StringBuilder allFiles = new StringBuilder();
        // File.list() returns null on an I/O error; treat that as an empty listing
        // so the assertion below reports the problem instead of a NullPointerException.
        String[] extracted = tempFile.list();
        if (extracted != null) {
            for (String f : extracted) {
                if (allFiles.length() > 0) {
                    allFiles.append(" : ");
                }
                allFiles.append(f);
            }
        }

        File expectedTAR = new File(tempFile, "test-documents.tar");

        assertExtracted(expectedTAR, allFiles.toString());
    } finally {
        FileUtils.deleteDirectory(tempFile);
    }
}
protected static void assertExtracted(File f, String allFiles) {
assertTrue(
......@@ -301,7 +332,7 @@ public class TikaCLITest {
public void testMultiValuedMetadata() throws Exception {
String[] params = {"-m", resourcePrefix + "testMultipleSheets.numbers"};
TikaCLI.main(params);
String content = outContent.toString(IOUtils.UTF_8.name());
String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("sheetNames: Checking"));
assertTrue(content.contains("sheetNames: Secon sheet"));
assertTrue(content.contains("sheetNames: Logical Sheet 3"));
......@@ -315,13 +346,45 @@ public class TikaCLITest {
new File("subdir/foo.txt").delete();
new File("subdir").delete();
TikaCLI.main(params);
String content = outContent.toString(IOUtils.UTF_8.name());
String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("Extracting 'subdir/foo.txt'"));
// clean up. TODO: These should be in target.
new File("target/subdir/foo.txt").delete();
new File("target/subdir").delete();
}
/**
 * Runs the CLI with {@code -z} and {@code --extract-dir} on
 * {@code testPDF_childAttachments.pdf} and passes each expected attachment plus the
 * listing of the extraction directory to {@code assertExtracted}.
 *
 * @throws Exception if the CLI run or the temporary-directory handling fails
 */
@Test
public void testExtractInlineImages() throws Exception {
    // createTempFile only reserves a unique name; swap the file for a directory.
    // Not really a good method for production usage, but ok for tests
    // (google guava library has a better solution). Fail immediately if the swap
    // does not work rather than extracting into a bogus location.
    File tempFile = File.createTempFile("tika-test-", "");
    assertTrue("Could not turn " + tempFile + " into a directory",
            tempFile.delete() && tempFile.mkdir());

    try {
        String[] params = {"--extract-dir="+tempFile.getAbsolutePath(),"-z", resourcePrefix + "/testPDF_childAttachments.pdf"};

        TikaCLI.main(params);

        // Names found in the extraction directory, joined with " : ".
        StringBuilder allFiles = new StringBuilder();
        // File.list() returns null on an I/O error; treat that as an empty listing
        // so the assertions below report the problem instead of a NullPointerException.
        String[] extracted = tempFile.list();
        if (extracted != null) {
            for (String f : extracted) {
                if (allFiles.length() > 0) {
                    allFiles.append(" : ");
                }
                allFiles.append(f);
            }
        }

        File jpeg = new File(tempFile, "image0.jpg");
        //tiff isn't extracted without optional image dependency
        // File tiff = new File(tempFile, "image1.tif");
        File jobOptions = new File(tempFile, "Press Quality(1).joboptions");
        File doc = new File(tempFile, "Unit10.doc");

        assertExtracted(jpeg, allFiles.toString());
        assertExtracted(jobOptions, allFiles.toString());
        assertExtracted(doc, allFiles.toString());
    } finally {
        FileUtils.deleteDirectory(tempFile);
    }
}
@Test
public void testDefaultConfigException() throws Exception {
//default xml parser will throw TikaException
......@@ -341,32 +404,43 @@ public class TikaCLITest {
public void testConfig() throws Exception {
String[] params = new String[]{"--config="+testDataFile.toString()+"/tika-config1.xml", resourcePrefix+"bad_xml.xml"};
TikaCLI.main(params);
String content = outContent.toString(IOUtils.UTF_8.name());
String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("apple"));
assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser"));
}
/**
 * Runs the CLI against {@code test_recursive_embedded.docx} using the
 * {@code TIKA-2389-ignore-init-problems.xml} config and checks that the captured
 * standard output contains {@code embed_1a}.
 *
 * @throws Exception if the CLI run fails
 */
@Test
public void testConfigIgnoreInit() throws Exception {
    String configArg = "--config="+testDataFile.toString()+"/TIKA-2389-ignore-init-problems.xml";
    String inputArg = resourcePrefix+"test_recursive_embedded.docx";
    TikaCLI.main(new String[]{configArg, inputArg});

    String captured = outContent.toString(UTF_8.name());
    assertTrue(captured.contains("embed_1a"));
    //TODO: add a real unit test that configures logging to a file to test that nothing is
    //written at the various logging levels
}
@Test
public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception {
String[] params = new String[]{"-m", "-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
String content = outContent.toString(IOUtils.UTF_8.name());
String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("[\n" +
" {\n" +
" \"Application-Name\": \"Microsoft Office Word\",\n" +
" \"Application-Version\": \"15.0000\",\n" +
" \"Character Count\": \"28\",\n" +
" \"Character-Count-With-Spaces\": \"31\","));
assertTrue(content.contains("\"X-TIKA:embedded_resource_path\": \"test_recursive_embedded.docx/embed1.zip\""));
assertTrue(content.contains("\"X-TIKA:embedded_resource_path\": \"/embed1.zip\""));
assertFalse(content.contains("X-TIKA:content"));
}
@Test
public void testJsonRecursiveMetadataParserDefault() throws Exception {
String[] params = new String[]{"-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
String content = outContent.toString(IOUtils.UTF_8.name());
String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("\"X-TIKA:content\": \"\\u003chtml xmlns\\u003d\\\"http://www.w3.org/1999/xhtml"));
}
......@@ -374,8 +448,67 @@ public class TikaCLITest {
public void testJsonRecursiveMetadataParserText() throws Exception {
String[] params = new String[]{"-J", "-r", "-t", resourcePrefix+"test_recursive_embedded.docx"};
TikaCLI.main(params);
String content = outContent.toString(IOUtils.UTF_8.name());
String content = outContent.toString(UTF_8.name());
assertTrue(content.contains("\\n\\nembed_4\\n"));
assertTrue(content.contains("\\n\\nembed_0"));
}
/**
 * Runs the CLI with {@code -J -r -t --digest=MD5} on
 * {@code test_recursive_embedded.docx} and checks that two specific MD5 digest
 * entries appear in the captured JSON output.
 *
 * @throws Exception if the CLI run fails
 */
@Test
public void testDigestInJson() throws Exception {
    TikaCLI.main(new String[]{"-J", "-r", "-t", "--digest=MD5", resourcePrefix+"test_recursive_embedded.docx"});
    String json = outContent.toString(UTF_8.name());

    // Checked in this order; the first missing entry fails the test.
    String[] expectedDigestEntries = {
        "\"X-TIKA:digest:MD5\": \"59f626e09a8c16ab6dbc2800c685f772\",",
        "\"X-TIKA:digest:MD5\": \"f9627095ef86c482e61d99f0cc1cf87d\""
    };
    for (String entry : expectedDigestEntries) {
        assertTrue(json.contains(entry));
    }
}
/**
 * Dumps the static config and then the current config through the CLI and, after
 * each dump, checks that the captured output names the POIFS container detector
 * and the executable parser.
 *
 * @throws Exception if a CLI run fails
 */
@Test
public void testConfigSerializationStaticAndCurrent() throws Exception {
    String detectorElement = "<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>";
    String parserElement = "<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>";

    // NOTE(review): outContent is not cleared between the two runs in this method, so the
    // second pass appears to see the output of both dumps -- confirm that is intended.
    for (String dumpFlag : new String[]{"--dump-static-config", "--dump-current-config"}) {
        TikaCLI.main(new String[]{dumpFlag});
        String dumped = outContent.toString(UTF_8.name());
        //make sure at least one detector is there
        assertTrue(dumped.contains(detectorElement));
        //and at least one parser; Executable is checked because follow on tests of
        //custom config test that it has been turned off.
        assertTrue(dumped.contains(parserElement));
    }
}
/**
 * Dumps the minimal config for {@code tika-config2.xml} through the CLI and checks
 * that the whitespace-normalized output contains the expected pair of parser
 * elements.
 *
 * @throws Exception if the CLI run fails
 */
@Test
public void testConfigSerializationCustomMinimal() throws Exception {
    String configArg = "--config=" + testDataFile.toString() + "/tika-config2.xml";
    TikaCLI.main(new String[]{configArg, "--dump-minimal-config"});

    // Collapse every run of whitespace to one space so the comparison ignores layout.
    String rawOutput = outContent.toString(UTF_8.name());
    String normalized = rawOutput.replaceAll("[\r\n\t ]+", " ");

    String wanted =
            "<parser class=\"org.apache.tika.parser.DefaultParser\">" +
            " <mime-exclude>application/pdf</mime-exclude>" +
            " <mime-exclude>image/jpeg</mime-exclude> " +
            "</parser> " +
            "<parser class=\"org.apache.tika.parser.EmptyParser\">" +
            " <mime>application/pdf</mime> " +
            "</parser>";
    assertTrue(normalized.contains(wanted));
}
/**
 * Dumps the static config for {@code tika-config2.xml} through the CLI and checks
 * that the captured output does not mention the executable parser package prefix.
 *
 * @throws Exception if the CLI run fails
 */
@Test
public void testConfigSerializationCustomStatic() throws Exception {
    String configArg = "--config=" + testDataFile.toString() + "/tika-config2.xml";
    TikaCLI.main(new String[]{configArg, "--dump-static-config"});

    String dumped = outContent.toString(UTF_8.name());
    boolean mentionsExecutable = dumped.contains("org.apache.tika.parser.executable.Executable");
    assertFalse(mentionsExecutable);
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.extractor;
import org.apache.tika.batch.DigestingAutoDetectParserFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
/**
 * Tests for {@code EmbeddedDocumentUtil.tryToFindExistingLeafParser}: a
 * {@code TXTParser} must be found whether the parser stored in the
 * {@code ParseContext} is a plain {@code AutoDetectParser} or one wrapped by the
 * digesting factory and a {@code RecursiveParserWrapper}.
 */
public class TestEmbeddedDocumentUtil {
    //TODO -- figure out how to mock this into tika-core

    /** Looks up the text parser when the context holds a plain AutoDetectParser. */
    @Test
    public void testSimple() {
        assertTxtLeafParserFound(new AutoDetectParser());
    }

    /**
     * Looks up the text parser when the context holds a RecursiveParserWrapper around
     * the parser produced by DigestingAutoDetectParserFactory.
     */
    @Test
    public void testDoublyDecorated() {
        Parser digesting = new DigestingAutoDetectParserFactory().getParser(TikaConfig.getDefaultConfig());
        BasicContentHandlerFactory textHandlers =
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
        assertTxtLeafParserFound(new RecursiveParserWrapper(digesting, textHandlers));
    }

    /**
     * Registers {@code root} as the context's parser and asserts that the leaf-parser
     * lookup returns a non-null instance whose class is exactly TXTParser.
     *
     * @param root parser to place in the ParseContext under {@code Parser.class}
     */
    private static void assertTxtLeafParserFound(Parser root) {
        ParseContext context = new ParseContext();
        context.set(Parser.class, root);

        Parser found = EmbeddedDocumentUtil.tryToFindExistingLeafParser(
                org.apache.tika.parser.txt.TXTParser.class, context);

        assertNotNull(found);
        assertEquals(org.apache.tika.parser.txt.TXTParser.class, found.getClass());
    }
}