From d52fbbb0409df15dd633a81fd871aaac08162321 Mon Sep 17 00:00:00 2001 From: Trenton H Date: Mon, 24 Oct 2022 13:16:14 -0700 Subject: [PATCH] More smoothly handle the case of a password protected PDF for barcodes --- src/documents/barcodes.py | 49 ++++++++++++++---- .../tests/samples/password-is-test.pdf | Bin 0 -> 8398 bytes src/documents/tests/test_barcodes.py | 39 +++++++++++++- 3 files changed, 76 insertions(+), 12 deletions(-) create mode 100755 src/documents/tests/samples/password-is-test.pdf diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 13e78e1813..1f5e33d376 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -10,9 +10,12 @@ from typing import Tuple import magic from django.conf import settings from pdf2image import convert_from_path +from pdf2image.exceptions import PDFPageCountError from pikepdf import Page +from pikepdf import PasswordError from pikepdf import Pdf from pikepdf import PdfImage +from pikepdf.models.image import HifiPrintImageNotTranscodableError from PIL import Image from PIL import ImageSequence from pyzbar import pyzbar @@ -120,7 +123,9 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis pdfimage = PdfImage(page.images[image_key]) if "/CCITTFaxDecode" in pdfimage.filters: - raise BarcodeImageFormatError() + raise BarcodeImageFormatError( + "Unable to decode CCITTFaxDecode images", + ) # Not all images can be transcoded to a PIL image, which # is what pyzbar expects to receive @@ -132,7 +137,7 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis separator_page_numbers.append(page_num) def _pdf2image_barcode_scan(pdf_filepath: str): - # use a temporary directory in case the file os too big to handle in memory + # use a temporary directory in case the file is too big to handle in memory with tempfile.TemporaryDirectory() as path: pages_from_path = convert_from_path(pdf_filepath, output_folder=path) for current_page_number, page in enumerate(pages_from_path): @@ -150,20 +155,42 @@ def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], Lis if mime_type == "image/tiff": pdf_filepath = convert_from_tiff_to_pdf(filepath) + # Chose the scanner if settings.CONSUMER_USE_LEGACY_DETECTION: - _pdf2image_barcode_scan(pdf_filepath) + logger.debug("Using pdf2image for barcodes") + scanner_function = _pdf2image_barcode_scan else: - try: - _pikepdf_barcode_scan(pdf_filepath) - except Exception as e: + logger.debug("Using pikepdf for barcodes") + scanner_function = _pikepdf_barcode_scan - logger.warning( - f"Exception using pikepdf for barcodes," - f" falling back to pdf2image: {e}", - ) - # Reset this incase pikepdf got part way through + # Run the scanner + try: + scanner_function(pdf_filepath) + # Neither method can handle password protected PDFs without it being + # provided. Log it and continue + except (PasswordError, PDFPageCountError) as e: + logger.warning( + f"File is likely password protected, not splitting: {e}", + ) + # Handle pikepdf related image decoding issues with a fallback + except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e: + logger.warning( + f"Falling back to pdf2image because: {e}", + ) + try: separator_page_numbers = [] _pdf2image_barcode_scan(pdf_filepath) + # This file is really borked, allow the consumption to continue + # but it may fail further on + except Exception as e: # pragma: no cover + logger.warning( + f"Exception during barcode scanning: {e}", + ) + # We're not sure what happened, but allow the consumption to continue + except Exception as e: # pragma: no cover + logger.warning( + f"Exception during barcode scanning: {e}", + ) else: logger.warning( diff --git a/src/documents/tests/samples/password-is-test.pdf b/src/documents/tests/samples/password-is-test.pdf new file mode 100755 index 0000000000000000000000000000000000000000..b16b023c33cd3caa086db4f624ebd681a3dc62fb GIT binary patch literal 8398 zc-oD52UJwavpyLalAuIEM}mL?lLMK7VI)dakRT2)z(|H636dluB01+IL6U$1A_@YM zRFEhVMUrF$$^OCJx4Uosz5jcC&b_z$tLm<*uCM!^(=Dil#fd@0kpRJ#yp|U&wJn(d zC=d*EFuMbgmIkR4?5#-FK-e*&1Hut)Nd#vQ&K6H1ULs``TuE%vGC8PpymS2%r-}Qz&S*YWXOohcDPi zvO5-!hrQ{JunPDVY*TWmo>bQmxD=##>0ks~WM@J$C7I=wT%FYkI5n&WAlO^{>EUth zBq#vv&*&fs$X^)Zw}_GdL&Sf|AwUQij70w(ztop{r*XP=a@Dx)sQ22k$`{rjyv*Uz zUHK-r%6MOH@$^*dW^2H!pPOT-%)qCRtMH|J&;8>M2Ulv|eZBs2?Vb9})r5km&{{t6 zN;+dXf3pkTK~Hvneueqnu63Gl>MXW)3Y7neUoYj*r|ZumVmdy@MMi_}3u{}J5N4uf zKiN@#V9mZNcW!?^M?N8&{c+qG*?W4<&qIhGHa^X@G;NhIXmn*|OVOO5n=ys>o}$u@w5E2#7_)7k zR&G;ThUGrUOBy3xSiQ-xG4(y9SB?b!$d2K-;h zV}0sUO^4fP2~cLqS6;+|V18MyjoVBsG6IZEb!y2y3Y`F&+sXN#P1Lq-J2=|^INNx% zzm{|9y$$)(9d*jZD>3JpMGb*mE?`@^bRX$*d-aG zOvA{@4tkbdROxM+piAXCt!LGD;dc^qHv{V;gMU;PG)u)boLTX8x)bVX*eQSb{L7?L zTrCA1Yf>ncqPogZp*wBa)#UQl`SuHkISy+5_cdLHOA9j-(D7f^E0rT&I|4k z?**gsRtq!KST#$pkb;0+M?k?{j8JJ@c=6uT;XqDyYrc&fMFTzRUS7F0kH#ZUt#bz= zIc!CxFFR3s8TFF<@=1}sy%YJoZ9^u~H@{5le;hGjqRi|Ha%BaT z8lY+4Okcu0c%`2`&CqUEnM*U_g;^zWI2=|NRxMOUf!VPalj#N{n~|g)%|hGBBg;2IO1t-aHO90dH3LTm zPTKshPEoSSSksL@x@s-n0u+c3vccOKJop}x=Wto_RUoDz+4iBwt=mqUkC>lodZrTI zMw%*B45H%cp9wIn^ch>#SF+ik<}6rIg63e$#pXPfHN@5thKsnX_KC4M^89EH(lx1O z&j(&=)sn;bOwHa2>3rS0Egzo6-#TY~A5RuwSd+3uOOtT^=4_T&ebwE2LtwyXUwi8M$OEtr#5o6gf;qWRHk`dURH@yH;^#b2}-%AWT{&K zD2wy1Ygh9=>o&Q5kddelm2-V0t#ecL^K06@;`CkJXM73DkCAtgd{sh}oHtMdkk6@W z3nkXiEHq5-ljFeeXKtr)e-mk064oZH;4St#*Q zkJJ3~?G|+kZ}Yo~XwcPPz|-d2pI;(-+@Fmon5Y_7Le}|8hQ=BmEr|PWCZRtVt~C?e{rm!a zlNK+B0~h#(UywZt)w}qG*2rvqB$P7wtIOvcNc+YSawsdtPG3p-D}aZ&lbw?#Hfo#W zwjix+^WLPWe(41^Ci*?EI>tv??}{+WJ@4bLd^3elPqu-BA^M|-jz3t{KB>Z3%N7#u zCdEZ2#wah1jWSzaSgSNYLZ7;R+RNj%U&oCAQGMovSet<%Vc%jl@sO=5N$bVv?t^{c z8&fWS&AO4ErfF+3kqCK9C7^avk}=)`KY! zEO2YBjcVqHCv3j{xvFRwbD1iSYvNk^*8^^j$!i`Hm565`@rQJkaiK%QLN9Woo9aV+ z(qabvu#8%amlUsV>g&EU*ASkz;%46l*aZP-s9UAfnVBK8Npbn^Paa_MT8RNe=cX_4 z*vjZ}yWsntuChc0nEDiWp+2r4o>%fMHG_uNrIax5)0gIYGo$Fk`?e%i8!v^aQS)4h zpFM9Ox%@@-`m@&2Bll{k#5QJR7{Bz{At6%s?twg4m-4NB5u-eOQHu5Fv1f`YM{$fr zN9-O^Wh>E*OeT@pTO;wY*VZWUj`(x4cl^dc?YIG;^PWGF4*r@(T?7T50e zNKgl0h5NR;e83INN{-Ty$gqjDM4j1>M3C4hX;d{{U&%V|aG25>?L4|KaD_0VTQ?k% z(F8xsoevvp3q~Ikv(xSgr6|(4I91Pn;x| zzP1TX-x=BwW$(Ny`o*a9j5vw;&_Yd?A)~|d@XGxME?H0p7Z<7R3f>5Z_Ma`0aD~16 zIw-8Mm=mm&U+W&X^d@jHtfD3T(DTBoYO=pfpEK^qB;fA;|ZQ_N`~D zSF=(nMHheay;v)F#Fj4KyWJ>bK1xjXE2e5^|$?GkL2pcCEI8 z0Fa^{uPKEINR=KZxPEvZsK4}U_pF44chn44gwJ@6fF`;@W3mhjag0{vlm zv#@dx!m{2_3fc)yd5mjHG3@ZP+LY+N8g8DTc3}8JCEKqR7t=R-zB&_`@Ci*Fy6A6} zo~O}x^9r}y#t)5_+=ZWA3suDHTSjNif@cP`v!hKT44?rb2ZbN|lX0 zQKK#X0g>;7p1&-VyBefmGwFBhTmQ=v-f0ot0FlEA;{+AsBh&iC#*$SMZ2xrd~0w%p_r z^!^#NX;nx!*Dk3r$+*b!HZ61Z(co;QBU@9aeO;P1`0e%f>eriQ+!^Poxks6}f&S7o#j(Vcyx~6CB zM#j?U<50Aw|Bd@s17&A8BnOvL)E?gzXzJqsXuwztmUp<~701zff9#TMtlXI}?N{)t zAm~|WWRA1aoTT)M@e6}`9$NkP^6&b`Z62rDl$ZL1PPxYMu2EtQZKtk}2e^2?e%^m; zvZ!+;)fFlFns#A&*>H1}wYnp*-fl`eHZ8(%(|Q(v^IIz5L0YQZjh>K$l3kttzL$(l z^V1`vqL<>gpgy{{4(EMBg5cXybY-JMVP`jnf21dG8}0+-BwG#;ZcUzXJ3_V=Q%?)c z=VN#had}n@6Sm_ehK&Z&h^v|+JF3W3aXYbvw8Q+3-nj@q5|zp)I;l5bpjm5yOHN|q zlXvwD{Tb&_Zk?x3t+5iEepU#cTQT z($~xUk=3h5R5S{(c7A%742nxKDPD{weYe_oR(Z)VU%vJyWMVz@eb;I@Gdl-YgfPrL zB^K_fD*lWg9ZYg58@FnsJXjVdn)reyCV?$`myTk~mLHJ?m^z2lM29P=({5DaSeLp` zxEFDP`Wqv(3ywcaIUP2(XCyboTGRzmtaTxqp&(dU?h1u8`Hj9PwsN$2|qVWCR;NPo@l4Z+b^o;A~dHz zCYJZwPj1~!%T;SHMm-61<@l-2AKq(L?IfHpQ(w@)NWAS5oL1AZZOHB{_n~iWd?-dZ zv?F@??o9a(jP~%~gZ9nu7LS}8(rDIJY*_%4)vcTzk<^Az#$ps% ztiE_Z>0Y>)<^(ANJZUb%K5Q0eyf^xMB;&gf{`KnBD{oR5zHtsSyI|fACRDjR5=`g! zwSTxn$$&|9j7^U9Dk?4zZP_~=v|6ERCZ<0yE~mXFZ4}24u>=@kTWRo!lzaU0{PNKd zUtY(4#0!L>sc-HB7!+4fWX9%@BB{p1(WoOx74MB7b!e-kR9Ec^bFXf1>VR$yU#p!? z8_*&!=RRx6KP;iHrk4zz*bq2R_E68oAI)5qb#zF6ZmZd7Y#57saJ`oqZEZN5j!U<3 zi*48P_cLpoOo$qci5hd6clwX|Y!tKOaIi%GRACg1#TNiFy|YE^ih z1(``6TnsIon7PnSRJoP*EQD<~KIkKrOUXC-pIoFiHIYpB{Cnn3&&kdOQYKw?CXPLS za{0{7t-U5=J<)q&ccUcPBeT-889Vi+*`$g5dkJO^ZPcu{Tww}oBec&}r7;TbFya7z z^&K!jX6hZ+N8(HTDPz^em3DYFjNZ=O^3F?xdmf$}t?xQ?v93H!s354d;yeKI2$4)v z8w>!Pp|8MtPF`?}y&XzU{?L-lt@h)B1uBA=hqgUGWl;&#TT3hLOdaTagxAkbfQpq? zIh9bzrTEJGEgMdAtfDEnv+d(^^9H(in=Mx86Flf@12%}i~XzTcsy1wPm+Jl@8A{=rcfIf zvz8t+qpoO5x2usVE1i~EvxC0~nAcvfHK;66*c9~wSjrVEwvo>}aZ$HQO>~7>)~r8X z0ddW9sPmSPS|e@y7*5(OHPSMc71Y$EnxTEe*;R6BDkf`ffQ*caMOe7z`4fAd5RNZb zsw{QFY>h|gmxW&;QZ^0~gL5QIHHxz@{dmT>_CCR)D?Oo>p~ z$@x21iVD;WUTuywi7jyw0~%Zv$k2<8!AXP7ELIlE~` z#Zlq{x%noPf_di$`SNZ~0y~RkGwuWYgm+(BHDAsYa51EO(zxEmm}GkaL(1tN?==kV zh3~9&4#aMT%bR-EX|5Oe=q%pW$lvJ6mjRH_?=O( z3Ok(q+9lVGeZ#D*rE{1`e?#42ZQtydX=iIBH*1MiTi>_JN0nqVsJ;c5e&!dx*uHv$ zx_y>J2(xdFfa|sifkA0^;@1LbL=XAyKF7pkz4;lrHcLHKvK^(!$;d6 z8m{>orG=iOz`R62mrdsFBp`j`|QW$d^wsWNgYkoV=%Z{f^j{J;%@2r$IaLx&TB(fT` zZTt99RM|btWc^snsF%85BULW9NXK#-u?_Z-HsiJ4Wl-`m6Tszd1@!XA z+6^*E8ZTyu_>tyWi5%=LAUSt z>;APvp0@Tri&B>UJNMH$lT+RZ->jdi$Oc!1*R|x&k_NNfdgiHhG`k1FJqva{tHO#t z=CjmXdsxU@Es^d#gx1}DBtkOKeRt2;F^~IMAqi7Oj4B-WxOl7Fw%f(;MMl`n8OFZBl%c2EOjKCm%};F6j`+7%<^;k*Hsx*WWL6n`8_t?@JXTAZ zz0Z1uY|&m^MK^S7QTc-hoV>mVX4RGRV7&-uMFd{Hgn7Ai+3%+Ib^*;irDZ4<3yq3=@PEX9DciD-XiG37pQfZv&jru|7fLBAe=kpSJFQ1cgb=eY$Q;)pvzDkqXgr8=Buk#AXbb9&_E)e2Ms02r;tHV^w383tJTy=HA5Qm?f1czc z!C=H+AnNbqeLYV{0tn||Pr?#h%$G4*Fv5$~u>AX-_IPPza|3rjqrbFrdIIJmQE2r_JXa6MPLx9dRoYlC;uf8dS8f1n6R*VT-4($I5uCHz7a z@GgX3lK+t@L}wS0qBY+6B-!ftKkJZV>gW?KNY*Y!U?2j41j3Km14ByyQPAUjgTX); z4EE~_MI1lJd^(YzJip|}`X{&v;9tB2qJA+2q~~zmo_In4Ao}0bfq?%G3kLy8{3eI| zLw-Cs@K=Z9F}0i>%ykJQBajvr2ht;WkWBttw|`BR7T)UQ<$OYN_^*pI9fFI4tFt-5 z1^CN^4oHJwLByY|oD%~G3559Z+L3^vz+e;_1qY(x5OFkG0)>R3P!b3z+61I{?9K6T zF2Iw^NS%M!hW`iKE`Z+*0{*rL{6CpInO`-cg^LmJSO1fyqJ!(nl>9Sr=)bHh;*S@- zgVi5i{1H0hEG>aSDJg-G7&sII!(tUNP$f805ru@}6tQsGU#YTiH7ER+9tr}7Dxi<`Ajf*( zKlGF&zz`(~2vh=$gW?oW5HK8q0xQCyI0=LVQb7R&K`ThW(8pu_GI{Jb_IOhf1~&8= zJ3D?^ishl%mVK9qcq!4=QiIV4i~Q5yf28Bj9Z8pjx3|DMTYzo=q5rjEL7-@m4iE;? z6t4f&J8z*tj1yw9lQq$Cdf}JjarWSr-wxaA%8mp=-Bbj0U06QU!U`*$Ei}gb*jl|= z=aB4QMFgY;6oY`F=zlok;Y_dufPoMM0Q^@0A`u7}0%!^R6FXi_$5i+wu?PO82a)&< zgTf*I#!xUg{C7R%Z+b`=;y++0=x-PT`8y`@yDuon@z(GghD7{^!I8iBMIe#CVJP@N z?2?@E$NwDXlkDjd?-Guu52WMZaLjq=A52!VH+S}QJZ}HVY!!P;2jDM`o;0vPBcSy0 zTq%ITFdPyGgJLmC7#JFafKh-*z$6r~I0zJrfWyH!z^{KD-kJ0(?Z@j`0w5@;q=^IkKZYu