From 811bd66088bb9936198e7b8b370fd5590bf4100d Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Fri, 18 Apr 2025 11:38:36 -0700 Subject: [PATCH] Ok, restart implementing this with just azure [ci skip] --- src/paperless/settings.py | 8 ++ src/paperless_remote/__init__.py | 4 + src/paperless_remote/apps.py | 14 +++ src/paperless_remote/checks.py | 15 +++ src/paperless_remote/parsers.py | 74 ++++++++++++++ src/paperless_remote/signals.py | 18 ++++ src/paperless_remote/tests/__init__.py | 0 .../tests/samples/simple-digital.pdf | Bin 0 -> 22926 bytes src/paperless_remote/tests/test_checks.py | 29 ++++++ src/paperless_remote/tests/test_parser.py | 91 ++++++++++++++++++ 10 files changed, 253 insertions(+) create mode 100644 src/paperless_remote/__init__.py create mode 100644 src/paperless_remote/apps.py create mode 100644 src/paperless_remote/checks.py create mode 100644 src/paperless_remote/parsers.py create mode 100644 src/paperless_remote/signals.py create mode 100644 src/paperless_remote/tests/__init__.py create mode 100644 src/paperless_remote/tests/samples/simple-digital.pdf create mode 100644 src/paperless_remote/tests/test_checks.py create mode 100644 src/paperless_remote/tests/test_parser.py diff --git a/src/paperless/settings.py b/src/paperless/settings.py index ac5f675dd9..249423d547 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -1409,3 +1409,11 @@ OUTLOOK_OAUTH_ENABLED = bool( and OUTLOOK_OAUTH_CLIENT_ID and OUTLOOK_OAUTH_CLIENT_SECRET, ) + +############################################################################### +# Remote Parser # +############################################################################### + +REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE") +REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY") +REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT") diff --git a/src/paperless_remote/__init__.py b/src/paperless_remote/__init__.py new file mode 100644 index 0000000000..5380ea5ac8 --- /dev/null +++ b/src/paperless_remote/__init__.py @@ -0,0 +1,4 @@ +# this is here so that django finds the checks. +from paperless_remote.checks import check_remote_parser_configured + +__all__ = ["check_remote_parser_configured"] diff --git a/src/paperless_remote/apps.py b/src/paperless_remote/apps.py new file mode 100644 index 0000000000..8cd3199f98 --- /dev/null +++ b/src/paperless_remote/apps.py @@ -0,0 +1,14 @@ +from django.apps import AppConfig + +from paperless_remote.signals import remote_consumer_declaration + + +class PaperlessRemoteParserConfig(AppConfig): + name = "paperless_remote" + + def ready(self): + from documents.signals import document_consumer_declaration + + document_consumer_declaration.connect(remote_consumer_declaration) + + AppConfig.ready(self) diff --git a/src/paperless_remote/checks.py b/src/paperless_remote/checks.py new file mode 100644 index 0000000000..ce72ebcc82 --- /dev/null +++ b/src/paperless_remote/checks.py @@ -0,0 +1,15 @@ +from django.conf import settings +from django.core.checks import Error +from django.core.checks import register + + +@register() +def check_remote_parser_configured(app_configs, **kwargs): + if settings.REMOTE_OCR_ENGINE == "azureai" and not settings.REMOTE_OCR_ENDPOINT: + return [ + Error( + "Azure AI remote parser requires endpoint to be configured.", + ), + ] + + return [] diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py new file mode 100644 index 0000000000..03b53793c1 --- /dev/null +++ b/src/paperless_remote/parsers.py @@ -0,0 +1,74 @@ +from pathlib import Path + +from django.conf import settings + +from paperless_tesseract.parsers import RasterisedDocumentParser + + +class RemoteEngineConfig: + def __init__( + self, + engine: str, + api_key: str | None = None, + endpoint: str | None = None, + ): + self.engine = engine + self.api_key = api_key + self.endpoint = endpoint + + def engine_is_valid(self): + valid = self.engine in ["azureai"] and self.api_key is not None + if self.engine == "azureai": + valid = valid and self.endpoint is not None + return valid + + +class RemoteDocumentParser(RasterisedDocumentParser): + """ + This parser uses a remote ocr engine to parse documents + """ + + logging_name = "paperless.parsing.remote" + + def get_settings(self) -> RemoteEngineConfig: + """ + This parser uses the OCR configuration settings to parse documents + """ + return RemoteEngineConfig( + engine=settings.REMOTE_OCR_ENGINE, + api_key=settings.REMOTE_OCR_API_KEY, + endpoint=settings.REMOTE_OCR_ENDPOINT, + ) + + def supported_mime_types(self): + if self.settings.engine_is_valid(): + return [ + "application/pdf", + "image/png", + "image/jpeg", + "image/tiff", + "image/bmp", + "image/gif", + "image/webp", + ] + else: + return [] + + def azure_ai_vision_parse( + self, + file: Path, + ) -> str | None: + """ + This method uses the Azure AI Vision API to parse documents + """ + # TODO: Implement the Azure AI Vision API parsing logic + + def parse(self, document_path: Path, mime_type, file_name=None): + if not self.settings.engine_is_valid(): + self.log.warning( + "No valid remote parser engine is configured, content will be empty.", + ) + self.text = "" + return + elif self.settings.engine == "azureai": + self.text = self.azure_ai_vision_parse(document_path) diff --git a/src/paperless_remote/signals.py b/src/paperless_remote/signals.py new file mode 100644 index 0000000000..81955a4794 --- /dev/null +++ b/src/paperless_remote/signals.py @@ -0,0 +1,18 @@ +def get_parser(*args, **kwargs): + from paperless_remote.parsers import RemoteDocumentParser + + return RemoteDocumentParser(*args, **kwargs) + + +def get_supported_mime_types(): + from paperless_remote.parsers import RemoteDocumentParser + + return RemoteDocumentParser(None).supported_mime_types() + + +def remote_consumer_declaration(sender, **kwargs): + return { + "parser": get_parser, + "weight": 5, + "mime_types": get_supported_mime_types(), + } diff --git a/src/paperless_remote/tests/__init__.py b/src/paperless_remote/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/paperless_remote/tests/samples/simple-digital.pdf b/src/paperless_remote/tests/samples/simple-digital.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e450de48269ce43785b8344c63e233a1794abae6 GIT binary patch literal 22926 zc-ri{byQr-(g#WiK?V!%Ft{@e?gaNhaED70El40h5`w!E+=3G{5G=Sq zl5_68=icvrPuBbAt+&ozvuE$_uCDr3@7g^*)m8Lr(z5Je4n6>VXJKbk=cmpb02h!G z=xAmG5D@_>!yGK(RzU8D8g-DYwH+Mh0+O|Zz+uubsG~VdObp-(cY#6d0iK!cv4emR z&Zjcxm#_4QlpX8}kR{tL0tpS}9~&ic6i$!Fi~>*2$uup%5rW5vDJRf;j6qA%Z1nm{ zGZVxhT6U(y*Sm)5m38FCKiqTEc&n8RQy`QPoTRi3JBD}_Zl_`tv`uNp%CM$fa-r-a z+DL76!`+g`An8{HDuwN6Eb0vs7K!qJ%d%Ix^YRh}p;BSq{9hdOuMdf)J*TSzPESzY z)PU9iFbDHL0YBXR(h_+JL#&vq~n%qhUd`Pbpt5cGnRZDcJw z%GSUPKG9LtkkgMdb=2GGedNpDa?Z>A*q8V?Z$-WYI|N0ZBY z_NlOuJ?;~n1g`;;jXqq`!$-|D=nbpVRc>5eRUxsWHPus}faQ{e{Lo$d<-5AIp`nA8 zR!1@6(k{)HR{cDykvu{qFNMp?o=Empp#_bPH8Vq8Rl8+FRdG6Qw{(4de`erFS2`V| z%+UHHIa|~4kqR_?m=3)ldkMc?|fMb!QFP;;Y z(M!^mJ(l8ViOs0BO3jxOGxF`TBKfP z-M|)FT^7>jaq5$!*C-H%vV^-FZ-W81RUrR({7##ebbMdR6kD2xu(vIN?WKJPZ1o*DTuHyn?fGn z`ZZ@~X9+hnesbLX;nX-cuV1IgZoua__c_n&Hf(pAZ&G-d{A0Ox<=d9FmIVvmWU`Zn z*WdCx5LNuM^}Ai4**qb+;Yh0@#Mlp9gki&lY@VbrFYLc{`?~0_hl_eQlTwDj_vpRZXuYWQx!fp>E(K zSxt_uAX&jnlAm_c0FA)9MxYCi}|w_I~cLF7rV)sq3eg_Fvn+ovLi9$f>O5kkHoI z-Rl@KK`n-g-|p8Z1xogpPv3{Bscp`gZy(QX!`z|oEG-t?+eO!M(8x_PDXY=FuZJqM zh4DNf^^v_%46z_F_ffV+$FqL#0Ajw2q{&4;BNx4cZsW9acl1>R^+S`W#ZOCi+~zP5 zlP(5bLN+YzS&IUgfTtl1?aRK2<|Q7Ec%0)-X_x5>^I=s%SCZ%5zztzh;y4-tfn6!_nMb4lXtoCGkAXC)B6dD@N;+Ulk3G|{cx3( zh8Yp}WgxJwe0ga~c-^Q9>9vKLYnjcAm&Ffb*EtNV+=|j^RO8{HYc;hvr1qG=W;Z;s zjT;-xC2ux?g7J3yMLN%RWv+4J@FsnyusH(3S2<8kmEO{K+)(zoe5;ao6L!a6$5RU4 zNPWr`kj2Dgx3LTlo5$zk5y-L6ik7yu&X)pt?ol_6P!gK=uNBpg{d~{D+HK0KJ7IAqjpV4u|0!FdBK+;VLAcy^x44k8OZ*vZN3*?T1soZy&ttBbDL0_+`Yq*17z%O|wrktRO9Q`JBH+mN+dwKz*$t*-TxJ@zQ2iY#>CW{paf+eJ z+?B6zh{e3gQ46fOcz^}H^*ky$?jM2$U!z8lYo*h|3bo`X3auxidn)xLl^6)h=wmd2 z*7at?BIrpcoa&UFu6B>&q54e7$80vI;GfQ^+3fK{){)jp@#8p@#3H-iPJKSgQ6`q$ zKKTXO-m9NeJ7(=}Dn5q-DUzx#JGlL+jO@BzzH1KvhWMd>>Th}SxPM|G>`blP(A=@; zgiz5@Vd9uclu8aiI&f!V)+Y`Fj>vH~=cE8o@`q%PEsvXnHFA`}v z>M~20#kTQE0Mi6AiHnJUNU9PP1X{UmL2OB)U3edt^odx)GE_rW01|2t?G%F(9up`X zG%n>+p*vfZDvq*-1kHcVt8hf7w2bu5nJdR48BUAQp}0 zAK7;4k<*tj6*?XoC3ofi@|ifDdu?s}hnv@?BGb(N@pjIt8l`n9_~}?zHuBgy`I29$ z6;s%NMaZn0qk&Iedy658d&n146Fn4O)3+6YF`y_*4&J)b*bf=(txfr>8y_vEj5a`R zJzK76u(dcdsEUPynfWctd)*ZFql-5QF48RJB4YM1 z8Is=8sc^9z{eVe8Nd>3nlV*aK1nD?}c440hZnO_|C2Ay_36NNINyzjjNH}V;Ez@fV zVmC)!qY~vaH4x}Mh8koF1N#Ql0AEslu1fMK2af9kXRRvQx~zHAe+(WIIBn>YeLfZr z7Ek3pX)KjITs%++pt4C7?8aa;<#S8dE5wx1^k@3Q3EeCqNf1f;A{g zC%6??BpImHVMyPX?39!zNx#h+q}d@-kZV=BUe7J5#5q(JZ!fNAHhzzsF#<4sn!JAe zV^F8UUO(c=$_RsB^7`B4Lj4HMi>$D+0{&+)J70J>{Az+W`_Z-Fm{@ErgUx;7Qrb8< zljegU6ZL4?YHyY5n5v2op1zxkq9*G{8^=B0d7qj*zT;idd)z_lX@!YV6_m{zADLq6 zgSz{IxqYdSEcB5Dtzo@(1<$J(+l)-L-fAaS)*q}ovng9zNo1_-BIn?g%uqb&iLOas|?!h73^0nn)$hpf46pA zsxPdrv?sdd4S{pC44Mp!D-3;ZR90k1R&o2NIBHX_DMERjLd0=!n#O_=UNj31nQF84 z3JJ0T#9XorRR?Q&`bvv8gU0^E`%RqOJ8&L=P`GU~L_iAra{Q~C494U5YDSw!>+~ousdaNC? z?-j({B;FsMa`S5nx-+(YxGsCwv~$)*KU`|LhfbL6?V z^_n{&VG6hf=OH?++o`ZV`*@jtuZ4< zyX9&@S(vVUjLc};tsOp7kgvumTknzll)Clzna8p zY~twgv`8nQv8~zj(g=N~>1&=J4QBm`O}!nv8%6c-GXbxMV>M6 zBtn@}naG6iX7TG6!=aduq+1%C?^VX0@yk#*BVT4=Vq>uzUMyEF!B0BU?2r#G4S}UH&47x_*T^K@y>6`XsD}i(IDQd06kb zm`U~0TlTgK*4P@G5!kC`UzQT8P+rb?e!;}&=`^Xpx*zp9*7?5q$vVSw>Hd$mHCebD zURU<82vv{zp4j3oBRDU2KXL1c-1~#KYHsq8!S~?mpl2dfl)RwkbN2|J^ zSSE{GYo|Ukv-BN8I2=#gnIyAOsUik``})~9H1}<){NOgX*+|sk2UKzVRZhHl+74Nq(CD~R zYZN(|!zj-9qZh+G%NFS#FF9lOi_*zk(!RHRL0mCscxt&nR{k6*rp7qD^(NJ8pP6S1 z=&mg)$7Ja`Kt7UmthX7eohF)B{m7K@gW^?1@DK32&_Qt@8w2O1>UX=3`guRv^}6mI zUlSoaiZyg?*pzaT>syHm2y8d+k9Ji0UFSxh&)8YnR9YKd6u876i@jxL(|$LfE^+NQ zIIIc77O{)X6u@Zh71USUEG$>hODmNx|H8iks%7g=+*daup^;7O)v>BzDuNc2K#z^k zo9u&1*aNBXlb@vH)OR8>qa(3%ng+R}^XRQ0qNkFOwb~i(dtkO&j2Hl~`v=EeS%q#q zvV2EG&$D&f&nWI|sQvc$D9AfP4qz!JSo%vQ^O>S&-q#Vyu2n9SP1Z;3v#Q2Du2l*w zG+thFW-TjAQI}lf`I&;wRRe-f9sW8RiG%E~_LWG5gvq4qGHwIVb=*9{N*!!;>%N1! z((erJ13rk~XVeDt8~L(UDXQEfR<<9gTuC{=+aN2xPexw_9?^R+bISefPenZTB+Sw@ zB^V%AGv#GwSXN&V$){b`Sn+kC_mU3cy=ZjKLL`dEwbELVd>Jek6xFA`ANk|nWy-mV z*$J#SXp8?jlx>xNVq5E^>KOYc5i{pgiEdU-IYmwPC!763S|Nz#Y8Hk;Q|6I73{Qq> z38y^p1#*1f3gxwt)+qWZ2ch?=pY8$-` zJo7}kC>3NXF`k5(0tueVLXU1E`#c5cD@cvld8YX%(J88DFQnzIqUgU>O9KfaWp)Ac z!$h|DSEJIFKrL3gsMu*YIax_-y*%-xm(izTJtYi?k?;D))Od&$QL$EiBae@a@}ytW z#JFL)VOpkb4a&Zjd?rti*gk3wtd@HjlSLL0j~XNQx>OA@GT|+!?u-eeCt|4@4?R-8 z1O!+<%gskff(ia+#gqWZbz&pvkq2ccO22OJV>*3PF~|_F5p$<$F zUfP8C>oYo1rD?!*N5a_q_n>(E;5>74kyk?rw;lCDh%S#)m3TZ{LQF^G~V($f?KMMmDv*7OV5#F)mN$*4o%A|#3cVX5oTmRWP)RY(Jz zNjC#rQTjA3n$gThFFJPJiU$8xe(f_(PTaJYg`@fc@u<;e-S6!Z#^}mX2_;{mtHxA${Z$9E;hD8=qFGz8j~zs&hjZb*_jb3&Fz=(x>K56G(Z5&IYB+ zR-C4%#dE9f`dr()s4}Mz8oid)ajl3;BSEu5@k}IelqoVn;HxUGGE1~(P=(}T#hEM287aOdneGK^D3=5S^;4836WCh$MI&+2kP9ie64^Zr;rY_}jp?w=) zmCKm*n*x0m{0p)!KDcmP|3|0%+hxv>#}(NU#VVF;Wmd5(lV~P8a%%9$iYEC(y~gy4 zWg6v^KGC9yLyq` z3Qvw^Ofw4l(RD7yM*yC#duuPad-S!eaLYnDO(W=g4^C_~8z>sxchf>`cXA)jXrM^7o$ZM<>z@fLGz!A9*1UkgRk zOph2`?d0@6wA;mgx6vp?>(&rjL;FOv*MDXi{l%amU&7+c(Gy-O=tl)+V<=lb@UBoe zJNq5+&hu*j!Wj+kbd8)+!@w#2;_dFQl}YMmY394MLkyhH`BvJ;Bge*ag!m%r%PlnZ zxZM;J^1z)xJlWYCCv$lIh-)8I)Im7%H6gNVT*+#Wo z@3PaDKqFshL}PHeG}yA~%3&0P5b{gW?YxZayopxcg+OY*VP1okuR@5_h)=_v)nzMI z7=?2JB_82)p>H{b={Iw0%FVEQXx3dzK$QSDGvc z#an**<0E?;!8_tN8Aqm)-;!hT1cQhObBtmC%`Uy;zd_giRc3cV|n zB%b>isFo|~s%^(;$^lLnkN0mcr=}w|Rfk+f4=`KLM}2&6eR6%UdDcJNec1YN55GAX zoA3&2@Q_ftLk+Kc{;sa2{&IaKz}_DkpV>9^d^+bOGgFK?M>7Y>{}6M@YH9?{_0;vK z{(Jq_*7Gl4My!i-1a6SdCwjBA7uTkH1$V2np)=%bx7dOEk9HqtZW-SHNjS4EJ>PyaecZVmnm<2OH?`nCWsl;h(d(Gma&#DTbAlP+xI#JS zV+Kk&srb~B&pU0^OKO{JMLI-u*)9SPSt_!poho0rgY*kUOA=vE8AJ$a8w_@hOYkyO zNJ>yRZWvrpiuF9K*H(^s{V1b85*=t{dtozr0X{ew#eMAk+DT<1LLb&(XJzl>dwt^W zpW*ZIV=c5OCg9ZZMD8G9$t2o4P$= zKE>+yLNVtOfLD7Eu8_5mVNuY$8*Etb@vr(}vbp;@1XUhCRrXn{-L$;EU%z=-xRIJh z$RWO%!WT_UTo1+x&DR@=>2!3?QHosjZ}SkX$q)6UyySDe1LqnOL&cbC8Qw{MOnfC+ zoaxW`@INmAVUW}r{lDyc2>iLn!N(=Q|IbRne{FnddO5*BvW^aLX_zb2#o7t(=<>Vx z9AXaxNlN@WSe32KU@j22wWEUu%*EOQ@M~wp(iO-9_)RG(>F8;|&db9MWasAO27lA0DBef-s+q@_8gqUtl2<`oq2|Meo%l9eEtpc8?aUS+*t@TXLS}9}jLBdSeJb{ZuW=S8} zw0WPwv|0|u7W%p<_XhS5?@wCpQ<0jK6UO13_>m#Y_4ExT=RRv~l4f*vl{Q}ANAckL%%jn^4aB+je|LE!ZYv|-$9NnBA*lIkCJ;cG)=~oNX z3nZoS@K(87Lt*N2k{|`(L$3$Z2T463{@w96=)dCQ`4_$DRF=1>EKH_Am2QpDq~`vT$tGI3I^VN=YDUK|e$N)16Hoy{czPD->RVv;4eRq^;3e)_{@v>E zV~Z2apJ&1=jVNq5uAD269B&G)Zz3S;;^Y^g+Ph9LmirY7+am-J_%Pu8$i3-uZmJW) zg%;Ks`kioRz$^0WI5e@Nc8K%qcAR?;MXr}m+}nt^9~rMa{wBeR`q=r|&s@2{dsTw1 z9QxI`G0}>#XjmAp5?5m4PPZ^$9-U(=-C6GC*xfDOnv6gnuW#JZN=S<$sG(n9 zi-#h#B_X``@ea6G?Yz6&_dd#2~5CnnP_<=NP$SltY-ac~6Mf(uQH!A34NJWg$ z`S?I3keTx_VH>XQBU@9S_y|!agor>qCuEK`g3ph!SJ0XQ&m|;V+OUxVpFM@gt8GhY zuc8Rb1Y13IjfaxUBSZqx=tA)00po2i;vaEI5cVP0uMi7h{d^|Igb=nu$AnnYPNIgP z_>6=Jg&-GKGlWzPedn2|019cF*C=ZGGj6L#?{YB>5%+Q*8$Qa)MTdt_j-p?+@j6i~ zAUs}SAV!ihC0;MCfUkIz&|t?llF7JmOwq_C&U5C-;109PD*S$`;7K z!tROkrrqc8QAIHCA&xJyNt@35M=e2AD~z*<^D9KNj{@2_W*=WaMQVSFD=9&YioQyX zC>`AQ^htii+o!Z$NY&3Oo(gw8a+VNz3hKf}dnOsrNe37MI?Lm}4mJns$y2HaCIJcL z3DhHafb8-p<6$S%QhlggK|?^-KHQpcJD_miBOxR~366Nybu2H*2x`YZqMs3nc|ay4 zKnP8)JQL}wuy1)tOlU%gMUrxPa3;KgaF@I_H8D6MOX#cI2{o`LAX$n)jX?A9l*Fmp zWB4PRVEx>vQTn$O4H#~ci)z@DD5p~Vxf!EC9*mf09(mwxF&-?@V1_(qD^hQOXvAi2 z%BaW(gyyg}x#y#R56FJOEzeBy97nOO@UEWhJqySaC2r%44h3^!F|^U8B5FWnQ%Nks zzCt)sQ8q)Km{RNF6Nb>6GV2n)i6DRw=weO>el#`GrQ!=!G3A~`{1k2v@tMVO4>y8f z&!V4&->x7C5MzaPt}+QA$420-nhOw!0W3Sd9)bn1IRaHXN>!sW6v2<$LrrCzfS+-iP#=YI28YK#lczSv0ieP{NhR^} zVn(HiX_TLCqOyjvcRX5s=_K))w&|%DYJl`p8ho6fm$F^dxHxi8tzTkD6VebRL@+(W zryd3>;v~N$?b4HeoR2dWY$XK;3gPJB_`TeHnemdk3)Kw&2;m6zh!`B>8J;TDB*{jz zM7>0-jg#|q_vOJ$l`c>hRTm|$Y<|2h-6I7F^#o;kZ92x+DE)ExlH>8X^45hS>S(2} zCZ%_PrwKjs+=cYxgx>?WAEtG#2#%us43KPhlNzoqL_!h4K=)KGaacU%HaEm6kuD zHKCCyP%{5L6XL62vuEIf*%u2;{z*??g};{W&&O9|VZuzpQlx@KbIIuy{7{Q!(n(?* zKp_YO$c7c>b#Yi#I<>HigrqHig${BN`yHo&qJspJ&{K;R$=v=he{t+MM^0jHulO& z>jR-awtXm{ltJF!*mEoX4@7fB6rosBa(Qc`{o6cn1|cFpM59m+S)E+l(bH{NIBNsN zV%WZni5zym+$eNAV!L;{WLxk=Pq$?)Y2r6yQ?Bs!P zm~lWW%_<=dW)9FE&b8Fu)RI)9)R|O|RDvPPA^sr-3zXV888c^H+&8aa?Jj1Q?>lsz zD|oaYmA{(m*nz6Nf1@j0fj3RD8-v*vZdUOAXqI*sZimH3Wf02Vu5Gsaeyf^&*4>Wi z6Si<5MNh0*Z*^R?YBlJ6)GX`=sXN8><7@QBD5;hA0=TiSpzY497tS;SXqjx!NlJny zx=B{qoIVN2Ws(h(5e1@m%dFu#F@IKG02@Bq$EYON39)VCUE^47aPs?{@p=9;c!72S zZ-LE$@*Jx!_&WGH_Bw37yVH{`JJhnRcD4Q}*pt+lv?@@i({>H>$ikDXiFOI|AO^cF zVikHAdBNk$Cx&ej3hLHb-91vd;PYi{!4AL+APc}256WpDStU9gyokG~x=^{HLX>WM z`VxsHD7j7LDcwt)LbO*%ZowSwsiq3=89zU5LiG=QDv3`W6tCDvvySr$wJ6l4Lr{`C zFKbj~n-K$dAGZfhBXld&t3$Yhbd?p7co>0-1D3{32Q&&GvzpFQW9+YiTdo00TlUbnuk>ubcP3wxHQKFidE zkMjy&CLv#9tR(e?9rG|gND+ZP6IV_l&-}$t<`U-QByUCTqL6VZlH6#u9%fZ0*Cd$% zlmVgvh7Ax!FqW!Z5wu{f2rn}OdXLIpO=&un-`fEl9#ed*vZt1)ae@j zBpe%K7sCMKI+|D(K#NWbDqt$WC;$|26fhUC7qAv!Ls6k@PymzaHZ^i6ddP67V`zM6U`S}Fc*uRIa;RkpGITT~ zmlg-kF{?CpFm-^o42ZP(rg#Nk3LeuOvmEnpQV=4cCxj)0MaiVfq|1cKq;yAjXLo0H z=gB5E0HENgO1K``;brH-X;u{N;*v3{}MyVzUqYldqvl-Q7t zd8D-8ddPb4d(Za~l5Ve1?ugo#wXcswEQKwpw(N%^GDif61ZM;ST=tzI^=}O`4QUK( z40jB}4fzZ!4gC!h4Eqhg8p;?Z8*bGLeXev~-1P1`&D>?5VXRbfXp7mh__n!9xTd`_ z{f%$q)7IHK*0NzzWcsGzF=Ru z%b%Bumphkp7v7iQmy}0eem1XNcbrF2^{cL0ZV`P;S)Ezo->h3ZU&r2zTT|Tvt$?;1 zR}NMVwpu&yQ}5~TQSKaX0-n4_#6T25G(&ua*nmigIE>hWn1CpU6pNOL^A^vFU>k=R z*OTeca3!m)q0Lbu377>e$$gvKl=~^SKDQt@&NS4t#U}bOCQOP8r51yjq;fDyz&V0s3fmDkRXJR zM-Xd>8w3jRfq+)kR;5?TS0_3?cD(Bl?da@?dzkT4!uUfyWSnS}X?$s3)0EN3(zw%P z(@@i((R`&Tk+zZ6m-dzpm!XVodg=xGsba2!4;o-&0kW`{aoEu80KIj240-fUQmv}D z^~RL)XQU5aJ&$Z5G{RfVZ|4ETgfJ*-Rq|l_Yr0S%Qq&m$)&9X_fQa~v~sY5BG6s=T8yHp+TWfppS+4H;|;&l!P~9N#GRc1hk~c#QxWU z?5^6yMw<;oqG^^~N?gn4XE24DA2s=(w*jJ1someiMw z#H7U(?oxfzZk}9nTtY4tF1;>Qf3^~E0)_y=fS-T{z%(Em00A%qE&<7aFMwu%2YxZZ z5|gjitQH+h4eL05GC>SJ1->5v6rUSkm3f-^6Dt;zm|CWW;fn=LNft2E5pzG2HLD(T zJ!=>X#ft%*GBq3R9j%6Po`Mr?Ybk4!w2Gl3w!Df;yRsh@1Cy5 zwKlmn`8(h#t|{3m$|>Qpi84p6*3w73uSnz4lhTXRi_#nQ-|D}bL;Gm+G0NuF=6uRx zN_+CtUP`JBQfs*SL54C69d$vcnC$guw zr@iODcRdICF!Lc`cLhER?}AUlhv0^8l5Xm70NloH&dp&*dAn))GQYM&X?kNid3tL4 z41x+cfhvKWdKfKK3>_ zaX;<961f_`61w_ywR8o(`g&D#6?4^m1-+`hV!dj`a6nxR!JxdMbuJS~QwAnmc-gIEQ~9Ov&r-&}4HnkL1{4 zO=F#JZwCElk8ArHs_rmu=CWwMV7{~-)!)LL#oSujRGL>>U>*M*X996TXkxIqvv{Id zsW`*Jid&l-%^(QE&L!8QGV0{p%$4t98dJCf~8WYpO7-aH$x0$Dxm@tD?)U?^h?Q&uuVKG4<|U z1-3!2?v}oeZk^uphwKmjwzr+g_MJAg4h7aw=o%Ey(RHDCuDH{p&tlHvtHnH_<)AgI zeYx$fDVbG_Tky#DUJn>&dS-!{>tV3yv5)}exE|o zTG21YolA2|y<$sZBP~65HMjQHC1!dP@D#wcDWv?xRooDuK{vj~O=|A+`IbgX6U zJgjO;T=7D;zO(rIyh5ob5yO-_LOkA9`=cdNKCzl|)N%xJu5zGW;$FjEmR|Q>>Rysw z>{#LFbd1J~PZ*(${3WO*Y$aSJQP26aiZkD2e#$x;UKp+!4j&#Kh76Mo(+{%`^Jg+< zEekp}tQdYaG&jUB%rufT8aGlj)HihatlZG#`qibo-lcw^fwZCFb9lYQ=Qj%A$7QV`r-t9dUO(b+J9Vn{Qazc>v+Jj zlW1dbpLNf6W%nD|YQ~*te>UmQnr z+H*FmHXIMA)^j#ZzAf(r32*@|yC;lQNOM zC9$F+#`olTPj_gxV!ATX_Ob02*aNHv?&V5JP)m?VY)GU8JD`YSUy);=)FLS%@1h8! z$RTYZ={zw(qr~(lXu=7`TE$2}2csRKMq>73mtud%B*FH=t07Rt*ud68`v^Ei7sd4! z`XG2>u?^q0*)}GIrs}3XPlf90>T2u4W;fn9LO`8-9b>DRt3OwZSB+PXR-de5uHLRD zt`4pW!q;}zd zMXkuD$T+~vBErI0CRE0{V>~H2IbtQr6YxRtgB$!A+y<@yH#l)Ru{-g`;0mXe<>;a7 zVP&A+G*^ZW#iTNV%fZ*xL)Ey|SJjo(W7S^Or`3Mdc-4c|!pEB_|EcF+y#a-@&PMZ%lJ`++No%Z0~+S4p>^q$@wG zOveCVC}W(X52v$W&}MLFSW+^SUr?l>N29l<6Qm1cfYEC)y3vcudncZ-roCo+U6K4F zl{_Ic!!Ttb@rKEZ`Emp&oYqY4B=$39q}c0#!d3bEgi_jaW?&e-j8L~gH!D4L5mS*M zjB~?lz?+37nN{o851lwK`$x8qv~E0ZVyBj*U=>ak+Aq}XZ0wBh_}($v)9m@p?Y}54 zcrAw+=S?0X?)}qhVywMjB(AcgV4{8^dE(u~0RkL=sD|!>8a?2=#A92 zSU_vMYPD+ZYo%&xAO4&8)G*d8TOMq@U+-H_UH9A~?`Q6}9;web7P4ebdCi($p0S%w zmS_+^pX$SU?zKMyorU`LPR4$vLKlAH^W#UgZEtbxJBo-nS8^Qz2QQIB&z_^QwZ&d4 zN)xe`jNaFbiI(YGi7Bsj-gpQe7%ycsW%-8tM*0@`R^1fda9^hUJlF)SJLcpHQ@cyv zhxSH>;*~r%6al-K-E~$idMS?b-mgLwW`r~oGpi%5ZhJx zRo~T3g#m>*h33S4M!e_QVqQnlDq3r)8q$iRDV=Rhn@+tyPbBGkFJi-Fa7ey?A4IrFcQMe74_h>1|?c{cPi= zwWrRepHKOhbxt15SWnkac28whHfr)~pI17Zv(4*;)$&%&y;R3q=Zj}0HI+=@?kWxmPG$#h1E5ycr-)XxSSeeZU(U+1gl zTjOi}6#c2;6PbOjy*S61(d#C2cjiO&vm)0N2P1DU>&C2VqvFLH(ZX9#AL#YoS=gC4 zMFqtx3U`VGiZBWy;U1wk93g{-DN`K6MzM{CzLx_Vaw#*FxmBo!WA)#>=rN;kJyW$hD3vXr8>TvEX=k8|cuN|i6LZ|twF~IyKTmAV8crF`-<+SF zJU@3hNw}uFQocsKcDd#_S==}HG2h(dta6$@@$tfzex{*PwB@WQXH<2UfB*boWZ!X{ zYA0aveoJWHEqOO~X04LQvj;c19J{@jqDM>i$i8B*cW=#vyadlI2U;QtmE0& zwy7Y-$7KBd{Pd3L%NX@(9<`U7+h2+d4%wFs=ITToik8ai-EOBB-o2A3mbjE?Y~OCz z2uVR9+fjj)pHk$NOmFBKz2E9Dr`&N=da`u(-jfYa&W>1zGz-n?=hfy4ZAY`x#6 z`_^7>-He<(EiUaQEj8^W?K|3YS+NC)MVzLBfZna=n^~?N;RkF>UmFMAbk-{87n2&N zJgSzn8XH9ixXK<@Lp=T;;3qrVC)?-XD{vY(qq?tJ1Devy(9^O&yOj9Z(jRoP6zOAs zPPt&v7}pH)UcT^OxrLr*9K673E3=DA zh~W5|9d_L->?am8LRH0-p~W)AF(o0z$Hk!M;;DB6AvriX)v0J1f`SuZZo%5Sq@S9b zrC%ntOS)AXbB0B3FQbprzH#?jMvky#pNjh&*=-44yPj_Ck4$9e_=%nPIuj24{Ic`q zYIt*q?@R2L$|1;>m8*qoo@(IIKIr&RDZmAYj9#P zR%hYZxs>hev$2<0C%y!3*6`o@x!>0~=Mi$S#MZ79tR$->p~U!RE`=e5?@j5$LO*s; zVQ^}YKZR&X>fp%|Sqrvr&i&o^b>ul2)mtijF#&)1JGJB0>4L+J-0R#$%o<6KKC;8_cotIf!F?|g# zu>6Q_9Da(P_>7%b9*nfyU$q}eT}O<>El?2!h+I`4C9al?tIp(Di<9{;-PSiK%qA^; zm{*>ym}~lYX@Oo(Uju6)sMl(msyj8a^~i9XHx;&ObZT_IJ00JkG`0{@Xq~ZNDAc^%&mWg)hL)h5LjDS!(5nL)#W6C3NliB zU_m}Ub}kMu^REai_#YkATpTT|?O?#)5g-2+0>b|vl2u@Eh&cof`DbvCu8Q(M0z-HN zc?A9y10o{s=>&n=!r(wNn5DIYD9u4*D-F=vT$DzK_hDV=Bn7jwmiKmnX?Q=^gnHXT z1``@FrkP0U)3NQF#u4+#oR(jU0U`}nun4o zjTId3Bm@F^czAGlaC11iSc1TUf`TAUE)W+N`vV2LtCs^D;>qsdO8bk&A3XjM2=p*k zz+YvEnWGzAl!oR{#!luIkY9-Z!W&|3ZU3A5uQrf_E6CdXH>3xKe^3iKL0sTs+#;aA z7XR7~b#$1vJ6DLR_3eJOZqoU?EPXnt`83;ZWrC4=7?W&h3)Ebw1BYP!H+>W+@TIQ>p{`&9;V{gJWV!!hwf4dt0q~nqLgNBdmPx=Qso`-rqzCX`jZQOsu z@cJ$CVK{y{{FtPLemMuhV6Xs~my7eaUyzZLlacu^a6MgM77ve^iyOfC&*T3n-0XEC zhG96q_bIa6e2wFeL_$by5=%@BJi!Sa;&i}Jj)kWusVb1_f%O;LiJiZ_L~%?M$i29Y z35AySy7J+E?Rqop7|`AhJTV%E5)F$Z+IV92WyWY3o3n5<`?g|&i6>~|2}W-ir@gOd zpk$fx^^9XM`7)BN3nasY!LVfG!5HO_-Imkk?YwjiK3%@shzQ43`>@TNE_DanH{FNQ zy8JFo09e?QPMnIU?*de+Tx-@G=KJA)litZ~%;eYe^Y!!V-5qCvsI%5Og*s;TrKplm bRZ?y_q%y1zCa2}L$`IQ9z;HO!M;(3ucXOG6 literal 0 Hc-jL100001 diff --git a/src/paperless_remote/tests/test_checks.py b/src/paperless_remote/tests/test_checks.py new file mode 100644 index 0000000000..b153df2241 --- /dev/null +++ b/src/paperless_remote/tests/test_checks.py @@ -0,0 +1,29 @@ +from django.test import TestCase +from django.test import override_settings + +from paperless_remote import check_remote_parser_configured + + +class TestChecks(TestCase): + @override_settings(REMOTE_OCR_ENGINE=None) + def test_no_engine(self): + msgs = check_remote_parser_configured(None) + self.assertEqual(len(msgs), 0) + + @override_settings(REMOTE_OCR_ENGINE="azureai") + @override_settings(REMOTE_OCR_API_KEY="somekey") + @override_settings(REMOTE_OCR_ENDPOINT=None) + def test_azure_no_endpoint(self): + msgs = check_remote_parser_configured(None) + self.assertEqual(len(msgs), 1) + self.assertTrue( + msgs[0].msg.startswith( + "Azure AI Vision remote parser requires endpoint to be configured.", + ), + ) + + @override_settings(REMOTE_OCR_ENGINE="something") + @override_settings(REMOTE_OCR_API_KEY="somekey") + def test_valid_configuration(self): + msgs = check_remote_parser_configured(None) + self.assertEqual(len(msgs), 0) diff --git a/src/paperless_remote/tests/test_parser.py b/src/paperless_remote/tests/test_parser.py new file mode 100644 index 0000000000..160796fe0f --- /dev/null +++ b/src/paperless_remote/tests/test_parser.py @@ -0,0 +1,91 @@ +import sys +import uuid +from pathlib import Path +from unittest import mock + +import pytest +from django.test import TestCase +from django.test import override_settings + +from documents.tests.utils import DirectoriesMixin +from documents.tests.utils import FileSystemAssertsMixin +from paperless_remote.parsers import RemoteDocumentParser + + +class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): + SAMPLE_FILES = Path(__file__).resolve().parent / "samples" + + def assertContainsStrings(self, content, strings): + # Asserts that all strings appear in content, in the given order. + indices = [] + for s in strings: + if s in content: + indices.append(content.index(s)) + else: + self.fail(f"'{s}' is not in '{content}'") + self.assertListEqual(indices, sorted(indices)) + + @pytest.mark.skipif( + sys.version_info > (3, 10), + reason="Fails on 3.11 only on CI, for some reason", + ) # TODO: investigate + @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient") + def test_get_text_with_azure(self, mock_azure_client): + result = mock.Mock() + result.content = "This is a test document." + result.pages = [ + mock.Mock( + width=100, + height=100, + words=[ + mock.Mock( + content="This", + polygon=[ + mock.Mock(x=0, y=0), + ], + ), + mock.Mock( + content="is", + polygon=[ + mock.Mock(x=10, y=10), + ], + ), + mock.Mock( + content="a", + polygon=[ + mock.Mock(x=20, y=20), + ], + ), + mock.Mock( + content="test", + polygon=[ + mock.Mock(x=30, y=30), + ], + ), + mock.Mock( + content="document.", + polygon=[ + mock.Mock(x=40, y=40), + ], + ), + ], + ), + ] + + mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = result + + with override_settings( + REMOTE_OCR_ENGINE="azureaivision", + REMOTE_OCR_API_KEY="somekey", + REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/", + ): + parser = RemoteDocumentParser(uuid.uuid4()) + parser.parse( + self.SAMPLE_FILES / "simple-digital.pdf", + "application/pdf", + ) + + self.assertContainsStrings( + parser.text.strip(), + ["This is a test document."], + ) -- 2.47.2