From 0c676b90f24ded60ddee4a95d43658496e99076f Mon Sep 17 00:00:00 2001
From: jonaswinkler
Date: Mon, 8 Feb 2021 20:59:14 +0100
Subject: [PATCH] migration for #511

---
 .../migrations/1012_fix_archive_files.py      | 247 ++++++++++++++++++
 src/documents/tests/samples/simple.jpg        | Bin 0 -> 17740 bytes
 src/documents/tests/samples/simple.txt        |   1 +
 .../tests/test_migration_archive_files.py     | 175 +++++++++++++++++
 4 files changed, 423 insertions(+)
 create mode 100644 src/documents/migrations/1012_fix_archive_files.py
 create mode 100644 src/documents/tests/samples/simple.jpg
 create mode 100644 src/documents/tests/samples/simple.txt
 create mode 100644 src/documents/tests/test_migration_archive_files.py

diff --git a/src/documents/migrations/1012_fix_archive_files.py b/src/documents/migrations/1012_fix_archive_files.py
new file mode 100644
index 0000000000..e95715265d
--- /dev/null
+++ b/src/documents/migrations/1012_fix_archive_files.py
@@ -0,0 +1,247 @@
+# Generated by Django 3.1.6 on 2021-02-07 22:26
+import hashlib
+import logging
+import os
+import shutil
+
+from django.conf import settings
+from django.db import migrations
+
+
+logger = logging.getLogger("paperless.migrations")
+
+
+def archive_name_from_filename_old(filename):
+    """Return the archive file name used before this migration.
+
+    The extension of *filename* is replaced by ``.pdf``, so ``a.jpg`` and
+    ``a.pdf`` both map to ``a.pdf``.
+    """
+    return os.path.splitext(filename)[0] + ".pdf"
+
+
+def archive_path_old(doc):
+    """Return the archive path of *doc* under the old naming scheme."""
+    if doc.filename:
+        fname = archive_name_from_filename_old(doc.filename)
+    else:
+        fname = "{:07}.pdf".format(doc.pk)
+
+    return os.path.join(
+        settings.ARCHIVE_DIR,
+        fname
+    )
+
+
+def archive_name_from_filename_new(filename):
+    """Return the archive file name used after this migration.
+
+    ``.pdf`` is appended to *filename* unless its extension already is
+    ``.pdf``, so the original extension stays part of the name.
+    """
+    if os.path.splitext(filename)[1] == ".pdf":
+        return filename
+    return filename + ".pdf"
+
+
+def archive_path_new(doc):
+    """Return the archive path of *doc* under the new naming scheme."""
+    if doc.filename:
+        fname = archive_name_from_filename_new(doc.filename)
+    else:
+        fname = "{:07}.pdf".format(doc.pk)
+
+    return os.path.join(
+        settings.ARCHIVE_DIR,
+        fname
+    )
+
+
+STORAGE_TYPE_GPG = "gpg"
+
+
+def source_path(doc):
+    """Return the path of the original file of *doc*."""
+    if doc.filename:
+        fname = str(doc.filename)
+    else:
+        fname = "{:07}{}".format(doc.pk, doc.file_type)
+        if doc.storage_type == STORAGE_TYPE_GPG:
+            fname += ".gpg"  # pragma: no cover
+
+    return os.path.join(
+        settings.ORIGINALS_DIR,
+        fname
+    )
+
+
+def _file_md5(path):
+    """Return the hexadecimal MD5 digest of the file at *path*."""
+    with open(path, "rb") as f:
+        return hashlib.md5(f.read()).hexdigest()
+
+
+def _find_clashing_document_ids(Document):
+    """Return the ids of documents that share an old archive path.
+
+    Raises ValueError if the archive file of a document is missing.
+    """
+    affected_document_ids = set()
+    old_archive_path_to_id = {}
+
+    for doc in Document.objects.filter(archive_checksum__isnull=False):
+        old_path = archive_path_old(doc)
+
+        if not os.path.isfile(old_path):
+            raise ValueError(
+                f"Archived document of {doc.filename} does not exist at: "
+                f"{old_path}")
+
+        if old_path in old_archive_path_to_id:
+            affected_document_ids.add(doc.id)
+            affected_document_ids.add(old_archive_path_to_id[old_path])
+        else:
+            old_archive_path_to_id[old_path] = doc.id
+
+    return affected_document_ids
+
+
+def _regenerate_archive_version(doc):
+    """Re-create the archive version of *doc* at its new archive path.
+
+    If parsing fails, the archive version is dropped unless the file at
+    the new path matches the stored checksum: that file may be missing or
+    may have been written for the other document of the clash, and a
+    checksum without a matching file leaves the document inconsistent.
+    """
+    # imported here so the parsers are only loaded when they are needed
+    from documents.parsers import get_parser_class_for_mime_type, \
+        DocumentParser, \
+        ParseError
+
+    logger.info(
+        f"Regenerating archive document for {doc.filename}"
+    )
+    archive_path = archive_path_new(doc)
+    parser_class = get_parser_class_for_mime_type(doc.mime_type)
+    parser: DocumentParser = parser_class(None, None)
+    try:
+        parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename))
+        doc.content = parser.get_text()
+        if parser.archive_path and os.path.isfile(parser.archive_path):
+            doc.archive_checksum = _file_md5(parser.archive_path)
+            shutil.copy2(parser.archive_path, archive_path)
+        else:
+            doc.archive_checksum = None
+            if os.path.isfile(archive_path):
+                os.unlink(archive_path)
+        doc.save()
+    except ParseError:
+        logger.exception(
+            f"Unable to regenerate archive document for {doc.filename}"
+        )
+        if os.path.isfile(archive_path):
+            if _file_md5(archive_path) == doc.archive_checksum:
+                # the file on disk still is this document's archive version
+                return
+            os.unlink(archive_path)
+        doc.archive_checksum = None
+        doc.save()
+    finally:
+        parser.cleanup()
+
+
+def move_old_to_new_locations(apps, schema_editor):
+    """Forward migration: move archive files to their new names.
+
+    Documents that shared an archive file under the old naming scheme get
+    their archive version regenerated from the original file.
+    """
+    Document = apps.get_model("documents", "Document")
+
+    # check for documents that have incorrect archive versions
+    affected_document_ids = _find_clashing_document_ids(Document)
+
+    # check that we can regenerate these archive versions
+    for doc_id in affected_document_ids:
+        from documents.parsers import get_parser_class_for_mime_type
+
+        doc = Document.objects.get(id=doc_id)
+        parser_class = get_parser_class_for_mime_type(doc.mime_type)
+        if not parser_class:
+            raise Exception(
+                f"document {doc.filename} has an invalid archived document, "
+                f"but no parsers are available. Cannot migrate.")
+
+    # move files
+    for doc in Document.objects.filter(archive_checksum__isnull=False):
+        old_path = archive_path_old(doc)
+        new_path = archive_path_new(doc)
+
+        if old_path == new_path or os.path.isfile(new_path):
+            continue
+        if not os.path.isfile(old_path):
+            # The shared file was already moved for another document of the
+            # clash; this document is regenerated below.
+            continue
+        logger.debug(
+            f"Moving {old_path} to {new_path}"
+        )
+        shutil.move(old_path, new_path)
+
+    # regenerate archive documents
+    for doc_id in affected_document_ids:
+        _regenerate_archive_version(Document.objects.get(id=doc_id))
+
+
+def move_new_to_old_locations(apps, schema_editor):
+    """Reverse migration: move archive files back to their old names.
+
+    All checks run before the first file is moved, so a failing check
+    leaves the archive directory untouched.
+    """
+    Document = apps.get_model("documents", "Document")
+
+    old_archive_paths = set()
+    pending_moves = []
+
+    for doc in Document.objects.filter(archive_checksum__isnull=False):
+        new_archive_path = archive_path_new(doc)
+        old_archive_path = archive_path_old(doc)
+        if old_archive_path in old_archive_paths:
+            raise ValueError(
+                f"Cannot migrate: Archive file name {old_archive_path} of "
+                f"document {doc.filename} would clash with another archive "
+                f"filename.")
+        old_archive_paths.add(old_archive_path)
+        if not os.path.isfile(new_archive_path):
+            raise ValueError(
+                f"Cannot migrate: Archived document of {doc.filename} does "
+                f"not exist at: {new_archive_path}")
+        if new_archive_path == old_archive_path:
+            # same name in both schemes, nothing to move
+            continue
+        if os.path.isfile(old_archive_path):
+            raise ValueError(
+                f"Cannot migrate: Cannot move {new_archive_path} to "
+                f"{old_archive_path}: file already exists."
+            )
+        pending_moves.append((new_archive_path, old_archive_path))
+
+    for new_archive_path, old_archive_path in pending_moves:
+        logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
+        shutil.move(new_archive_path, old_archive_path)
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1011_auto_20210101_2340'),
+    ]
+
+    operations = [
+        migrations.RunPython(
+            move_old_to_new_locations,
+            move_new_to_old_locations
+        )
+    ]
diff --git a/src/documents/tests/samples/simple.jpg b/src/documents/tests/samples/simple.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a8c58af0df6e4b4e6b72179b92aa9df4e2bbd8d8
GIT binary patch
literal 17740
zc-qvubx>SSyDqv$ad&qa++BjZy9XKE9fBkQ5+t}oaMuKa6ClAgA-I#^P9R7i0S>?K
z+jaJ?eb24B=hpe>Jgd6*pjX^l-xX;Ibz3;-R#1n>b7Yg->rHg~5$
z;Qtj@a{%a{a+>oWU%>w!Fzk>%wg3P^edOVAJ9|Hsc2#^6~Hra?mO~hE1#I>*0>1
zWeq^u_&E6@#b_OUeLcmvx!s-Ik#X_4RhL@$+@^aOc!>wDv~IxH`EYY5BQ$Xbs#wJ-mG%JFEZO3oYwEf%-f~
zYWp8vZ2#?q%fs7&`#*;A;kF6l^s(lCbinP3bg*`{^{_)Cxc+$-|LFF=>(R%apYLBL
z5T+~uz}6lfelY<6N-6-{B|SX+DSCLgD}L0&3IKd^|Ihv&g#aLO`Djo2Pa9(a0N_Ld
zKuiCB+H5ibp!GEXkS^N#dHes{hWoG405AXrU;rFI0ni@T5ys<6!Uk|2EAapVfDj-8
zhyhZ73?K(607`%wpaEzBI)DLS1egG3fE7RhYykVCmbd_Jfag(F{D1%;2nYirfG8ja
zhy#*<6d(;edDNdAAP*=!Dp46w0aO8XKm*VOv;dt)b?O27fB|3x7y~ANDPRs*0G5Ci
zVEw2}Tfh#mf7Gbsqc&Y0wd)4B10I0aqh5U;RqGD~0D(Xd@C*n6LV+;gIq(8_2}A%<
zz$@T25DmltaljiO9!LO^fMg&ANCnb?3?LK80&;*{;4P2`6aa-l5l{@20%brsPyti|
z)j$nU3)BJcfcHQH&0{
z0(=D~fp5SxFayj2^S}bI2rL0Bz$&l?dyQ90A9`Dex0G1AYM)
zz!h)}{044;KfoPu4?H|>gJ2L81OuUfP(kP*3=kFw8-xSG1L1=RK|~-D5GjZpL;<1#
z(ST?{^dJTh6Nnka3POO`L7X5i5D$nK#19ex34ugFq9AdQ1V{?>1SA8J11W$MLCPRi
zkQzt>qy^Fj>4Nk@1|TDl3CI*=4zdJ21zCe^K}e7T$PwfWas|1AJV0I`ACMo&9~1}*
z28DpaK;fVlpa@VD=rt$?6bpI-N&qE+Qb1{-3{WO08!2WS(t
z4cZ0mgAPH*pi|Hp=ojb`^c!>w`U|=T17I*13Pu5=fib{XU>q<$m=H`1CIyp&slYT~
zIxqv63Cs#+19O16z`S67upn3jEC!YUOMzv;a$p6pGFTO?4%Pzefc3zJU}LZ;*aBEKLo4)`s&09*tv z1($=Xz%}4{@O$tFa5K0K+zIXie**V{2f)MNQSb!#D|iY#3!Vorfmgxn;0^E=co%#C zJ_4VD&%u}AYw#`j4*UQCLtqe82s#7{f(s#l5J5;G6cB0%ErbEW3_(CRAY2e$hyX+w zA_kF!NJC^H3J_(88blMK1JQ>VLQEj$5G#lc1PO73xIo+?UJzeM03;X^3V9BB35kM4 zL*gI_kYq?2Bny%Y$%hm{N+A`HYDgXAJ>&zV1=0@r2++mJoTA>;&d4!MNKs4LV1>I3zM20=ri&!Lgf*U&g<0yG7h z4$X$Xg%(0fpcT++Xg#z6+6--jeuVZw`=Eo+QRo-wH|Q*M0lEVH4&8+ALJy!P&~xY| z^agqd17J`XDhvaL10#SD!^mOOFnSmh3<2YW@xlaPA}|Tq6PO%K38n_qg6YBxVWu!k zm<`Mx<^*$tdBOZ(L9kHR3s@v98ukX31WSWu!QR3OVWqH2SS{=w>;tS7)(Pu@^}&W< zW3aEVY1lk$8MY4Fgzdr(VW+SQ*l*ZhH~@#jQQ??yTsR?|6ix}JgEPSqa85WcTo5h_ zmx9Z}72&FIO}HN12yO<*pC}=2HDEKJEC=@6(D2ynq zD4Zy~C_*S=DAFi$D9R`rC^{&HD5fY@D7Gk$C~hcTDE=tVP{L6nQDRWyQBqJcQF2iV zQOZ!NQ0h@WptPcVMEQ&|h%$=u6=eox5oHx+17#QG2;~gr66F@<0TqghhKhxXk4l0{ ziAsmcjLMG6jVgdDiYkRFhpLRKfvSsYgldjzjcSkTg6fIthZ>9;jv9#?gPMSvikgL* zhgytUfm(~&fZBrEiTVk30Cg1gE9wmDBI-Ko7V19g3F7$J5HwUYEHr#H5;Q6_ zdNdX^4m4ggVKfOe88k&Sbu=9`Lo_oqYczW_7c?(4f3#<4FVJ40#i1pmWuWDv6`_@* z)u1(?wV-`O`;0b-HjefUZ60j}Z3Ar=?HKJG?Kj#TIv5=l9SfZRoeZ5Coe>>@&W$dB zE`~0Ru7IwJu8nSpZia4+ZjbJY?u{OR9*Q1;9*v%Wo`#-{UVvVPUXA`9y#@Uv`e*bZ z^a=E7^ab=a^eyxQ^i%Xp^gkFN3=|AZ415ey3~CHU3^ojI3_%QW3>ge13=IrD3=<41 z3_A>G3{MPyj1Y{M7||FB7-<+e7=;++7_}IU7;PBc82uQd7?T)t7%LbX82cEf7?&7- zFhQ6om{^zum}Ho=n9P_Qn0%Nbm{OPum};0hm`0cun0A=Xn4Xyan4y>vm@$}%m>HOF zF^e%PG2dY}V|HTpVh&?|!JNfh#{7Y~hk1f|iTMW$goTQQjYWh-fklVKip7N`h$W6C zi=~XEg=K(cj%9=8gyn(dj}?Lyffa+5gq4AnhgFJIjn#nFiq(xZfHj6Sg|&#ajkBz|Q#umbs#Foca#n!<##4o4M7 z2gex43dbJD4aXPf8O}?b7@Q=WESv(Ia-2GxCY(;3UYt>!Z#WA$>o_|&$2gZbe{mtW z=(zZ}WVp1rEV$gbLb#H+^0?}_dbp;zHn>i>p16Uy;kd7G6L8aU^KeUXYj8i{cHs8n zj^Iw>F5s@??&6-{Ug6&1!SFEg2=OTJ81UHe`0zyWWbl;nwDFAatneK0-0}SJLh+*T z;_=e(-r|+w)!===>%{BD8^xQ#Tg3Z;w~zM|?>9bxkBX0jPl8W_&w|f|FN80JuZXXS zZ-8%skHmMw_rnjxkHmk2pN9VyzZAa~zX|^%en0*={tW&y{wDq*{xAGL1P}rY0s;aG z0tNzh0)7H<0(k;;0(}B=0y_d%0$+jYM^Pxzg1kMNA}h6qH2PJ~ZHLBv4BK_oyVNu)reNn}K1MdV22Nfbo%f+&_Kl_-~} 
zl&F@dnW&p+kmxJX0?~J(eWG)sTVevFGyoaQ%T>FmXp3CZ6oa^9V49~T_fEk{YiRD1|h>FBPOFELy+;2iIXXi zX_6U}S(CYt`I3c^y(UX0%ONWzt0QYA`$RTMHchrlwnO%l?3NrtjzvyPPD{>4&QC5$ zu1Ky;Zc1)T?nWL!{(?M~JdHe`yn?)eypw!@{0sR4`3CtB`6c-S1sVkb1r-Gg1rLQ7 zg*=5Og)xN7n#E^yKtR^gQ$u^h)%)^p^Ba^uF}r^l|hV^hNY_^lkKg z^k3IaHG7vD(Ft9NQGRQEfGZ-`2GI%fqGrVF*X2@fxWN2dOVHjhWW7uFg zVYp$0G2${(GO{xAGfFe6F&Z-3FuF4aF-9>aGrnc4WNc#WVH{(eXWV2wVZ3F6GvPB) zGa;A+nPiwWm`s?EOrA_3OwmkfOodFfOl?g4Op{D2OuJ0Km>!rhm`Ru!n7Nt7nU$FJ zn5~#ym;;z2nUk2`GFLJ;F@ItnXI^05V*bf|$AZQ}#6riy$s)?4$fC<)$>Pl7&l16s z$db!a!P3O?iDjH+k!73ZjOC6Mot2oCft8z8oK=}spYsW4mWZXD4B2WanjhQy;b`L+ z;F#iA=Q!rL%kz>ai6@_@mZzO(h-a2( zljn@*fftLHoR^hXh*zFhm-i{JJ8uYYEN>QXId2nhFYhGp8t)PBEgvc$F&`rzAKw!` zO+IrzXTBi5*L>-GC43EhJ$w^nMFi|j1uvV~Ba71uHa98kB2r5J%L?^@}BrT*VWG>_)6f6`Ylqpm$ z)GX95G%fT)=%>(wFt#w2Fo&?Xu!^v;u!FF_aFlSGaEWlE@Mqyk;dS8?;X4sb5egAD z5m6Ck5hD?M5kHYgkyMdlkp_{^B9kKPA}1nuqL`u-qU@q#qAH@sq7I?~qOU~LMN37S zMEgXiM1P2$i2-6bVl-l0Vv=GSV&-BlV$Z~4#d5@|#M;D$#pcC!#jeER;zZ(1;sWCG z;(Fq?;@;vf#goO0#NUg55}y=b7e5uhm%x^wmf)0-l+chcmvEH`k%*JXm8g;EkQkL% zk~omKkwlXulSD|0N-9elOFBvhN=8d&NmfXODRbiNjXRb zNWGTIl&X+wl^T+om)e!OmPU~#k!Fz=mR6EBl6H^|l#Z6plCG3)lOC2{l-`%Vd4l$Y z>;8x(sL zrxmvpuar=fNR<#u;!5gD7E10)&y|vuij|s_29)NM_LOdv(UmEcIhCcAwUw=veUu}W zGn6Zo+m%O^SCo&HA5?Hv=vDYt6jTgV994o<;#Bff>Qz3eOsQr8%;5nW0&w*{M09xvqJx1=S+fLTHI=X=**y^45yd%G9dX`l$6q z>xb4aZMZh6HoLZ@wzjs7wx9NE?Huho?H=tZ?JezV9W)(E9c~>N9eo`KogkezIt4ln zI{iBHI{P|*b#Zj*bp>>lbWL^Lbf4>{=$7fW>5l2H>7MC9^@#NldJ=kCdNz7~deM5h zdi8pr^=9;T^=|dC^lA0^^%eC^^j-DC^;7iA^xO2u^w;#y4PXYO2J8k>208{vgFu5g zgM5Pqg8_pDgF}ORLwrLfLlHwYLn}jX!zjaS!&<{0!)e1E!&@UPBU&Q?BPAmWy))+U}|FOYWmzX&9u_=qv@pSmg#RZ3^N)telsOAGcymf z2(wJH8nYg=X|r9kKjt{*4Ccb-YUWSPea)lI-ApCKe7N>5LqBBBrS9-kQTuf z2^Pf`Ef!-I>lVK(Q7tJgc`W5EO)T9kUs`5ZR$KO1PFwC;{mUEY{-I+SW+xVCw|y z66-eW3F{x$S2h?nv^Ih^DmIojJ~q)dc{YtULpIAcKW$;Q2}q2J$5s8`*!z80we+{h15ejB14g>$Vy}vatgVJ zytBu*XR(*G*R^-B54BIRudwg3pR(VxzjMHMV0Dmm&~tEf2y;kvsB-9bm~q&5cyJ_i zWOI~uG;nlweD0XxSmXHFanA9`3G77T#OWmGWbEYb6zP=h^v-F(Y1!$gGu)Zdna^3- 
z*}~b!ImWrbx!HNt`MdL#3#JRbi-?P+i=9ibOOi{u%SV?fmpzwzS3*}d*C(!quCA^x zU9(*4UHe^^Tz|U3-6-Ao-BjGH-2B|)+=|@V+`hPNx!thkk^x*eU@p$Ut@A1Z?#G~C~(qqTt&Xd5C&GU(;k*Awyq-Tz2gXfUvn&*WV zh8Mk;h?ka^y;q1=s#mqwC$D+06K|+Dg*TtKvbUADzjwTMsdtC>H}5^~2OnY|4j(xm zQy(v%XrBU~7M}^9EuULoJYQB{XogKjgpWe;I%oz!)GNpc~*E@FE~9;C;YQz*@jn zAXXq#phTd4piAJ(z?{H_z>&c3fxm-rf>?s2f((P)gQ9}+f|`QHgEoWy1QP_a1U1K#8D@5RK4@aE^$G$c<=WZ3+I(>!uitd%zE4^2) zucBV%ziNFo`D*_)_%+3Af!7+Z9bP|wo&CD;_1NpJ*LTq*(LB*A(KgY~qSK@6qlcou zN8iQ}#BjtY##qG!#w5qo#Pr9k#azeY#YvoOd~+Ior8FE@iH8u5PYd zZgg&GZcpw~?&Vv&w;XSk-rBwmdz<~X`R(M}!#tEc`aH=zlRV$Nq`aEE!Mu&UyL_^I zfqd($%+Mwb&B1JV~Wd)KNqhS-;@xQ@Rn$lIF&?|6qR(9 zES6lB;+1lis+1y2UzFyTc9hPQo|j>lv6U&6*_MTu<(9RT&6NEt$1F#bE0kN8hn44) zx0FwppH^U0uvW-dJgo?=$gXIqn5sCf#HeJgl&`d|46V$mY^j{CJgvg4LR2YK*;Iv9 z!x{#Aop!&#$R<4_Y(Q&`hgvs80kOIXWOt5NG* z`?|KY_H*r8?Vmc*I>9>KI*+K8%7(V zO{opp7SUGJ*3-7y_NSe!U8LQx-M2lZ{eAnF_TvtW4n&7ihkZw6M{&oej`fbaPKr*k zPUBAh&h*X?o!>fte#HLB`BCko^T+6qHQ9Y$Sy*)oZ0iS3-NqsW^^z2j4r}j?^pRPX>eHQqv z_u2b%^5^%TzkWXH#p>ngRqJ)`jp?oI9qQfbgY_}?$@SUvz340K`_#AIci&IdFVSz- z|Exc!zrBB<|9XIAKxn{Vz;7UJ;KRVw!1*BFAkU!opvPe1VEy34;PDXV5XX?(kjqf) zQ1#Hr(Ec#$Fzc|=u*2}H;qu{u;q4LF2;+$Sh}}rUNXbaw$i^smlzvoZ)Oz&AXwm4W z(eGox812}TF{`ogv4XLlvGuWsahh@Iam(?r@%-`b@wM^$3F--{3CoGFiTsJ~iM5IQ zFEn4IzgT?<|5EU!=gaz+hp)6>pL~7#_4(Jrub;pEm;_DIP0CK%Oun2fp6r|4{08~P z_)Y#B@>|rmvTuXmcBW9KSf-Sw9H*kEs-{M!4yMtk*{9W~U8m!x>!!a zc+I5DG|o)VT+9;A3eFnN2Fzy8w#_ci-p-NFiO-qOh0f*A^~|l$1M_tAvh%j{k@IEq zgY&x!s0)Y%)diP@Hw$$OUlvXmaTj?Pbr*dW(-xZ-=N5l2ku8ZWnJtAZ}BbQ$-S1yk%AFN=kaIR>sc&sF^G_K67T&@zYimaNfK3jde+P%8I23n(ElV3xw zySJBB+!JGnbuJL|jPUB+F-U8mi+-MZb$-Sa)dJ>fl*y=Qy* zd!P0;_hI|2`>OkH`-%IF`!oC32V@5l2bKpf4oVLO5B3i+4!I6>4t)+Y4%-h`4)2fX zj^vN*k7AB$kG>wA9}^x6ADbSB9v2?>9dDnYp0J;2o_L+4owS}To!p($p30rtpT?Zl zo_;+&|4H;y74nVWu0}Nt(}9;8PAo^UCtBE8_s9XfB&NR zCH>3hSJbbnU*o@iUJzUeU6@{kUKCyQU+i6CTykIPUix2VUv^!7zk*(|T&Z1oT%}yK zTrFPRUDI95UprpExqg2=eSQ6#;_&rY9(9%fI*<3#|9W^aFl-?c>qJ8FgOY-8Xi6YIvpWBZvYVkpNOc8jX!)+GQF@)!{w1P_n~dY%LS 
zmp~-`nYIY=Y^}71`Ay4Y`PbO)l)|s=HjTg7HuDLi+Ei73B9D+apS-_QlH6+V7u?fx zSlx7x+)du6i8(v~(QzdA?okhbFNzpwHb~qgke|YKF^DTuuf~dpR^47wmW#bmrNe$e zSa&07Gr7t!bVj&OZM#oPcXMS3J}Goa&biX-wy8NKd8c2e{5}(nL{3o687Znm#IZ$y z*ids6BztQ#JluXbzn-8Bj|U)p^jlwFa#v*mI0}j93H)OMuj?_5k{yi%P^T$l=>2+( z>|HpOm8c$fN8Ob4ij-~UkW8i#oZx?n{hQ3F@n}`}!~o%VoJ_Lsv~{tC;*w4M=mDPX zyM+37%JeN+oW?-i6#0(K>XE*h@V3S~r| zHxTm?A0A%+g)+{B7uv^5dl(fSs~oh~;Vxt{ZEKEVB>eeeDjX(Sqk~jZ)9FT7>Q5vr zHic0W%hzMcmuCBn0CP|NKYngt+wT*J7h{#H-p;wYIX6=RF7W|yQzwxVgTae5fjb1@ zaCtjs%XF3OWNSu~zpna?-EiCY0TF;kJgg3@?LxHJLSW-rjjEVqORZ_V^Gzmx4c}`j zNAnB<%(;v?XPtBTcj3PXab!D*+-Nc`7^2RJp#E7dKZsq}_BPKSGHSD874Y1~PNMlTh7-gy2+T=RH9WqHV|qpfrU zlbe3BKfv19+pAY)-r~*K2hFzz@I|lfI+wFiKgD95zuk7p&pdf)N}eX(T+O|B0CrqF zu4^JKUi^BoU;b_6P>IR;ntIMpGQHs#&N(e-VF_n%CoL5>KUc0VLrrJ@v67@Aq~h~x z(W~^`_&Yl8$P~G~-)X7*DrDYPx|GkjH=PZU{qu^n^)oxY5q`@b9DgxS)10X!%#;)N zt~iCzDB-7Pmzh5!!%NJo`{h4a?n>jk(bapkJ3}l}GFxqwvOoKZ3nOBa(%aNjKO@bZ z39D*{QjVK$$MT%X&dy4XPkc4>$4B0G8lq-N{81%BNc?s%mS3B|VXU9v4z)LLjs_o{ z#MrcDXy;RWVLL25;qx7>x^V`Qy$=rCnvkiqZgSI-_q5$8;6;^Pv@6zpk3AN zNWSC>y!~la7;d<+7%SB&sgRK|9sGXWAkGS`!?{D_|FztlYsz*lJz#&Gd|KDa$x-)D zT|ZBxKlc$~#%e%K5BF{xAsppnc7J;1|5=ipA%WF;CIa!Nz>?a9OBt5vctu1Sxn3pcA$%I$=jbBG&N#Sm;#k%?);2xIK- zDMN`KFXD8s43^RKNPVt2D;gaqVaBT+E?K*;@vsa#G0islRu#=6=aXV4_I&l|aeq_* zQ?72cBF@w8Q(Z`W<#K%|J8U_yZdhw!#%0M*kjU)KfvPEG(u_fi{iscyc^$jzOtEYG zK`KLTccCP^u{hxzPkmcUeWj6?hNZzWeYUqYqE-GzU};2X%1*$WH04iZ6B*syitE>s z7*q8rRV`bbLV&tuylwd-$F0HAxGj=M)rA0mG<~En&yAh&*Uj(aPxp)8^u!1z)Be;E zOd#N!*yO|sY6-f<*D9Gi(&JD0;icCijqjy4vnbrzG~Q3x?~Y>`I2de&PM%TqXvAU5 zjZ3+u;EYzRy*YH!LeMjt>lA!XK9Kq4Qbh&cAaAYi9sYJX6mFN()#~|eIaRNA_FPa= z!%NKo-8J=4*K0cI<+)G_J(H&jzsfVRr1v+9#Fz_lu7BTU@mif{d{A#;7-^&L?{Z)C zb3htw(F?FPe36f;8eA~je-ojq6?ttM-L7*^fu^demd+a8qmp$_Up`J@yO7)Klzu#R z9a3j*lCw2@d7Er1AU5$;jEU`qW<1ky^>4=pY9~(e0(%*JwJllvb^VlhRiommFS;Y* zyuneMZU!#f>-^ zrp9d&vB))HZNE)Er?>zAw@@!8?)F!w%BjcO!nL1yo1AQny&4=NWb0bTX4*9nG58Z2 
zgV0~77x?v+adl6=^bcd^tg=ZqYdl?@FzQ@Z`Pz5%{UDPO{IC1c?M1{~`PIYSUcwUPq?EuV7I_L&ef>xrO2ZV+Lp$MA*SI2Z*kF zD(;*dC2oB2Y`jbfESV43b2$#Z9U-5|hd$I}zB*pj?`u+-vYGW}QncX6%Ts!~&M4#6 zheTuWv%pvR(c}Kn^9d_0kV{=YG2f`Um?Ktfmh-HJIWMKL&QO7Gy*5|HbRXBYC2AsT zN?$QGNXqe;A>{`P?fj(-UKFz+6SI679(m$R2I68d zd(s(l>5Hko6o1S@%g%UHBWkDd*#k3z)41wi-PMQQ61&=(;zTd)^+M>%SifyqIS@Mu zk|nbyz11q|60^}@4j!t0oC)#YYJ)QOWiIA;jck)lZFAoq`=OMko9&LnF|FiX%osJP zQoEENjPf!Uv=aU*tSQc1>h1JfD(`ePx5-oc+i9@fyoxE#6{?bd)tNF7i8n&ZS4Qp7 zd{V*eUMpJTQ~60A{(f*ZSz9v3;gxO4PL+!RH~GLHong0w*KQ-hZ1k%BMwMISRpvMo zho=H>)f3SvkALB#mAH0APrVD1jeH(YLG6;LPwub^VRO}~H6Jn<<~>%jmyE6d6Dylv z`J15#-|Wq19KY$ZKcp%ttxEN0F1xJyWNAvcqMHI=9PnXRj6Ddx2CkN_FF_0i`b zGkpC4mAP~@12VjvPzegRRv;x*x0uB;B-d&*fsojx@tN>gH4%qaF)*kCySEYC4Hs;AGC>B@dGIf-ZF{YK#SO#t@hQA$jz9imQ4p(}Xux}HV zQ*({1`nzeFp!RW7w!6Asv9cq!StKP4rL?X*?)55znv<-0=84j0Pm(=2<0aTy9DK7Z z%(@dhE$Gkn#>-Z}+8=2j@1s>Au9fEGI$^Ivv%)ORI$8&67oYMwPiBE&!X&T ztj6&6*9>D0+MJ7L$wJHa?{RV~d41|_Z|Ul~n>ay)qT-(#M!ZWICPsPiCSFQhS~gi& z(f66OvqgFH0@%o%(u9SYO8XG9?l*^Ir|*lpk1?*Bo*6gap=AQQT{_b1dna9%Wz z>SC#H7K@sH-{LiuQiMkT)?ulNiUx6aF$I4dU#eZkb`us^OFbdakQnCY>@cvbVbU7$ z&(Z!dYTZg@lJiz*mT7LvnV-)Mt70#O?92Qx$$Vva@KHh8QAwJ+s(a>paUb&F{ucJc zPxgoD)TY)82>v=Gd*i9)?K|Ga;-YF$?mJv2W|4Wh049yJ(I(Y&oW)KP>GKjOiuq2X2SeB7KIP zMCvPiFYsGc+KdO9a;y8Rrns_yT9$=QnWGR#r5WYQ{iXZzDpR+Sfq{>nhj2VlHKq6{ z9WVI~^Nqq?zEE$Gd-8`hreX4dx%@(Q-_Z!c;k`zuY%E;IW1$J}!q4Q(&eA!iUyEu! 
zW?Bkd?IlDr7Iv^Gl8!zA3lG3do%`{6q>to?N#CLn*svPr?QHf7w?Op`U=%$PD z8jrasV$-uo2?-(ue?76NKg4mXBVOz1=-%I}d;nNz%LrA;%K{CIsy;S4bI8AoB3IF5 zjdy0OoLn-u(qkmU9{Uu;`S;t)+K&TlNwLpbk!2dS?V#59{z3i9x^6f8@wf)Y`T2Z| ze9`ZbPM5__2aBanieF}WxpSu!EGyEhX7q~GJGVRK;%H736zp3NqgH*5=jl`*^hRVj zZj4xZ-?-hC`!`KX<(`#u&q_(Q@ljk0nvv*{ut^5c>pM(GZ-25L3YHJ*bD|{s`;ldf z+vYob>%aeep?N#R$d5N~J;;)$<}8#gczIIHZTIV@UD`dI2Q~B1|F5oR!IHrG3L%@} z;(BVh*1C;Q#m~TECy(SJi>=o9A%B0)3ddKUh9)_dqi#$1^CxO&)^gl=TrYIdG0+^N z6ANsOm;cy*S+OFuIeJz$q32jV(_Bd``5GMYx1$g_cMz9W$&;I~lz{SCs5D4I-l1=l z@#^niVE!e+t&nt2sAJf)S$LN@{j39fE0>ALn%h8x7u4lo8>x|o;f@?%rK_T>j8Ux8 z6xbJ1;nMm^=r9D8)pMn4h3`?A?H*MbkKbB1RcGutzMHbDHrN+cNfd0EaoDX>W8EeF zV(pjj@EK9^=W7E^c$%b9UBMYJAnmGrPb6Z9#Sz9~ym^J4_$QU#CpbX>t*zxiNTH`Z zPsC|%(RsPaeaGe7yPP>C>(^|mBG`N_=r|D{u0`w}r#w|*)zG)8Q88M5orD8Dk zRlK?{)aS0I^mlA@E7&dP*PbO75W&(+`4z zS57~evL&S{eB4W#D91KNN5!<7+=H2w>ZS_vEXt74N1cl9yi!q>C2V}%EZa%W-`~pI zoxa>{Wm?a+qBzAV%^UaLscA~A5U1ha2x&)O*Z)IpD6fEB2X+(7S>X?@Knj0%Qak7# zxSsMtb4it{cuvsCW-oav$tx;5l$m!)Tzgv4acN$4)U20vR4g=Uk%@<%J|1gVY?slJ z^x8b3zG8&CIHjN{*Sklp<9Lrm16wJBNR>dhsB>nlQB~Du$A8`9qwfBs?Eg4RLb=B zeA{MJ<>i%Pu*FDY`;g}?OZY0PYL|F;B=?`B3uV=`{(1>ZA8m&3iM(9j%Go}4Gfo#f z5oy5lnWrBRoMy!~NQUuSY_+*=ak8l?TuKe5-iAW`O59F6@AyNV^UrI(DI~P{u{uQu z{{CP~tvIRATt%%F|C^dHL~!Tu2mMWl|J8}LN-Nd!JGIXI?+Q*OdTBROFttA=9tKtT zlLM-$yiGY$%zPQ?ct0~WFjSM85L}+6zh*`So!Om3w?n1d20v6=bP9v2ECcJm9qY6m zwV%zbf9^XTOB7WL)#^`LQvN#nOL4hB-!QE-Mpc1vB?A2a{Tz8cg%SpFawJpZ}7>6;El_X}Q>j z6otoLqVEhF*#}e@7R$Y$lS0UkJ7QAolr@!=raAGeu-?l_VMb^llf8R)m9>XIINSfn zB$mmhX*@cBiJ2|URjG0y)N5TRUeRI0G?ztN(OICVzjS>3%~w_Ih-C^{q55TVhDL!< z7ju_7stQ^Qj)uZw36oYR^Q{FK!bK{y@WoKl^_L_PAK5)LiQ6GD;cHKSy?6X@# z=b-8e#N~~iJjqbSgIat z{U^bK85{vqw=cN*T9)q3}x)w+b z>%GL^_ev7cy80nBj6(Mm4Sqp&tK4=R;vMCA9BRzjA-a?${LUuAX3Z+fk$*7cPLz`z z&xFnp%6f_;@2V;q3K*ZT4EBm!KbeKkh`i&~2@sj}OVUd=vNtG6@IKUElH}ZR{za}! 
z`o?obNF`Kd7O(OTVwU~xN2ME%redb!Qdgtk=cMM)%JG=+1tM1OJwM90Bsv2$mDqzk zXYpzSNv>SR=Gt4P{(JH^zAYptt=ruJ%EHEWyBGZRF=jyJTVAu;cAW|xmw5{Pn4~;i z6>1vcTJpM!`7GURE8zB6)>t<`hH zcS(m&y{$Kvf|W%K;jGhUpCVAk=zNlNxVq+#F=cp46c~j6y{p=gOS83*Igk9jHOE;i z`6a(!Ig5fx#dsP;tSJ_|Q!k4&=5(*B!?yZR*KRortMU1-?&8_Ji)+i8bjgfrAF4FP zC5*-mI^FhU^)&UAxhdSk^RhU9-XqZ4Em_v-R1W@xs>PQilp* zJJvz;oD-IHA2TvWm_K|A;e{Uq^KhNeW%iI0hwv>6SutJ6SbD3W1gobX-j@%9&=8Y zHL^qszw9|))6)%x;c0q?NsUV!XqRxI*(D67Wmk3_I*y3{+&Bt-BT!Z2MzL{dI5>2n z+{6}t$b9c^py|d%84k6v;UJ-L-SU-F5o~K>GDj%NJA96!!Nr(Md0lDpN42%Ae=7Zn z-C^WGyhLf*Oto1e$xW`aQtR&KCB`l9&Xu@^>9dY~hMNqF$$`!wvu5dzE2a^qD3Nie z$faB}oRl!zgma?V&>(>-&NlfT8U^^LcxHixiSgHT*o_BhZemL@DyHh~On8|M$(cR$ zgp1OfuO8pP_BjuLtSr@!lH7t>veI!2xvBKSVhN4Z{i`OSDt*dK-H~q_K4_&WRnSKJdNBrA*rVyAfq-gj9nHSmCOqkm3cKCS1hb9 zRPxuGQLGq=RkG`ux}&RMVakNBOQqM1sUq~cI!@WBY)Ua!{?hokH|_L;qprJ3e3it# zdyT1KcwvSj_ov?F1=sWV`S+_OUd92PLfCeLzceeJuAJs%^l`l#Y_%RXlXqdB7|b!X zl+`bv`-VX1Ae@W}i=D@21l>378zu&tTBO=MvVM-THL6P-YQ;ZEyCJcPw-8lEILQ=@ zyOufeU;j|PqQ|6Av_3qXrxJQ+e|IH`*Ua{gslZ!C@F_EYUe8U&tHjH!zrJ(lUKMqk z4?xlSAv4qbtbqoZE>6PzGruZVe69VHUiG+(JE`uzfTPci_j2EeO@@O{=jwIKtxR8z z;S78CXqV81`|LGrDI>Ut63WWp8*lwMuZ-lG(2!UyDJlrH3yS$8WXfejF30gFUy3X` z+tnw=Of=tjX7pQr%;Ays`SVuw(}f|hva1-&gqp;n2;A|Q80`o@`BC>g0Jwh_uavyQ zF_L(h1!uLYxe_(wky~ALKexS$dsQ+$L~b-0Nxem`esS?3JNC)WMpATBw(gP&K66&R zc6C$VKIS&`+WD@-!R&6X-^6>+hp_y$;J6itmThJWn-L>{tW2 zy37Wf9d=be9GU!b;zb;n>yx26=~ZL+d9MU6m})p7o0>Pd@OQ(Mh3{{}lmF;QBQI(0 zBgIIjtZs;0J(g--|BfqZ3SemofX%M0q_0ZYc9Ch@T8b(zROy@q>Yex74K@dvYL8Wg z6h8n~=M3qWs-}lS=zRQ?EKl8dT@F3sIdv}7s=5T0C#wWZ{8S@^R|yyzgg2;?>9oqC zWLfw1!VU$lS@bPevPmR>!Q5}*vDb8}gMxpp607Lex}{cXT6K$-iaQ1S z!uC!SS`!4Ul?$G&%^mBeTD4ScD}S?HzPEpCjh*&2%0!KUDdo1TDJ+6d+LPeVH}M;> z^YCT;#2^3)c&FGKNS5$DRK4!B1x$Y0XM$ zO05^#d{fuj{d@WC#o22=2O5nvf|=Pg<7FN3G#B{q)@i|Zq|y{iR8vT;kWzZITODX!MNb zj>U^EF5J0{nkEX*y;Dt6#kG#=cJ#<-Bdigra|LQavcXctXTvK2rheXNeQtKjC;5>I zqi^nudha;HlRlt@ADFKnm=+F@|9t>Rck&-M)qR=^wg`V$-ikbrJr{+mFRukPDuRi~ 
zIpeXb?}RRj?cmOcnl7&Dzf;ya_`O7=Q`xFo9t@FT)M1ueDI+=Y%ku&1t6d5&Ia#`h#ZBs(_V))zZ*$RxL-p&0H0mx
zvmMRO82Sp3{?|VjHaSigTTXmdQ{7Z8SqCo;+%Oom>&e}g;VV@JDU~Pc?u^mrh#^kI
z{s_Ho|LZ_Tgd0PutiH3>&tqw;b=~>qv%!47s*-Ul2NQpVj|1e*fmFX^(w*NLH(*7N
z{_{e!E6`2)nye`jNftzyRyx57k_A-WWR~3B{*PtNTX4#4k*;D$>OHUQLycFKYD!5~d^%MX2Nt=hv9QMct{QqwP05r|+2><{9

literal 0
Hc-jL100001

diff --git a/src/documents/tests/samples/simple.txt b/src/documents/tests/samples/simple.txt
new file mode 100644
index 0000000000..6de7b8c69d
--- /dev/null
+++ b/src/documents/tests/samples/simple.txt
@@ -0,0 +1 @@
+This is a test file.
diff --git a/src/documents/tests/test_migration_archive_files.py b/src/documents/tests/test_migration_archive_files.py
new file mode 100644
index 0000000000..534a5b4992
--- /dev/null
+++ b/src/documents/tests/test_migration_archive_files.py
@@ -0,0 +1,175 @@
+import hashlib
+import os
+import shutil
+from pathlib import Path
+
+from django.conf import settings
+from django.test import override_settings
+
+from documents.sanity_checker import SanityFailedError
+from documents.tasks import sanity_check
+from documents.tests.utils import DirectoriesMixin, TestMigrations
+
+
+STORAGE_TYPE_GPG = "gpg"
+
+
+def _md5_of(path):
+    """Return the hexadecimal MD5 digest of the file at *path*."""
+    with open(path, "rb") as handle:
+        return hashlib.md5(handle.read()).hexdigest()
+
+
+def _sample(*parts):
+    """Return the path of a file below the ``samples`` directory."""
+    return os.path.join(os.path.dirname(__file__), "samples", *parts)
+
+
+def archive_name_from_filename_old(filename):
+    """Old naming scheme: replace the extension of *filename* by ``.pdf``."""
+    return os.path.splitext(filename)[0] + ".pdf"
+
+
+def archive_path_old(self):
+    """Return the archive path of *self* under the old naming scheme."""
+    fname = (
+        archive_name_from_filename_old(self.filename)
+        if self.filename
+        else "{:07}.pdf".format(self.pk)
+    )
+    return os.path.join(settings.ARCHIVE_DIR, fname)
+
+
+def archive_name_from_filename_new(filename):
+    """New naming scheme: append ``.pdf`` unless *filename* ends in it."""
+    extension = os.path.splitext(filename)[1]
+    return filename if extension == ".pdf" else filename + ".pdf"
+
+
+def archive_path_new(self):
+    """Return the archive path of *self* under the new naming scheme."""
+    fname = (
+        archive_name_from_filename_new(self.filename)
+        if self.filename
+        else "{:07}.pdf".format(self.pk)
+    )
+    return os.path.join(settings.ARCHIVE_DIR, fname)
+
+
+def source_path(doc):
+    """Return the path of the original file of *doc*."""
+    if doc.filename:
+        return os.path.join(settings.ORIGINALS_DIR, str(doc.filename))
+
+    fname = "{:07}{}".format(doc.pk, doc.file_type)
+    if doc.storage_type == STORAGE_TYPE_GPG:
+        fname += ".gpg"  # pragma: no cover
+    return os.path.join(settings.ORIGINALS_DIR, fname)
+
+
+def thumbnail_path(doc):
+    """Return the path of the thumbnail of *doc*."""
+    file_name = "{:07}.png".format(doc.pk)
+    encrypted = doc.storage_type == STORAGE_TYPE_GPG
+    return os.path.join(settings.THUMBNAIL_DIR, (file_name + ".gpg") if encrypted else file_name)
+
+
+def make_test_document(document_class, title: str, filename: str, mime_type: str, original: str, archive: str = None, new: bool = False):
+    """Create and save a document whose files exist on disk.
+
+    *archive* is copied to the new archive path if *new* is true.
+    """
+    doc = document_class()
+    doc.filename = filename
+    doc.title = title
+    doc.mime_type = mime_type
+    doc.content = "the content, does not matter for this test"
+
+    shutil.copy2(original, source_path(doc))
+    doc.checksum = _md5_of(original)
+
+    if archive:
+        archive_path_of = archive_path_new if new else archive_path_old
+        shutil.copy2(archive, archive_path_of(doc))
+        doc.archive_checksum = _md5_of(archive)
+
+    doc.save()
+    Path(thumbnail_path(doc)).touch()
+    return doc
+
+
+@override_settings(PAPERLESS_FILENAME_FORMAT="{title}")
+class TestMigrateArchiveFiles(DirectoriesMixin, TestMigrations):
+    """Migrate forward with two documents sharing one old archive path."""
+
+    migrate_from = '1011_auto_20210101_2340'
+    migrate_to = '1012_fix_archive_files'
+
+    def setUpBeforeMigration(self, apps):
+        Document = apps.get_model("documents", "Document")
+        simple_pdf = _sample("simple.pdf")
+        simple_pdf2 = _sample("documents", "originals", "0000002.pdf")
+
+        self.doc_unrelated = make_test_document(
+            Document, "unrelated", "unrelated.txt", "application/pdf",
+            simple_pdf2, simple_pdf2)
+        self.doc_no_archive = make_test_document(
+            Document, "no_archive", "no_archive.txt", "text/plain",
+            _sample("simple.txt"))
+        self.clashA = make_test_document(
+            Document, "clash", "clash.pdf", "application/pdf",
+            simple_pdf, simple_pdf)
+        self.clashB = make_test_document(
+            Document, "clash", "clash.jpg", "image/jpeg",
+            _sample("simple.jpg"), simple_pdf)
+
+        self.assertEqual(archive_path_old(self.clashA), archive_path_old(self.clashB))
+        self.assertRaises(SanityFailedError, sanity_check)
+
+    def testArchiveFilesMigrated(self):
+        Document = self.apps.get_model('documents', 'Document')
+
+        for doc in Document.objects.all():
+            self.assertTrue(os.path.isfile(archive_path_new(self.clashB)))
+            self.assertEqual(_md5_of(source_path(doc)), doc.checksum)
+            if not doc.archive_checksum:
+                continue
+            archive_file = archive_path_new(doc)
+            self.assertTrue(os.path.isfile(archive_file))
+            self.assertEqual(_md5_of(archive_file), doc.archive_checksum)
+
+        # this will raise errors when any inconsistencies remain after migration
+        sanity_check()
+
+
+class TestMigrateArchiveFilesBackwards(DirectoriesMixin, TestMigrations):
+    """Migrate backwards with archive files stored under their new names."""
+
+    migrate_from = '1012_fix_archive_files'
+    migrate_to = '1011_auto_20210101_2340'
+
+    def setUpBeforeMigration(self, apps):
+        Document = apps.get_model("documents", "Document")
+        simple_pdf2 = _sample("documents", "originals", "0000002.pdf")
+
+        self.doc_unrelated = make_test_document(
+            Document, "unrelated", "unrelated.txt", "application/pdf",
+            simple_pdf2, simple_pdf2, new=True)
+        self.doc_no_archive = make_test_document(
+            Document, "no_archive", "no_archive.txt", "text/plain",
+            _sample("simple.txt"), new=True)
+        self.clashB = make_test_document(
+            Document, "clash", "clash.jpg", "image/jpeg",
+            _sample("simple.jpg"), _sample("simple.pdf"), new=True)
+
+    def testArchiveFilesReverted(self):
+        Document = self.apps.get_model('documents', 'Document')
+
+        for doc in Document.objects.all():
+            self.assertTrue(os.path.isfile(archive_path_old(self.clashB)))
+            self.assertEqual(_md5_of(source_path(doc)), doc.checksum)
+            if not doc.archive_checksum:
+                continue
+            archive_file = archive_path_old(doc)
+            self.assertTrue(os.path.isfile(archive_file))
+            self.assertEqual(_md5_of(archive_file), doc.archive_checksum)
-- 
2.47.3