Mirror of https://0xacab.org/jvoisin/mat2, synced 2025-10-06 16:42:57 +02:00
Compare commits · 112 commits (SHA1)

505be24be9, ef8265e86a, 1d75451b77, dc35ef56c8, 3aa76cc58e, 8ff57c5803,
04bb8c8ccf, 3a070b0ab7, 283e5e5787, 513d897ea0, 5a9dc388ad, 5a08f5b7bf,
fe885babee, 1040a594d6, e510a225e3, a98962a0fa, 9a81b3adfd, f1a071d460,
38df679a88, 44f267a596, 5bc88faedf, 83389a63e9, e70ea811c9, 2ae5d909c3,
5896387ade, d4c050a738, f04d4b28fc, da88d30689, f1552b2ccb, 2ba38dd2a1,
b832a59414, 6ce88b8b7f, 2444caccc0, b9dbd12ef9, b2e153b69c, 35dca4bf1c,
4ed30b5e00, 0d25b18d26, d0f3534eff, 8675706c93, 5e196ecef8, 8e98593b02,
df252fd71a, a1c39104fc, 34fbd633fd, f1ceed13b5, 5a5c642a46, 84e302ac93,
7901fdef2e, 1b356b8c6f, c67bbafb2c, 5b606f939d, 156e81fb4c, 9578e4b4ee,
a46a7eb6fa, a24c59b208, 652b8e519f, c14be47f95, 81a3881aa4, e342671ead,
212d9c472c, a88107c9ca, 7f629ed2e3, 719cdf20fa, 2e243355f5, 174d4a0ac0,
fbcf68c280, 9826de3526, ab71c29a28, 3d2842802c, a1a06d023e, 9275d64be5,
0a2a398c9c, 5cf94bd256, de65f4f4d4, 759efa03ee, 9fe6f1023b, e3d817f57e,
2e9adab86a, c8c27dcf38, 120b204988, f3cef319b9, 2d9ba81a84, 072ee1814d,
3649c0ccaf, 119085f28d, e515d907d7, 46bb1b83ea, 1d7e374e5b, 915dc634c4,
10d60bd398, 4192a2daa3, 9ce458cb3b, 907fc591cc, 8255293d1d, 6b7e8ad8c0,
b7a8622682, 3e2890eb9e, 91e80527fc, 7877ba0da5, e2634f7a50, aba9b72d2c,
15dd3d84ff, 588466f4a8, cf89ff45c2, f583d12564, 1c72448e58, f068621628,
fe09d81ab1, 5be66dbe91, ee496cfa7f, 6e2e411a2a
.gitignore (vendored) · 1 line changed

@@ -1,3 +1,4 @@
 .*
 *.pyc
 .coverage
+.eggs
.gitlab-ci.yml

@@ -9,7 +9,7 @@ bandit:
   script:  # TODO: remove B405 and B314
     - apt-get -qqy update
     - apt-get -qqy install --no-install-recommends python3-bandit
-    - bandit ./mat2 --format txt
+    - bandit ./mat2 --format txt --skip B101
     - bandit -r ./nautilus/ --format txt --skip B101
    - bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314

@@ -20,7 +20,7 @@ pylint:
     - apt-get -qqy install --no-install-recommends pylint3 python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0
     - pylint3 --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
     # Once nautilus-python is in Debian, decomment it form the line below
-    - pylint3 --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/nautilus_mat2.py
+    - pylint3 --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py

 pyflakes:
   stage: linting

@@ -35,21 +35,31 @@ mypy:
     - apt-get -qqy update
     - apt-get -qqy install --no-install-recommends python3-pip
     - pip3 install mypy
-    - mypy mat2 libmat2/*.py --ignore-missing-imports
-    - mypy --ignore-missing-imports ./nautilus/nautilus_mat2.py
+    - mypy --ignore-missing-imports mat2 libmat2/*.py ./nautilus/mat2.py

 tests:debian:
   stage: test
   script:
     - apt-get -qqy update
-    - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage
+    - apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg
     - python3-coverage run --branch -m unittest discover -s tests/
-    - python3-coverage report -m --include 'libmat2/*'
+    - python3-coverage report --fail-under=100 -m --include 'libmat2/*'

+tests:fedora:
+  image: fedora
+  stage: test
+  tags:
+    - whitewhale
+  script:
+    - dnf install -y python3 python3-mutagen python3-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 gdk-pixbuf2-modules cairo-gobject cairo python3-cairo perl-Image-ExifTool mailcap
+    - gdk-pixbuf-query-loaders-64 > /usr/lib64/gdk-pixbuf-2.0/2.10.0/loaders.cache
+    - python3 setup.py test
+
 tests:archlinux:
   image: archlinux/base
   stage: test
   tags:
     - whitewhale
   script:
     - pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
     - python3 setup.py test
.mailmap · 5 lines (new file)

@@ -0,0 +1,5 @@
+Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org> totallylegit <totallylegit@dustri.org>
+Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org> jvoisin <julien.voisin@dustri.org>
+Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org> jvoisin <jvoisin@riseup.net>
+
+Daniel Kahn Gillmor <dkg@fifthhorseman.net> dkg <dkg@fifthhorseman.net>
.pylintrc · 13 lines changed

@@ -6,11 +6,12 @@ max-locals=20
 disable=
     fixme,
     invalid-name,
     duplicate-code,
     missing-docstring,
     protected-access,
     abstract-method,
     wrong-import-position,
     catching-non-exception,
     cell-var-from-loop,
     locally-disabled,
     invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
CHANGELOG.md · 51 lines changed

@@ -1,3 +1,54 @@
+# 0.6.0 - 2018-11-10
+
+- Add lightweight cleaning for jpeg
+- Add support for zip files
+- Add support for mp4 files
+- Improve metadata extraction for archives
+- Improve robustness against corrupted embedded files
+- Fix a possible security issue on some terminals (control character
+  injection via --show)
+- Various internal cleanup/improvements
+
+# 0.5.0 - 2018-10-23
+
+- Video (.avi files for now) support, via FFmpeg, optionally
+- Lightweight cleaning for png and tiff files
+- Processing files starting with a dash is now quicker
+- Metadata are now displayed sorted
+- Recursive metadata support for FLAC files
+- Unsupported extensions aren't displayed in `./mat2 -l` anymore
+- Improve the display when no metadata are found
+- Update the logo according to the GNOME guidelines
+- The testsuite is now runnable on the installed version of mat2
+- Various internal cleanup/improvements
+
+# 0.4.0 - 2018-10-03
+
+- There is now a policy, for advanced users, to deal with unknown embedded fileformats
+- Improve the documentation
+- Various minor refactoring
+- Improve how corrupted PNG are handled
+- Dangerous/advanced cli's options no longer have short versions
+- Significant improvements to office files anonymisation
+  - Archive members are sorted lexicographically
+  - XML attributes are sorted lexicographically too
+  - RSID are now stripped
+  - Dangling references in [Content_types].xml are now removed
+- Significant improvements to office files support
+- Anonimysed office files can now be opened by MS Office without warnings
+- The CLI isn't threaded anymore, for it was causing issues
+- Various misc typo fix
+
+# 0.3.1 - 2018-09-01
+
+- Document how to install MAT2 for various distributions
+- Fix various typos in the documentation/comments
+- Add ArchLinux to the CI to ensure that MAT2 is running on it
+- Fix the handling of files with a name ending in `.JPG`
+- Improve the detection of unsupported extensions in upper-case
+- Streamline MAT2's logging
+
+
 # 0.3.0 - 2018-08-03

 - Add a check for missing dependencies
CONTRIBUTING.md

@@ -24,9 +24,14 @@ Since MAT2 is written in Python3, please conform as much as possible to the
 1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md)
 2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file
 3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file
-4. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat.1)
+4. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1)
 5. Commit the changelog, man page, mat2 and setup.py files
 6. Create a tag with `git tag -s $VERSION`
 7. Push the commit with `git push origin master`
 8. Push the tag with `git push --tags`
-9. Do the secret release dance
+9. Create the signed tarball with `git archive --format=tar.xz --prefix=mat-$VERSION/ $VERSION > mat-$VERSION.tar.xz`
+10. Sign the tarball with `gpg --armor --detach-sign mat-$VERSION.tar.xz`
+11. Upload the result on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
+12. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
+13. Tell the [downstreams](https://0xacab.org/jvoisin/mat2/blob/master/INSTALL.md) about it
+14. Do the secret release dance
INSTALL.md · 59 lines (new file)

@@ -0,0 +1,59 @@
+# GNU/Linux
+
+## Fedora
+
+Thanks to [atenart](https://ack.tf/), there is a package available on
+[Fedora's copr]( https://copr.fedorainfracloud.org/coprs/atenart/mat2/ ).
+
+We use copr (cool other packages repo) as the Mat2 Nautilus plugin depends on
+python3-nautilus, which isn't available yet in Fedora (but is distributed
+through this copr).
+
+First you need to enable Mat2's copr:
+
+```
+dnf -y copr enable atenart/mat2
+```
+
+Then you can install both the Mat2 command and Nautilus extension:
+
+```
+dnf -y install mat2 mat2-nautilus
+```
+
+## Debian
+
+There is currently no package for Debian. If you want to help to make this
+happen, there is an [issue](https://0xacab.org/jvoisin/mat2/issues/16) open.
+
+But fear not, there is a way to install it *manually*:
+
+```
+# apt install python3-mutagen python3-gi-cairo gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl gir1.2-glib-2.0 gir1.2-poppler-0.18
+$ git clone https://0xacab.org/jvoisin/mat2.git
+$ cd mat2
+$ ./mat2
+```
+
+and if you want to install the über-fancy Nautilus extension:
+
+```
+# apt install gnome-common gtk-doc-tools libnautilus-extension-dev python-gi-dev
+$ git clone https://github.com/GNOME/nautilus-python
+$ cd nautilus-python
+$ PYTHON=/usr/bin/python3 ./autogen.sh
+$ make
+# make install
+$ mkdir -p ~/.local/share/nautilus-python/extensions/
+$ cp ../nautilus/mat2.py ~/.local/share/nautilus-python/extensions/
+$ PYTHONPATH=/home/$USER/mat2 PYTHON=/usr/bin/python3 nautilus
+```
+
+## Arch Linux
+
+Thanks to [Francois_B](https://www.sciunto.org/), there is an package available on
+[Arch linux's AUR](https://aur.archlinux.org/packages/mat2/).
+
+## Gentoo
+
+MAT2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay).
README.md · 39 lines changed

@@ -30,10 +30,11 @@ metadata.
 - `python3-mutagen` for audio support
 - `python3-gi-cairo` and `gir1.2-poppler-0.18` for PDF support
 - `gir1.2-gdkpixbuf-2.0` for images support
+- `FFmpeg`, optionally, for video support
 - `libimage-exiftool-perl` for everything else

 Please note that MAT2 requires at least Python3.5, meaning that it
-doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3),
+doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3).

 # Running the test suite

@@ -44,22 +45,33 @@ $ python3 -m unittest discover -v
 # How to use MAT2

 ```bash
-usage: mat2 [-h] [-v] [-l] [-s | -L] [files [files ...]]
+usage: mat2 [-h] [-v] [-l] [--check-dependencies] [-V]
+            [--unknown-members policy] [-s | -L]
+            [files [files ...]]

 Metadata anonymisation toolkit 2

 positional arguments:
-  files
+  files                 the files to process

 optional arguments:
-  -h, --help           show this help message and exit
-  -v, --version        show program's version number and exit
-  -l, --list           list all supported fileformats
-  -s, --show           list all the harmful metadata of a file without removing
-                       them
-  -L, --lightweight    remove SOME metadata
+  -h, --help            show this help message and exit
+  -v, --version         show program's version number and exit
+  -l, --list            list all supported fileformats
+  --check-dependencies  check if MAT2 has all the dependencies it needs
+  -V, --verbose         show more verbose status information
+  --unknown-members policy
+                        how to handle unknown members of archive-style files
+                        (policy should be one of: abort, omit, keep)
+  -s, --show            list harmful metadata detectable by MAT2 without
+                        removing them
+  -L, --lightweight     remove SOME metadata
 ```

 Note that MAT2 **will not** clean files in-place, but will produce, for
 example, with a file named "myfile.png" a cleaned version named
 "myfile.cleaned.png".

 # Notes about detecting metadata

 While MAT2 is doing its very best to display metadata when the `--show` flag is

@@ -78,12 +90,15 @@ be cleaned or not.
   tries to deal with *printer dots* too.
 - [pdfparanoia](https://github.com/kanzure/pdfparanoia), that removes
   watermarks from PDF.
+- [Scrambled Exif](https://f-droid.org/packages/com.jarsilio.android.scrambledeggsif/),
+  an open-source Android application to remove metadata from pictures.

 # Contact

-If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues).
-If you think that a more private contact is needed (eg. for reporting security issues),
-you can email Julien (jvoisin) Voisin at `julien.voisin+mat@dustri.org`,
+If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues)
+or the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
+Should a more private contact be needed (eg. for reporting security issues),
+you can email Julien (jvoisin) Voisin at `julien.voisin+mat2@dustri.org`,
 using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`.

 # License
data/mat2.png (binary file not shown) · Before: 235 KiB, after: 28 KiB
data/mat2.svg · 868 lines changed (file diff suppressed because one or more lines are too long) · Before: 351 KiB, after: 34 KiB
doc/implementation_notes.md

@@ -6,7 +6,7 @@ Lightweight cleaning mode
 Due to *popular* request, MAT2 is providing a *lightweight* cleaning mode,
 that only cleans the superficial metadata of your file, but not
-the ones that might be in **embeded** resources. Like for example,
+the ones that might be in **embedded** resources. Like for example,
 images in a PDF or an office document.
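(Sketch, not part of the diff: how a caller selects this lightweight mode after this changeset. Parsers now carry a `lightweight_cleaning` boolean that `remove_all()` consults, replacing the old `remove_all_lightweight()` method, as the libmat2/abstract.py and libmat2/pdf.py hunks below show. The input filename is a placeholder.)

```python
from libmat2 import parser_factory

# get_parser() picks a parser class from the file's mimetype
parser, mimetype = parser_factory.get_parser('./photo.png')
if parser is None:
    raise ValueError('%s is not supported' % mimetype)

parser.lightweight_cleaning = True  # what `mat2 --lightweight` toggles
parser.remove_all()                 # writes ./photo.cleaned.png
print('cleaned version written to', parser.output_filename)
```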
 Revisions handling

@@ -61,3 +61,11 @@ Images handling
 When possible, images are handled like PDF: rendered on a surface, then saved
 to the filesystem. This ensures that every metadata is removed.

+XML attacks
+-----------
+
+Since our threat model conveniently excludes files crafted to specifically
+bypass MAT2, fileformats containing harmful XML are out of our scope.
+But since MAT2 is using [etree](https://docs.python.org/3/library/xml.html#xml-vulnerabilities)
+to process XML, it's "only" vulnerable to DoS, and not memory corruption:
+odds are that the user will notice that the cleaning didn't succeed.
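(Sketch, not part of the diff: the guard this note relies on, as used by libmat2/office.py later in this changeset. Prefer defusedxml's drop-in ElementTree when it is installed, and fall back to the stdlib parser, which is DoS-prone but not memory-unsafe. The parsed path is a placeholder.)

```python
try:  # protect against DoS
    from defusedxml import ElementTree as ET  # type: ignore
except ImportError:
    import xml.etree.ElementTree as ET  # type: ignore

tree = ET.parse('content.xml')  # same API either way
```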
|
@@ -1,16 +1,20 @@
|
||||
.TH MAT2 "1" "August 2018" "MAT2 0.3.0" "User Commands"
|
||||
.TH MAT2 "1" "November 2018" "MAT2 0.6.0" "User Commands"
|
||||
|
||||
.SH NAME
|
||||
mat2 \- the metadata anonymisation toolkit 2
|
||||
|
||||
.SH SYNOPSIS
|
||||
mat2 [\-h] [\-v] [\-l] [\-c] [\-s | \-L]\fR [files [files ...]]
|
||||
\fBmat2\fR [\-h] [\-v] [\-l] [\-V] [-s | -L] [\fIfiles\fR [\fIfiles ...\fR]]
|
||||
|
||||
.SH DESCRIPTION
|
||||
.B mat2
|
||||
removes metadata from various fileformats. It supports a wide variety of file
|
||||
formats, audio, office, images, …
|
||||
|
||||
Careful, mat2 does not clean files in-place, instead, it will produce a file with the word
|
||||
"cleaned" between the filename and its extension, for example "filename.cleaned.png"
|
||||
for a file named "filename.png".
|
||||
|
||||
.SH OPTIONS
|
||||
.SS "positional arguments:"
|
||||
.TP
|
||||
@@ -27,9 +31,15 @@ show program's version number and exit
|
||||
\fB\-l\fR, \fB\-\-list\fR
|
||||
list all supported fileformats
|
||||
.TP
|
||||
\fB\-c\fR, \fB\-\-check\-dependencies\fR
|
||||
\fB\-\-check\-dependencies\fR
|
||||
check if MAT2 has all the dependencies it needs
|
||||
.TP
|
||||
\fB\-V\fR, \fB\-\-verbose\fR
|
||||
show more verbose status information
|
||||
.TP
|
||||
\fB\-\-unknown-members\fR \fIpolicy\fR
|
||||
how to handle unknown members of archive-style files (policy should be one of: abort, omit, keep)
|
||||
.TP
|
||||
\fB\-s\fR, \fB\-\-show\fR
|
||||
list harmful metadata detectable by MAT2 without
|
||||
removing them
|
libmat2/__init__.py

@@ -1,12 +1,15 @@
-#!/bin/env python3
+#!/usr/bin/env python3

 import os
 import collections
+import enum
 import importlib
-from typing import Dict
+from typing import Dict, Optional
+
+from . import exiftool, video

 # make pyflakes happy
 assert Dict
+assert Optional

 # A set of extension that aren't supported, despite matching a supported mimetype
 UNSUPPORTED_EXTENSIONS = {

@@ -35,13 +38,13 @@ DEPENDENCIES = {
     'mutagen': 'Mutagen',
 }

-def check_dependencies() -> dict:
+
+def check_dependencies() -> Dict[str, bool]:
     ret = collections.defaultdict(bool)  # type: Dict[str, bool]

-    exiftool = '/usr/bin/exiftool'
-    ret['Exiftool'] = False
-    if os.path.isfile(exiftool) and os.access(exiftool, os.X_OK):  # pragma: no cover
-        ret['Exiftool'] = True
+    ret['Exiftool'] = True if exiftool._get_exiftool_path() else False
+    ret['Ffmpeg'] = True if video._get_ffmpeg_path() else False

     for key, value in DEPENDENCIES.items():
         ret[value] = True

@@ -51,3 +54,9 @@ def check_dependencies() -> dict:
         ret[value] = False  # pragma: no cover

     return ret
+
+
+@enum.unique
+class UnknownMemberPolicy(enum.Enum):
+    ABORT = 'abort'
+    OMIT = 'omit'
+    KEEP = 'keep'
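(Sketch, not part of the diff: how the new enum and the reworked dependency check fit together. The .docx path is a placeholder, and `UnknownMemberPolicy('omit')` is the programmatic equivalent of `mat2 --unknown-members omit`.)

```python
from libmat2 import UnknownMemberPolicy, check_dependencies, parser_factory

print(check_dependencies())  # e.g. {'Exiftool': True, 'Ffmpeg': False, ...}

parser, _ = parser_factory.get_parser('./report.docx')
# archive-style parsers default to UnknownMemberPolicy.ABORT
parser.unknown_member_policy = UnknownMemberPolicy('omit')
parser.remove_all()
```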
libmat2/abstract.py

@@ -1,13 +1,15 @@
 import abc
 import os
-from typing import Set, Dict
+import re
+from typing import Set, Dict, Union

 assert Set  # make pyflakes happy


 class AbstractParser(abc.ABC):
-    """ This is the base classe of every parser.
-    It might yield `ValueError` on instanciation on invalid files.
+    """ This is the base class of every parser.
+    It might yield `ValueError` on instantiation on invalid files,
+    and `RuntimeError` when something went wrong in `remove_all`.
     """
     meta_list = set()  # type: Set[str]
     mimetypes = set()  # type: Set[str]

@@ -16,21 +18,23 @@ class AbstractParser(abc.ABC):
         """
         :raises ValueError: Raised upon an invalid file
         """
+        if re.search('^[a-z0-9./]', filename) is None:
+            # Some parsers are calling external binaries,
+            # this prevents shell command injections
+            filename = os.path.join('.', filename)
+
         self.filename = filename
         fname, extension = os.path.splitext(filename)
         self.output_filename = fname + '.cleaned' + extension
+        self.lightweight_cleaning = False

     @abc.abstractmethod
-    def get_meta(self) -> Dict[str, str]:
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
         pass  # pragma: no cover

     @abc.abstractmethod
     def remove_all(self) -> bool:
-        pass  # pragma: no cover
-
-    def remove_all_lightweight(self) -> bool:
-        """ This method removes _SOME_ metadata.
-        It might be useful to implement it for fileformats that do
-        not support non-destructive cleaning.
-        """
-        return self.remove_all()
+        """
+        :raises RuntimeError: Raised if the cleaning process went wrong.
+        """
+        pass  # pragma: no cover
libmat2/archive.py
Normal file
164
libmat2/archive.py
Normal file
@@ -0,0 +1,164 @@
|
||||
import zipfile
|
||||
import datetime
|
||||
import tempfile
|
||||
import os
|
||||
import logging
|
||||
import shutil
|
||||
from typing import Dict, Set, Pattern, Union
|
||||
|
||||
from . import abstract, UnknownMemberPolicy, parser_factory
|
||||
|
||||
# Make pyflakes happy
|
||||
assert Set
|
||||
assert Pattern
|
||||
assert Union
|
||||
|
||||
|
||||
class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
""" Office files (.docx, .odt, …) are zipped files. """
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
|
||||
# Those are the files that have a format that _isn't_
|
||||
# supported by MAT2, but that we want to keep anyway.
|
||||
self.files_to_keep = set() # type: Set[Pattern]
|
||||
|
||||
# Those are the files that we _do not_ want to keep,
|
||||
# no matter if they are supported or not.
|
||||
self.files_to_omit = set() # type: Set[Pattern]
|
||||
|
||||
# what should the parser do if it encounters an unknown file in
|
||||
# the archive?
|
||||
self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
|
||||
|
||||
try: # better fail here than later
|
||||
zipfile.ZipFile(self.filename)
|
||||
except zipfile.BadZipFile:
|
||||
raise ValueError
|
||||
|
||||
def _specific_cleanup(self, full_path: str) -> bool:
|
||||
""" This method can be used to apply specific treatment
|
||||
to files present in the archive."""
|
||||
# pylint: disable=unused-argument,no-self-use
|
||||
return True # pragma: no cover
|
||||
|
||||
@staticmethod
|
||||
def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
|
||||
zipinfo.create_system = 3 # Linux
|
||||
zipinfo.comment = b''
|
||||
zipinfo.date_time = (1980, 1, 1, 0, 0, 0) # this is as early as a zipfile can be
|
||||
return zipinfo
|
||||
|
||||
@staticmethod
|
||||
def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
|
||||
metadata = {}
|
||||
if zipinfo.create_system == 3: # this is Linux
|
||||
pass
|
||||
elif zipinfo.create_system == 2:
|
||||
metadata['create_system'] = 'Windows'
|
||||
else:
|
||||
metadata['create_system'] = 'Weird'
|
||||
|
||||
if zipinfo.comment:
|
||||
metadata['comment'] = zipinfo.comment # type: ignore
|
||||
|
||||
if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
|
||||
metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))
|
||||
|
||||
return metadata
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
meta = dict() # type: Dict[str, Union[str, dict]]
|
||||
|
||||
with zipfile.ZipFile(self.filename) as zin:
|
||||
temp_folder = tempfile.mkdtemp()
|
||||
|
||||
for item in zin.infolist():
|
||||
if item.filename[-1] == '/': # pragma: no cover
|
||||
# `is_dir` is added in Python3.6
|
||||
continue # don't keep empty folders
|
||||
|
||||
zin.extract(member=item, path=temp_folder)
|
||||
full_path = os.path.join(temp_folder, item.filename)
|
||||
|
||||
tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore
|
||||
if not tmp_parser:
|
||||
continue
|
||||
|
||||
local_meta = tmp_parser.get_meta()
|
||||
if local_meta:
|
||||
meta[item.filename] = local_meta
|
||||
|
||||
shutil.rmtree(temp_folder)
|
||||
return meta
|
||||
|
||||
def remove_all(self) -> bool:
|
||||
# pylint: disable=too-many-branches
|
||||
|
||||
with zipfile.ZipFile(self.filename) as zin,\
|
||||
zipfile.ZipFile(self.output_filename, 'w') as zout:
|
||||
|
||||
temp_folder = tempfile.mkdtemp()
|
||||
abort = False
|
||||
|
||||
# Since files order is a fingerprint factor,
|
||||
# we're iterating (and thus inserting) them in lexicographic order.
|
||||
for item in sorted(zin.infolist(), key=lambda z: z.filename):
|
||||
if item.filename[-1] == '/': # `is_dir` is added in Python3.6
|
||||
continue # don't keep empty folders
|
||||
|
||||
zin.extract(member=item, path=temp_folder)
|
||||
full_path = os.path.join(temp_folder, item.filename)
|
||||
|
||||
if self._specific_cleanup(full_path) is False:
|
||||
logging.warning("Something went wrong during deep cleaning of %s",
|
||||
item.filename)
|
||||
abort = True
|
||||
continue
|
||||
|
||||
if any(map(lambda r: r.search(item.filename), self.files_to_keep)):
|
||||
# those files aren't supported, but we want to add them anyway
|
||||
pass
|
||||
elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
|
||||
continue
|
||||
else: # supported files that we want to first clean, then add
|
||||
tmp_parser, mtype = parser_factory.get_parser(full_path) # type: ignore
|
||||
if not tmp_parser:
|
||||
if self.unknown_member_policy == UnknownMemberPolicy.OMIT:
|
||||
logging.warning("In file %s, omitting unknown element %s (format: %s)",
|
||||
self.filename, item.filename, mtype)
|
||||
continue
|
||||
elif self.unknown_member_policy == UnknownMemberPolicy.KEEP:
|
||||
logging.warning("In file %s, keeping unknown element %s (format: %s)",
|
||||
self.filename, item.filename, mtype)
|
||||
else:
|
||||
logging.error("In file %s, element %s's format (%s) " +
|
||||
"isn't supported",
|
||||
self.filename, item.filename, mtype)
|
||||
abort = True
|
||||
continue
|
||||
if tmp_parser:
|
||||
if tmp_parser.remove_all() is False:
|
||||
logging.warning("In file %s, something went wrong \
|
||||
with the cleaning of %s \
|
||||
(format: %s)",
|
||||
self.filename, item.filename, mtype)
|
||||
abort = True
|
||||
continue
|
||||
os.rename(tmp_parser.output_filename, full_path)
|
||||
|
||||
zinfo = zipfile.ZipInfo(item.filename) # type: ignore
|
||||
clean_zinfo = self._clean_zipinfo(zinfo)
|
||||
with open(full_path, 'rb') as f:
|
||||
zout.writestr(clean_zinfo, f.read())
|
||||
|
||||
shutil.rmtree(temp_folder)
|
||||
if abort:
|
||||
os.remove(self.output_filename)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
|
||||
class ZipParser(ArchiveBasedAbstractParser):
|
||||
mimetypes = {'application/zip'}
|
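(Sketch, not part of the diff: the anonymisation recipe that `_clean_zipinfo` and the sorted iteration implement, restated standalone: fixed creator system, empty comment, the earliest timestamp a zip can carry, and members inserted in lexicographic order so the member order itself can't fingerprint the producer. File names are illustrative.)

```python
import zipfile

with zipfile.ZipFile('clean.zip', 'w') as zout:
    for name in sorted(['word/document.xml', 'mimetype']):
        zinfo = zipfile.ZipInfo(name)
        zinfo.create_system = 3  # Linux
        zinfo.comment = b''
        zinfo.date_time = (1980, 1, 1, 0, 0, 0)  # zip's epoch
        zout.writestr(zinfo, b'...')  # payload comes from the cleaned member
```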
libmat2/audio.py

@@ -1,8 +1,12 @@
+import mimetypes
+import os
 import shutil
+import tempfile
+from typing import Dict, Union

 import mutagen

-from . import abstract
+from . import abstract, parser_factory

@@ -13,13 +17,13 @@ class MutagenParser(abstract.AbstractParser):
         except mutagen.MutagenError:
             raise ValueError

-    def get_meta(self):
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
         f = mutagen.File(self.filename)
         if f.tags:
             return {k:', '.join(v) for k, v in f.tags.items()}
         return {}

-    def remove_all(self):
+    def remove_all(self) -> bool:
         shutil.copy(self.filename, self.output_filename)
         f = mutagen.File(self.output_filename)
         f.delete()

@@ -30,8 +34,8 @@ class MutagenParser(abstract.AbstractParser):
 class MP3Parser(MutagenParser):
     mimetypes = {'audio/mpeg', }

-    def get_meta(self):
-        metadata = {}
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
+        metadata = {}  # type: Dict[str, Union[str, dict]]
         meta = mutagen.File(self.filename).tags
         for key in meta:
             metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))

@@ -44,3 +48,30 @@ class OGGParser(MutagenParser):

 class FLACParser(MutagenParser):
     mimetypes = {'audio/flac', 'audio/x-flac'}
+
+    def remove_all(self) -> bool:
+        shutil.copy(self.filename, self.output_filename)
+        f = mutagen.File(self.output_filename)
+        f.clear_pictures()
+        f.delete()
+        f.save(deleteid3=True)
+        return True
+
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
+        meta = super().get_meta()
+        for num, picture in enumerate(mutagen.File(self.filename).pictures):
+            name = picture.desc if picture.desc else 'Cover %d' % num
+            extension = mimetypes.guess_extension(picture.mime)
+            if extension is None:  # pragma: no cover
+                meta[name] = 'harmful data'
+                continue
+
+            _, fname = tempfile.mkstemp()
+            fname = fname + extension
+            with open(fname, 'wb') as f:
+                f.write(picture.data)
+            p, _ = parser_factory.get_parser(fname)  # type: ignore
+            # Mypy chokes on ternaries :/
+            meta[name] = p.get_meta() if p else 'harmful data'  # type: ignore
+            os.remove(fname)
+        return meta
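(Sketch, not part of the diff: what FLACParser now does with embedded cover art, using mutagen directly; `song.flac` is a placeholder.)

```python
import mutagen

f = mutagen.File('song.flac')
for num, picture in enumerate(f.pictures):  # embedded cover art
    name = picture.desc if picture.desc else 'Cover %d' % num
    print(name, picture.mime, len(picture.data), 'bytes')

f.clear_pictures()      # drop the pictures...
f.delete()              # ...and the tags
f.save(deleteid3=True)  # and any stray ID3 block
```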
libmat2/exiftool.py · 66 lines (new file)

@@ -0,0 +1,66 @@
+import json
+import logging
+import os
+import subprocess
+from typing import Dict, Union, Set
+
+from . import abstract
+
+# Make pyflakes happy
+assert Set
+
+
+class ExiftoolParser(abstract.AbstractParser):
+    """ Exiftool is often the easiest way to get all the metadata
+    from a import file, hence why several parsers are re-using its `get_meta`
+    method.
+    """
+    meta_whitelist = set()  # type: Set[str]
+
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
+        out = subprocess.check_output([_get_exiftool_path(), '-json', self.filename])
+        meta = json.loads(out.decode('utf-8'))[0]
+        for key in self.meta_whitelist:
+            meta.pop(key, None)
+        return meta
+
+    def _lightweight_cleanup(self) -> bool:
+        if os.path.exists(self.output_filename):
+            try:
+                # exiftool can't force output to existing files
+                os.remove(self.output_filename)
+            except OSError as e:  # pragma: no cover
+                logging.error("The output file %s is already existing and \
+                               can't be overwritten: %s.", self.filename, e)
+                return False
+
+        # Note: '-All=' must be followed by a known exiftool option.
+        # Also, '-CommonIFD0' is needed for .tiff files
+        cmd = [_get_exiftool_path(),
+               '-all=',         # remove metadata
+               '-adobe=',       # remove adobe-specific metadata
+               '-exif:all=',    # remove all exif metadata
+               '-Time:All=',    # remove all timestamps
+               '-quiet',        # don't show useless logs
+               '-CommonIFD0=',  # remove IFD0 metadata
+               '-o', self.output_filename,
+               self.filename]
+        try:
+            subprocess.check_call(cmd)
+        except subprocess.CalledProcessError as e:  # pragma: no cover
+            logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
+            return False
+        return True
+
+
+def _get_exiftool_path() -> str:  # pragma: no cover
+    possible_pathes = {
+        '/usr/bin/exiftool',              # debian/fedora
+        '/usr/bin/vendor_perl/exiftool',  # archlinux
+    }
+
+    for possible_path in possible_pathes:
+        if os.path.isfile(possible_path):
+            if os.access(possible_path, os.X_OK):
+                return possible_path
+
+    raise RuntimeError("Unable to find exiftool")
libmat2/harmless.py

@@ -1,5 +1,5 @@
 import shutil
-from typing import Dict
+from typing import Dict, Union
 from . import abstract

@@ -7,7 +7,7 @@ class HarmlessParser(abstract.AbstractParser):
     """ This is the parser for filetypes that can not contain metadata. """
     mimetypes = {'text/plain', 'image/x-ms-bmp'}

-    def get_meta(self) -> Dict[str, str]:
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
         return dict()

     def remove_all(self) -> bool:
libmat2/images.py

@@ -1,56 +1,19 @@
-import subprocess
 import imghdr
-import json
 import os
-import shutil
-import tempfile
-import re
-from typing import Set

 import cairo

 import gi
 gi.require_version('GdkPixbuf', '2.0')
-from gi.repository import GdkPixbuf
+from gi.repository import GdkPixbuf, GLib

-from . import abstract
+from . import exiftool

-# Make pyflakes happy
-assert Set
-
-
-class _ImageParser(abstract.AbstractParser):
-    """ Since we use `exiftool` to get metadata from
-    all images fileformat, `get_meta` is implemented in this class,
-    and all the image-handling ones are inheriting from it."""
-    meta_whitelist = set()  # type: Set[str]
-
-    @staticmethod
-    def __handle_problematic_filename(filename: str, callback) -> str:
-        """ This method takes a filename with a problematic name,
-        and safely applies it a `callback`."""
-        tmpdirname = tempfile.mkdtemp()
-        fname = os.path.join(tmpdirname, "temp_file")
-        shutil.copy(filename, fname)
-        out = callback(fname)
-        shutil.rmtree(tmpdirname)
-        return out
-
-    def get_meta(self):
-        """ There is no way to escape the leading(s) dash(es) of the current
-        self.filename to prevent parameter injections, so we need to take care
-        of this.
-        """
-        fun = lambda f: subprocess.check_output(['/usr/bin/exiftool', '-json', f])
-        if re.search('^[a-z0-9/]', self.filename) is None:
-            out = self.__handle_problematic_filename(self.filename, fun)
-        else:
-            out = fun(self.filename)
-        meta = json.loads(out.decode('utf-8'))[0]
-        for key in self.meta_whitelist:
-            meta.pop(key, None)
-        return meta
-
-class PNGParser(_ImageParser):
+
+class PNGParser(exiftool.ExiftoolParser):
     mimetypes = {'image/png', }
     meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
                       'Directory', 'FileSize', 'FileModifyDate',

@@ -62,36 +25,48 @@ class PNGParser(_ImageParser):
     def __init__(self, filename):
         super().__init__(filename)
-        try:  # better fail here than later
-            cairo.ImageSurface.create_from_png(self.filename)
-        except MemoryError:
+
+        if imghdr.what(filename) != 'png':
             raise ValueError

-    def remove_all(self):
+        try:  # better fail here than later
+            cairo.ImageSurface.create_from_png(self.filename)
+        except MemoryError:  # pragma: no cover
+            raise ValueError
+
+    def remove_all(self) -> bool:
+        if self.lightweight_cleaning:
+            return self._lightweight_cleanup()
         surface = cairo.ImageSurface.create_from_png(self.filename)
         surface.write_to_png(self.output_filename)
         return True


-class GdkPixbufAbstractParser(_ImageParser):
+class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
     """ GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
     this has the side-effect of completely removing metadata.
     """
     _type = ''

-    def remove_all(self):
-        _, extension = os.path.splitext(self.filename)
-        pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
-        if extension == '.jpg':
-            extension = '.jpeg'  # gdk is picky
-        pixbuf.savev(self.output_filename, extension[1:], [], [])
-        return True
-
     def __init__(self, filename):
         super().__init__(filename)
-        if imghdr.what(filename) != self._type:  # better safe than sorry
+        # we can't use imghdr here because of https://bugs.python.org/issue28591
+        try:
+            GdkPixbuf.Pixbuf.new_from_file(self.filename)
+        except GLib.GError:
             raise ValueError

+    def remove_all(self) -> bool:
+        if self.lightweight_cleaning:
+            return self._lightweight_cleanup()
+
+        _, extension = os.path.splitext(self.filename)
+        pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
+        if extension.lower() == '.jpg':
+            extension = '.jpeg'  # gdk is picky
+        pixbuf.savev(self.output_filename, type=extension[1:], option_keys=[], option_values=[])
+        return True


 class JPGParser(GdkPixbufAbstractParser):
     _type = 'jpeg'
libmat2/office.py

@@ -1,127 +1,46 @@
-import logging
 import os
 import re
-import shutil
-import tempfile
-import datetime
 import zipfile
+import logging
-from typing import Dict, Set, Pattern
+from typing import Dict, Set, Pattern, Tuple, Union

 try:  # protect against DoS
     from defusedxml import ElementTree as ET  # type: ignore
 except ImportError:
     import xml.etree.ElementTree as ET  # type: ignore

+from .archive import ArchiveBasedAbstractParser
-from . import abstract, parser_factory
+# pylint: disable=line-too-long

 # Make pyflakes happy
 assert Set
 assert Pattern

-logging.basicConfig(level=logging.ERROR)
-
-def _parse_xml(full_path: str):
-    """ This function parse XML, with namespace support. """
+
+def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]:
+    """ This function parses XML, with namespace support. """
     namespace_map = dict()
     for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
+        # The ns[0-9]+ namespaces are reserved for internal usage, so
+        # we have to use an other nomenclature.
+        if re.match('^ns[0-9]+$', key, re.I):  # pragma: no cover
+            key = 'mat' + key[2:]
+
         namespace_map[key] = value
         ET.register_namespace(key, value)

     return ET.parse(full_path), namespace_map


-class ArchiveBasedAbstractParser(abstract.AbstractParser):
-    """ Office files (.docx, .odt, …) are zipped files. """
-    # Those are the files that have a format that _isn't_
-    # supported by MAT2, but that we want to keep anyway.
-    files_to_keep = set()  # type: Set[str]
-
-    # Those are the files that we _do not_ want to keep,
-    # no matter if they are supported or not.
-    files_to_omit = set()  # type: Set[Pattern]
-
-    def __init__(self, filename):
-        super().__init__(filename)
-        try:  # better fail here than later
-            zipfile.ZipFile(self.filename)
-        except zipfile.BadZipFile:
-            raise ValueError
-
-    def _specific_cleanup(self, full_path: str) -> bool:
-        """ This method can be used to apply specific treatment
-        to files present in the archive."""
-        # pylint: disable=unused-argument,no-self-use
-        return True  # pragma: no cover
-
-    @staticmethod
-    def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
-        zipinfo.create_system = 3  # Linux
-        zipinfo.comment = b''
-        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)  # this is as early as a zipfile can be
-        return zipinfo
-
-    @staticmethod
-    def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]:
-        metadata = {}
-        if zipinfo.create_system == 3:  # this is Linux
-            pass
-        elif zipinfo.create_system == 2:
-            metadata['create_system'] = 'Windows'
-        else:
-            metadata['create_system'] = 'Weird'
-
-        if zipinfo.comment:
-            metadata['comment'] = zipinfo.comment  # type: ignore
-
-        if zipinfo.date_time != (1980, 1, 1, 0, 0, 0):
-            metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time))
-
-        return metadata
-
-    def remove_all(self) -> bool:
-        with zipfile.ZipFile(self.filename) as zin,\
-             zipfile.ZipFile(self.output_filename, 'w') as zout:
-
-            temp_folder = tempfile.mkdtemp()
-
-            for item in zin.infolist():
-                if item.filename[-1] == '/':  # `is_dir` is added in Python3.6
-                    continue  # don't keep empty folders
-
-                zin.extract(member=item, path=temp_folder)
-                full_path = os.path.join(temp_folder, item.filename)
-
-                if self._specific_cleanup(full_path) is False:
-                    shutil.rmtree(temp_folder)
-                    os.remove(self.output_filename)
-                    logging.info("Something went wrong during deep cleaning of %s", item.filename)
-                    return False
-
-                if item.filename in self.files_to_keep:
-                    # those files aren't supported, but we want to add them anyway
-                    pass
-                elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
-                    continue
-                else:
-                    # supported files that we want to clean then add
-                    tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
-                    if not tmp_parser:
-                        shutil.rmtree(temp_folder)
-                        os.remove(self.output_filename)
-                        logging.info("%s's format (%s) isn't supported", item.filename, mtype)
-                        return False
-                    tmp_parser.remove_all()
-                    os.rename(tmp_parser.output_filename, full_path)
-
-                zinfo = zipfile.ZipInfo(item.filename)  # type: ignore
-                clean_zinfo = self._clean_zipinfo(zinfo)
-                with open(full_path, 'rb') as f:
-                    zout.writestr(clean_zinfo, f.read())
-
-            shutil.rmtree(temp_folder)
-            return True
+def _sort_xml_attributes(full_path: str) -> bool:
+    """ Sort xml attributes lexicographically,
+    because it's possible to fingerprint producers (MS Office, Libreoffice, …)
+    since they are all using different orders.
+    """
+    tree = ET.parse(full_path)
+
+    for c in tree.getroot():
+        c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
+
+    tree.write(full_path, xml_declaration=True)
+    return True


 class MSOfficeParser(ArchiveBasedAbstractParser):

@@ -130,18 +49,117 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
         'application/vnd.openxmlformats-officedocument.presentationml.presentation'
     }
-    files_to_keep = {
-        '[Content_Types].xml',
-        '_rels/.rels',
-        'word/_rels/document.xml.rels',
-        'word/document.xml',
-        'word/fontTable.xml',
-        'word/settings.xml',
-        'word/styles.xml',
-    }
-    files_to_omit = set(map(re.compile, {  # type: ignore
-        '^docProps/',
-    }))
+    content_types_to_keep = {
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml',  # /word/endnotes.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml',  # /word/footnotes.xml
+        'application/vnd.openxmlformats-officedocument.extended-properties+xml',  # /docProps/app.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml',  # /word/document.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml',  # /word/fontTable.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml',  # /word/footer.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml',  # /word/header.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml',  # /word/styles.xml
+        'application/vnd.openxmlformats-package.core-properties+xml',  # /docProps/core.xml
+
+        # Do we want to keep the following ones?
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
+
+        # See https://0xacab.org/jvoisin/mat2/issues/71
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml',  # /word/numbering.xml
+    }
+
+    def __init__(self, filename):
+        super().__init__(filename)
+
+        self.files_to_keep = set(map(re.compile, {  # type: ignore
+            r'^\[Content_Types\]\.xml$',
+            r'^_rels/\.rels$',
+            r'^word/_rels/document\.xml\.rels$',
+            r'^word/_rels/footer[0-9]*\.xml\.rels$',
+            r'^word/_rels/header[0-9]*\.xml\.rels$',
+
+            # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
+            r'^word/stylesWithEffects\.xml$',
+        }))
+        self.files_to_omit = set(map(re.compile, {  # type: ignore
+            r'^customXml/',
+            r'webSettings\.xml$',
+            r'^docProps/custom\.xml$',
+            r'^word/printerSettings/',
+            r'^word/theme',
+            r'^word/people\.xml$',
+
+            # we have a whitelist in self.files_to_keep,
+            # so we can trash everything else
+            r'^word/_rels/',
+        }))
+
+        if self.__fill_files_to_keep_via_content_types() is False:
+            raise ValueError
+
+    def __fill_files_to_keep_via_content_types(self) -> bool:
+        """ There is a suer-handy `[Content_Types].xml` file
+        in MS Office archives, describing what each other file contains.
+        The self.content_types_to_keep member contains a type whitelist,
+        so we're using it to fill the self.files_to_keep one.
+        """
+        with zipfile.ZipFile(self.filename) as zin:
+            if '[Content_Types].xml' not in zin.namelist():
+                return False
+            xml_data = zin.read('[Content_Types].xml')
+
+        self.content_types = dict()  # type: Dict[str, str]
+        try:
+            tree = ET.fromstring(xml_data)
+        except ET.ParseError:
+            return False
+        for c in tree:
+            if 'PartName' not in c.attrib or 'ContentType' not in c.attrib:
+                continue
+            elif c.attrib['ContentType'] in self.content_types_to_keep:
+                fname = c.attrib['PartName'][1:]  # remove leading `/`
+                re_fname = re.compile('^' + re.escape(fname) + '$')
+                self.files_to_keep.add(re_fname)  # type: ignore
+        return True
+
+    @staticmethod
+    def __remove_rsid(full_path: str) -> bool:
+        """ The method will remove "revision session ID". We're '}rsid'
+        instead of proper parsing, since rsid can have multiple forms, like
+        `rsidRDefault`, `rsidR`, `rsids`, …
+
+        We're removing rsid tags in two times, because we can't modify
+        the xml while we're iterating on it.
+
+        For more details, see
+        - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
+        - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError:
+            return False
+
+        # rsid, tags or attributes, are always under the `w` namespace
+        if 'w' not in namespace.keys():
+            return True
+
+        parent_map = {c:p for p in tree.iter() for c in p}
+
+        elements_to_remove = list()
+        for item in tree.iterfind('.//', namespace):
+            if '}rsid' in item.tag.strip().lower():  # rsid as tag
+                elements_to_remove.append(item)
+                continue
+            for key in list(item.attrib.keys()):  # rsid as attribute
+                if '}rsid' in key.lower():
+                    del item.attrib[key]
+
+        for element in elements_to_remove:
+            parent_map[element].remove(element)
+
+        tree.write(full_path, xml_declaration=True)
+        return True

     @staticmethod
     def __remove_revisions(full_path: str) -> bool:

@@ -151,7 +169,8 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         """
         try:
             tree, namespace = _parse_xml(full_path)
-        except ET.ParseError:
+        except ET.ParseError as e:
+            logging.error("Unable to parse %s: %s", full_path, e)
             return False

         # Revisions are either deletions (`w:del`) or

@@ -163,39 +182,126 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         parent_map = {c:p for p in tree.iter() for c in p}

-        elements = list()
+        elements_del = list()
         for element in tree.iterfind('.//w:del', namespace):
-            elements.append(element)
-        for element in elements:
+            elements_del.append(element)
+        for element in elements_del:
             parent_map[element].remove(element)

-        elements = list()
+        elements_ins = list()
         for element in tree.iterfind('.//w:ins', namespace):
-            for position, item in enumerate(tree.iter()): #pragma: no cover
+            for position, item in enumerate(tree.iter()):  # pragma: no cover
                 if item == element:
                     for children in element.iterfind('./*'):
-                        elements.append((element, position, children))
+                        elements_ins.append((element, position, children))
                     break
-        for (element, position, children) in elements:
+        for (element, position, children) in elements_ins:
             parent_map[element].insert(position, children)
             parent_map[element].remove(element)

         tree.write(full_path, xml_declaration=True)
         return True

+    def __remove_content_type_members(self, full_path: str) -> bool:
+        """ The method will remove the dangling references
+        form the [Content_Types].xml file, since MS office doesn't like them
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError:  # pragma: no cover
+            return False
+
+        if len(namespace.items()) != 1:
+            return False  # there should be only one namespace for Types
+
+        removed_fnames = set()
+        with zipfile.ZipFile(self.filename) as zin:
+            for fname in [item.filename for item in zin.infolist()]:
+                for file_to_omit in self.files_to_omit:
+                    if file_to_omit.search(fname):
+                        matches = map(lambda r: r.search(fname), self.files_to_keep)
+                        if any(matches):  # the file is whitelisted
+                            continue
+                        removed_fnames.add(fname)
+                        break
+
+        root = tree.getroot()
+        for item in root.findall('{%s}Override' % namespace['']):
+            name = item.attrib['PartName'][1:]  # remove the leading '/'
+            if name in removed_fnames:
+                root.remove(item)
+
+        tree.write(full_path, xml_declaration=True)
+        return True
+
     def _specific_cleanup(self, full_path: str) -> bool:
-        if full_path.endswith('/word/document.xml'):
+        # pylint: disable=too-many-return-statements
+        if os.stat(full_path).st_size == 0:  # Don't process empty files
+            return True
+
+        if not full_path.endswith('.xml'):
+            return True
+
+        if full_path.endswith('/[Content_Types].xml'):
+            # this file contains references to files that we might
+            # remove, and MS Office doesn't like dangling references
+            if self.__remove_content_type_members(full_path) is False:
+                return False
+        elif full_path.endswith('/word/document.xml'):
             # this file contains the revisions
-            return self.__remove_revisions(full_path)
+            if self.__remove_revisions(full_path) is False:
+                return False
+        elif full_path.endswith('/docProps/app.xml'):
+            # This file must be present and valid,
+            # so we're removing as much as we can.
+            with open(full_path, 'wb') as f:
+                f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
+                f.write(b'<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">')
+                f.write(b'</Properties>')
+        elif full_path.endswith('/docProps/core.xml'):
+            # This file must be present and valid,
+            # so we're removing as much as we can.
+            with open(full_path, 'wb') as f:
+                f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
+                f.write(b'<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties">')
+                f.write(b'</cp:coreProperties>')
+
+        if self.__remove_rsid(full_path) is False:
+            return False
+
+        try:
+            _sort_xml_attributes(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+
+        # This is awful, I'm sorry.
+        #
+        # Microsoft Office isn't happy when we have the `mc:Ignorable`
+        # tag containing namespaces that aren't present in the xml file,
+        # so instead of trying to remove this specific tag with etree,
+        # we're removing it, with a regexp.
+        #
+        # Since we're the ones producing this file, via the call to
+        # _sort_xml_attributes, there won't be any "funny tricks".
+        # Worst case, the tag isn't present, and everything is fine.
+        #
+        # see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute
+        with open(full_path, 'rb') as f:
+            text = f.read()
+        out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, 1)
+        with open(full_path, 'wb') as f:
+            f.write(out)

         return True

-    def get_meta(self) -> Dict[str, str]:
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
         """
         Yes, I know that parsing xml with regexp ain't pretty,
         be my guest and fix it if you want.
         """
-        metadata = {}
+        metadata = super().get_meta()
         zipin = zipfile.ZipFile(self.filename)
         for item in zipin.infolist():
             if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):

@@ -222,26 +328,31 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
         'application/vnd.oasis.opendocument.formula',
         'application/vnd.oasis.opendocument.image',
     }
-    files_to_keep = {
-        'META-INF/manifest.xml',
-        'content.xml',
-        'manifest.rdf',
-        'mimetype',
-        'settings.xml',
-        'styles.xml',
-    }
-    files_to_omit = set(map(re.compile, {  # type: ignore
-        r'^meta\.xml$',
-        '^Configurations2/',
-        '^Thumbnails/',
-    }))
+
+    def __init__(self, filename):
+        super().__init__(filename)
+
+        self.files_to_keep = set(map(re.compile, {  # type: ignore
+            r'^META-INF/manifest\.xml$',
+            r'^content\.xml$',
+            r'^manifest\.rdf$',
+            r'^mimetype$',
+            r'^settings\.xml$',
+            r'^styles\.xml$',
+        }))
+        self.files_to_omit = set(map(re.compile, {  # type: ignore
+            r'^meta\.xml$',
+            r'^Configurations2/',
+            r'^Thumbnails/',
+        }))

     @staticmethod
     def __remove_revisions(full_path: str) -> bool:
         try:
             tree, namespace = _parse_xml(full_path)
-        except ET.ParseError:
+        except ET.ParseError as e:
+            logging.error("Unable to parse %s: %s", full_path, e)
             return False

         if 'office' not in namespace.keys():  # no revisions in the current file

@@ -252,15 +363,25 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
                 text.remove(changes)

         tree.write(full_path, xml_declaration=True)

         return True

     def _specific_cleanup(self, full_path: str) -> bool:
-        if os.path.basename(full_path) == 'content.xml':
-            return self.__remove_revisions(full_path)
+        if os.stat(full_path).st_size == 0:  # Don't process empty files
+            return True
+
+        if os.path.basename(full_path).endswith('.xml'):
+            if os.path.basename(full_path) == 'content.xml':
+                if self.__remove_revisions(full_path) is False:
+                    return False
+
+            try:
+                _sort_xml_attributes(full_path)
+            except ET.ParseError as e:
+                logging.error("Unable to parse %s: %s", full_path, e)
+                return False
         return True

-    def get_meta(self) -> Dict[str, str]:
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
         """
         Yes, I know that parsing xml with regexp ain't pretty,
         be my guest and fix it if you want.
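(Sketch, not part of the diff: the two XML normalisation steps above, `_parse_xml` and `_sort_xml_attributes`, restated standalone. Registering each namespace under its original prefix stops etree from emitting its generated `ns0`, `ns1`, ... prefixes, and re-sorting children removes producer-specific element order; `part.xml` is a placeholder.)

```python
import xml.etree.ElementTree as ET

# keep the document's own namespace prefixes across the rewrite
for _, (prefix, uri) in ET.iterparse('part.xml', ('start-ns',)):
    ET.register_namespace(prefix, uri)

tree = ET.parse('part.xml')
for c in tree.getroot():
    # same ordering key as _sort_xml_attributes above
    c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
tree.write('part.xml', xml_declaration=True)
```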
@@ -18,6 +18,8 @@ def __load_all_parsers():
            continue
        elif fname.endswith('__init__.py'):
            continue
        elif fname.endswith('exiftool.py'):
            continue
        basename = os.path.basename(fname)
        name, _ = os.path.splitext(basename)
        importlib.import_module('.' + name, package='libmat2')
@@ -33,10 +35,11 @@ def _get_parsers() -> List[T]:


def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
    """ Return the appropriate parser for a given filename. """
    mtype, _ = mimetypes.guess_type(filename)

    _, extension = os.path.splitext(filename)
    if extension in UNSUPPORTED_EXTENSIONS:
    if extension.lower() in UNSUPPORTED_EXTENSIONS:
        return None, mtype

    for parser_class in _get_parsers():  # type: ignore
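The lowercasing matters because extension comparison is otherwise case-sensitive. A quick illustration, where the extension set below is a small stand-in for UNSUPPORTED_EXTENSIONS:

```python
import os

UNSUPPORTED_EXTENSIONS = {'.asc', '.bat', '.exe'}  # illustrative subset

_, extension = os.path.splitext('payload.EXE')
print(extension in UNSUPPORTED_EXTENSIONS)          # False: '.EXE' != '.exe'
print(extension.lower() in UNSUPPORTED_EXTENSIONS)  # True
```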
@@ -7,6 +7,7 @@ import re
import logging
import tempfile
import io
from typing import Dict, Union
from distutils.version import LooseVersion

import cairo
@@ -16,10 +17,8 @@ from gi.repository import Poppler, GLib

from . import abstract

logging.basicConfig(level=logging.ERROR)

poppler_version = Poppler.get_version()
if LooseVersion(poppler_version) < LooseVersion('0.46'): # pragma: no cover
if LooseVersion(poppler_version) < LooseVersion('0.46'):  # pragma: no cover
    raise ValueError("MAT2 needs at least Poppler version 0.46 to work. \
The installed version is %s." % poppler_version)  # pragma: no cover

@@ -39,7 +38,12 @@ class PDFParser(abstract.AbstractParser):
        except GLib.GError:  # Invalid PDF
            raise ValueError

    def remove_all_lightweight(self):
    def remove_all(self) -> bool:
        if self.lightweight_cleaning is True:
            return self.__remove_all_lightweight()
        return self.__remove_all_thorough()
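This dispatch means callers only ever see remove_all() and toggle a flag instead of picking a method. A hedged usage sketch, with an illustrative path and assuming lightweight_cleaning defaults to False:

```python
from libmat2 import pdf

p = pdf.PDFParser('./document.pdf')  # path is illustrative
p.lightweight_cleaning = True        # assumed to default to False
p.remove_all()                       # dispatches to the lightweight variant
```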
    def __remove_all_lightweight(self) -> bool:
        """
        Load the document into Poppler, render pages on a new PDFSurface.
        """
@@ -66,7 +70,7 @@ class PDFParser(abstract.AbstractParser):

        return True

    def remove_all(self):
    def __remove_all_thorough(self) -> bool:
        """
        Load the document into Poppler, render pages on PNG,
        and shove those PNG into a new PDF.
@@ -120,15 +124,14 @@ class PDFParser(abstract.AbstractParser):
        document.save('file://' + os.path.abspath(out_file))
        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> dict:
    def __parse_metadata_field(data: str) -> Dict[str, str]:
        metadata = {}
        for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
            metadata[key] = value
        return metadata
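To see what the XMP regex extracts, here is a tiny self-contained run on made-up packet data; the backreferences force the closing tag to match the opening prefix and key:

```python
import re

data = '<xmp:CreatorTool>LibreOffice</xmp:CreatorTool><pdf:Producer>pdfTeX</pdf:Producer>'
metadata = {}
for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
    metadata[key] = value
print(metadata)  # {'CreatorTool': 'LibreOffice', 'Producer': 'pdfTeX'}
```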

    def get_meta(self):
    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """ Return a dict with all the meta of the file
        """
        metadata = {}
@@ -3,9 +3,6 @@ from typing import Union, Tuple, Dict

from . import abstract

logging.basicConfig(level=logging.ERROR)


class TorrentParser(abstract.AbstractParser):
    mimetypes = {'application/x-bittorrent', }
    whitelist = {b'announce', b'announce-list', b'info'}
@@ -17,14 +14,13 @@ class TorrentParser(abstract.AbstractParser):
        if self.dict_repr is None:
            raise ValueError

    def get_meta(self) -> Dict[str, str]:
    def get_meta(self) -> Dict[str, Union[str, dict]]:
        metadata = {}
        for key, value in self.dict_repr.items():
            if key not in self.whitelist:
                metadata[key.decode('utf-8')] = value
        return metadata

    def remove_all(self) -> bool:
        cleaned = dict()
        for key, value in self.dict_repr.items():
@@ -123,9 +119,9 @@ class _BencodeHandler(object):
        try:
            ret, trail = self.__decode_func[s[0]](s)
        except (IndexError, KeyError, ValueError) as e:
            logging.debug("Not a valid bencoded string: %s", e)
            logging.warning("Not a valid bencoded string: %s", e)
            return None
        if trail != b'':
            logging.debug("Invalid bencoded value (data after valid prefix)")
            logging.warning("Invalid bencoded value (data after valid prefix)")
            return None
        return ret
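A tiny illustration of the whitelist logic above, working on the bencode-level keys as bytes; the values are made up:

```python
whitelist = {b'announce', b'announce-list', b'info'}

dict_repr = {
    b'announce': b'http://tracker.example/announce',  # made-up value
    b'creation date': 1234567890,
    b'created by': b'SomeClient/1.0',
}

# get_meta() reports everything outside the whitelist...
metadata = {k.decode('utf-8'): v for k, v in dict_repr.items() if k not in whitelist}
print(metadata)  # {'creation date': 1234567890, 'created by': b'SomeClient/1.0'}

# ...and remove_all() keeps only the whitelisted keys.
cleaned = {k: v for k, v in dict_repr.items() if k in whitelist}
print(cleaned)   # {b'announce': b'http://tracker.example/announce'}
```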
libmat2/video.py (new file)
@@ -0,0 +1,111 @@
import os
import subprocess
import logging

from typing import Dict, Union

from . import exiftool


class AbstractFFmpegParser(exiftool.ExiftoolParser):
    """ Abstract parser for all FFmpeg-based ones, mainly for video. """
    def remove_all(self) -> bool:
        cmd = [_get_ffmpeg_path(),
               '-i', self.filename,      # input file
               '-y',                     # overwrite existing output file
               '-map', '0',              # copy everything: all streams from input to output
               '-codec', 'copy',         # don't decode anything, just copy (speed!)
               '-loglevel', 'panic',     # Don't show log
               '-hide_banner',           # hide the banner
               '-map_metadata', '-1',    # remove superficial metadata
               '-map_chapters', '-1',    # remove chapters
               '-disposition', '0',      # Remove dispositions (check ffmpeg's manpage)
               '-fflags', '+bitexact',   # don't add any metadata
               '-flags:v', '+bitexact',  # don't add any metadata
               '-flags:a', '+bitexact',  # don't add any metadata
               self.output_filename]
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError as e:
            logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
            return False
        return True

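Usage is the same as for any other mat2 parser. A hedged sketch, with an illustrative file name and assuming ffmpeg is available at /usr/bin/ffmpeg as the helper below requires:

```python
from libmat2 import video

p = video.AVIParser('./tests/data/dirty.avi')  # path is illustrative
print(p.get_meta())   # exiftool-based, filtered through meta_whitelist
if p.remove_all():    # re-muxes via ffmpeg with -map_metadata -1
    print('written to', p.output_filename)
```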
class AVIParser(AbstractFFmpegParser):
    mimetypes = {'video/x-msvideo', }
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
                      'FileSize', 'FileModifyDate', 'FileAccessDate',
                      'FileInodeChangeDate', 'FilePermissions', 'FileType',
                      'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate',
                      'FrameCount', 'StreamCount', 'StreamType', 'VideoCodec',
                      'VideoFrameRate', 'VideoFrameCount', 'Quality',
                      'SampleSize', 'BMPVersion', 'ImageWidth', 'ImageHeight',
                      'Planes', 'BitDepth', 'Compression', 'ImageLength',
                      'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors',
                      'NumImportantColors', 'NumColors', 'NumImportantColors',
                      'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask',
                      'ColorSpace', 'AudioCodec', 'AudioCodecRate',
                      'AudioSampleCount', 'AudioSampleCount',
                      'AudioSampleRate', 'Encoding', 'NumChannels',
                      'SampleRate', 'AvgBytesPerSec', 'BitsPerSample',
                      'Duration', 'ImageSize', 'Megapixels'}

class MP4Parser(AbstractFFmpegParser):
    mimetypes = {'video/mp4', }
    meta_whitelist = {'AudioFormat', 'AvgBitrate', 'Balance', 'TrackDuration',
                      'XResolution', 'YResolution', 'ExifToolVersion',
                      'FileAccessDate', 'FileInodeChangeDate', 'FileModifyDate',
                      'FileName', 'FilePermissions', 'MIMEType', 'FileType',
                      'FileTypeExtension', 'Directory', 'ImageWidth',
                      'ImageSize', 'ImageHeight', 'FileSize', 'SourceFile',
                      'BitDepth', 'Duration', 'AudioChannels',
                      'AudioBitsPerSample', 'AudioSampleRate', 'Megapixels',
                      'MovieDataSize', 'VideoFrameRate', 'MediaTimeScale',
                      'SourceImageHeight', 'SourceImageWidth',
                      'MatrixStructure', 'MediaDuration'}
    meta_key_value_whitelist = {  # some metadata are mandatory :/
        'CreateDate': '0000:00:00 00:00:00',
        'CurrentTime': '0 s',
        'MediaCreateDate': '0000:00:00 00:00:00',
        'MediaLanguageCode': 'und',
        'MediaModifyDate': '0000:00:00 00:00:00',
        'ModifyDate': '0000:00:00 00:00:00',
        'OpColor': '0 0 0',
        'PosterTime': '0 s',
        'PreferredRate': '1',
        'PreferredVolume': '100.00%',
        'PreviewDuration': '0 s',
        'PreviewTime': '0 s',
        'SelectionDuration': '0 s',
        'SelectionTime': '0 s',
        'TrackCreateDate': '0000:00:00 00:00:00',
        'TrackModifyDate': '0000:00:00 00:00:00',
        'TrackVolume': '0.00%',
    }

    def remove_all(self) -> bool:
        logging.warning('The format of "%s" (video/mp4) has some mandatory '
                        'metadata fields; mat2 filled them with standard data.',
                        self.filename)
        return super().remove_all()

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        meta = super().get_meta()

        ret = dict()  # type: Dict[str, Union[str, dict]]
        for key, value in meta.items():
            if key in self.meta_key_value_whitelist.keys():
                if value == self.meta_key_value_whitelist[key]:
                    continue
            ret[key] = value
        return ret

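The effect of meta_key_value_whitelist is easiest to see in isolation. A minimal sketch with made-up exiftool output: a key is hidden only when its value matches the expected mandatory placeholder exactly.

```python
meta_key_value_whitelist = {
    'CreateDate': '0000:00:00 00:00:00',
    'PreferredRate': '1',
}

meta = {
    'CreateDate': '0000:00:00 00:00:00',  # expected placeholder: hidden
    'PreferredRate': '2',                 # unexpected value: reported
    'Encoder': 'HandBrake 0.9.4',         # not whitelisted: reported
}

ret = {}
for key, value in meta.items():
    if meta_key_value_whitelist.get(key) == value:
        continue  # mandatory placeholder value, not worth reporting
    ret[key] = value
print(ret)  # {'PreferredRate': '2', 'Encoder': 'HandBrake 0.9.4'}
```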
def _get_ffmpeg_path() -> str:  # pragma: no cover
    ffmpeg_path = '/usr/bin/ffmpeg'
    if os.path.isfile(ffmpeg_path):
        if os.access(ffmpeg_path, os.X_OK):
            return ffmpeg_path

    raise RuntimeError("Unable to find ffmpeg")
mat2
@@ -1,20 +1,28 @@
#!/usr/bin/python3
#!/usr/bin/env python3

import os
from typing import Tuple
from typing import Tuple, Generator, List, Union
import sys
import itertools
import mimetypes
import argparse
import multiprocessing
import logging
import unicodedata

try:
    from libmat2 import parser_factory, UNSUPPORTED_EXTENSIONS, check_dependencies
    from libmat2 import parser_factory, UNSUPPORTED_EXTENSIONS
    from libmat2 import check_dependencies, UnknownMemberPolicy
except ValueError as e:
    print(e)
    sys.exit(1)

__version__ = '0.3.0'
__version__ = '0.6.0'

# Make pyflakes happy
assert Tuple
assert Union

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)


def __check_file(filename: str, mode: int=os.R_OK) -> bool:
    if not os.path.exists(filename):
@@ -29,15 +37,20 @@ def __check_file(filename: str, mode: int=os.R_OK) -> bool:
    return True

def create_arg_parser():
def create_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2')
    parser.add_argument('files', nargs='*', help='the files to process')
    parser.add_argument('-v', '--version', action='version',
                        version='MAT2 %s' % __version__)
    parser.add_argument('-l', '--list', action='store_true',
                        help='list all supported fileformats')
    parser.add_argument('-c', '--check-dependencies', action='store_true',
    parser.add_argument('--check-dependencies', action='store_true',
                        help='check if MAT2 has all the dependencies it needs')
    parser.add_argument('-V', '--verbose', action='store_true',
                        help='show more verbose status information')
    parser.add_argument('--unknown-members', metavar='policy', default='abort',
                        help='how to handle unknown members of archive-style files (policy should' +
                        ' be one of: %s)' % ', '.join(p.value for p in UnknownMemberPolicy))

    info = parser.add_mutually_exclusive_group()
@@ -56,16 +69,37 @@ def show_meta(filename: str):
    if p is None:
        print("[-] %s's format (%s) is not supported" % (filename, mtype))
        return
    __print_meta(filename, p.get_meta())


def __print_meta(filename: str, metadata: dict, depth: int=1):
    padding = " " * depth*2
    if not metadata:
        print(padding + "No metadata found")
        return

    print("[%s] Metadata for %s:" % ('+'*depth, filename))

    for (k, v) in sorted(metadata.items()):
        if isinstance(v, dict):
            __print_meta(k, v, depth+1)
            continue

        # Remove control characters
        # We might use 'Cc' instead of 'C', but better safe than sorry
        # https://www.unicode.org/reports/tr44/#GC_Values_Table
        try:
            v = ''.join(ch for ch in v if not unicodedata.category(ch).startswith('C'))
        except TypeError:
            pass  # for things that aren't iterable

    print("[+] Metadata for %s:" % filename)
    for k, v in p.get_meta().items():
        try:  # FIXME this is ugly.
            print(" %s: %s" % (k, v))
            print(padding + " %s: %s" % (k, v))
        except UnicodeEncodeError:
            print(" %s: harmful content" % k)
            print(padding + " %s: harmful content" % k)
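Here is the control-character filter in isolation; the sample value is made up, but the category test is the same one used above. Anything in a Unicode "C" (Other) category, including the Cc control characters, is dropped:

```python
import unicodedata

v = 'GQ\x07\x08\n fake console output'  # made-up malicious comment
clean = ''.join(ch for ch in v if not unicodedata.category(ch).startswith('C'))
print(repr(clean))  # 'GQ fake console output': \x07, \x08 and \n are category Cc
```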
def clean_meta(params: Tuple[str, bool]) -> bool:
    filename, is_lightweigth = params

def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) -> bool:
    if not __check_file(filename, os.R_OK|os.W_OK):
        return False

@@ -73,29 +107,36 @@ def clean_meta(params: Tuple[str, bool]) -> bool:
    if p is None:
        print("[-] %s's format (%s) is not supported" % (filename, mtype))
        return False
    if is_lightweigth:
        return p.remove_all_lightweight()
    return p.remove_all()
    p.unknown_member_policy = policy
    p.lightweight_cleaning = is_lightweight

    try:
        return p.remove_all()
    except RuntimeError as e:
        print("[-] %s can't be cleaned: %s" % (filename, e))
        return False

def show_parsers():
def show_parsers() -> bool:
    print('[+] Supported formats:')
    formats = list()
    for parser in parser_factory._get_parsers():
    formats = set()  # Set[str]
    for parser in parser_factory._get_parsers():  # type: ignore
        for mtype in parser.mimetypes:
            extensions = set()
            extensions = set()  # Set[str]
            for extension in mimetypes.guess_all_extensions(mtype):
                if extension[1:] not in UNSUPPORTED_EXTENSIONS:  # skip the dot
                if extension not in UNSUPPORTED_EXTENSIONS:
                    extensions.add(extension)
            if not extensions:
                # we're not supporting a single extension in the current
                # mimetype, so there is no point in showing the mimetype at all
                continue
            formats.append(' - %s (%s)' % (mtype, ', '.join(extensions)))
            formats.add(' - %s (%s)' % (mtype, ', '.join(extensions)))
    print('\n'.join(sorted(formats)))
    return True

def __get_files_recursively(files):
def __get_files_recursively(files: List[str]) -> Generator[str, None, None]:
    for f in files:
        if os.path.isdir(f):
            for path, _, _files in os.walk(f):
@@ -106,19 +147,22 @@ def __get_files_recursively(files):
        elif __check_file(f):
            yield f

def main():
def main() -> int:
    arg_parser = create_arg_parser()
    args = arg_parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    if not args.files:
        if args.list:
            show_parsers()
            return show_parsers()
        elif args.check_dependencies:
            print("Dependencies required for MAT2 %s:" % __version__)
            for key, value in sorted(check_dependencies().items()):
                print('- %s: %s' % (key, 'yes' if value else 'no'))
        else:
            return arg_parser.print_help()
            arg_parser.print_help()
        return 0

    elif args.show:
@@ -127,12 +171,16 @@ def main():
        return 0

    else:
        p = multiprocessing.Pool()
        mode = (args.lightweight is True)
        l = zip(__get_files_recursively(args.files), itertools.repeat(mode))
        policy = UnknownMemberPolicy(args.unknown_members)
        if policy == UnknownMemberPolicy.KEEP:
            logging.warning('Keeping unknown member files may leak metadata in the resulting file!')

        no_failure = True
        for f in __get_files_recursively(args.files):
            if clean_meta(f, args.lightweight, policy) is False:
                no_failure = False
        return 0 if no_failure is True else -1

        ret = list(p.imap_unordered(clean_meta, list(l)))
        return 0 if all(ret) else -1

if __name__ == '__main__':
    sys.exit(main())
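Putting the new pieces together, a hedged sketch of what a programmatic caller of clean_meta() might look like; the path is illustrative, and it assumes the enum exposes an 'omit' member alongside the 'abort' default and 'keep' shown above:

```python
from libmat2 import UnknownMemberPolicy

# 'abort' is the default policy; 'omit' is assumed here for illustration.
policy = UnknownMemberPolicy('omit')
success = clean_meta('./document.docx', is_lightweight=False, policy=policy)
print('cleaned' if success else 'failed')
```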
Binary file not shown.
@@ -14,7 +14,7 @@ thread, so we'll have to resort to using a `queue` to pass "messages" around.

import queue
import threading
from typing import Tuple
from typing import Tuple, Optional, List
from urllib.parse import unquote

import gi
@@ -25,10 +25,8 @@ from gi.repository import Nautilus, GObject, Gtk, Gio, GLib, GdkPixbuf

from libmat2 import parser_factory

# make pyflakes happy
assert Tuple

def _remove_metadata(fpath):
def _remove_metadata(fpath) -> Tuple[bool, Optional[str]]:
    """ This is a simple wrapper around libmat2, because it's
    easier and cleaner this way.
    """
@@ -63,7 +61,7 @@ class ColumnExtension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationW
        self.infobar.get_content_area().pack_start(self.infobar_hbox, True, True, 0)
        self.infobar.show_all()

    def get_widget(self, uri, window):
    def get_widget(self, uri, window) -> Gtk.Widget:
        """ This is the method that we have to implement (because we're
        a LocationWidgetProvider) in order to show our infobar.
        """
@@ -104,7 +102,6 @@ class ColumnExtension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationW
        box.add(self.__create_treeview())
        window.show_all()

    @staticmethod
    def __validate(fileinfo) -> Tuple[bool, str]:
        """ Validate if a given file FileInfo `fileinfo` can be processed.
@@ -115,7 +112,6 @@ class ColumnExtension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationW
            return False, "Not writeable"
        return True, ""

    def __create_treeview(self) -> Gtk.TreeView:
        liststore = Gtk.ListStore(GdkPixbuf.Pixbuf, str, str)
        treeview = Gtk.TreeView(model=liststore)
@@ -148,7 +144,6 @@ class ColumnExtension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationW
        treeview.show_all()
        return treeview

    def __create_progressbar(self) -> Gtk.ProgressBar:
        """ Create the progressbar used to notify that files are currently
        being processed.
@@ -211,7 +206,6 @@ class ColumnExtension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationW
        processing_queue.put(None)  # signal that we processed all the files
        return True

    def __cb_menu_activate(self, menu, files):
        """ This method is called when the user clicked the "clean metadata"
        menu item.
@@ -228,12 +222,11 @@ class ColumnExtension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationW
        thread.daemon = True
        thread.start()

    def get_background_items(self, window, file):
        """ https://bugzilla.gnome.org/show_bug.cgi?id=784278 """
        return None

    def get_file_items(self, window, files):
    def get_file_items(self, window, files) -> Optional[List[Nautilus.MenuItem]]:
        """ This method is the one allowing us to create a menu item.
        """
        # Do not show the menu item if not a single file has a chance to be
setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
|
||||
|
||||
setuptools.setup(
|
||||
name="mat2",
|
||||
version='0.3.0',
|
||||
version='0.6.0',
|
||||
author="Julien (jvoisin) Voisin",
|
||||
author_email="julien.voisin+mat2@dustri.org",
|
||||
description="A handy tool to trash your metadata",
|
||||
@@ -20,7 +20,7 @@ setuptools.setup(
|
||||
'pycairo',
|
||||
],
|
||||
packages=setuptools.find_packages(exclude=('tests', )),
|
||||
classifiers=(
|
||||
classifiers=[
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Environment :: Console",
|
||||
"License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
|
||||
@@ -28,7 +28,7 @@ setuptools.setup(
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Topic :: Security",
|
||||
"Intended Audience :: End Users/Desktop",
|
||||
),
|
||||
],
|
||||
project_urls={
|
||||
'bugtacker': 'https://0xacab.org/jvoisin/mat2/issues',
|
||||
},
|
||||
|
BIN tests/data/broken_xml_content_types.docx (new file; binary file not shown)
BIN tests/data/control_chars.jpg (new file; binary image, 1.9 KiB)
BIN tests/data/dirty.avi (new file; binary file not shown)
BIN tests/data/dirty.mp4 (new file; binary file not shown)
BIN tests/data/malformed_content_types.docx (new file; binary file not shown)
BIN tests/data/no_content_types.docx (new file; binary file not shown)
BIN tests/data/office_revision_session_ids.docx (new file; binary file not shown)
@@ -4,87 +4,102 @@ import subprocess
import unittest


mat2_binary = ['./mat2']

if 'MAT2_GLOBAL_PATH_TESTSUITE' in os.environ:
    # Debian runs tests after installing the package
    # https://0xacab.org/jvoisin/mat2/issues/16#note_153878
    mat2_binary = ['/usr/bin/env', 'mat2']


class TestHelp(unittest.TestCase):
    def test_help(self):
        proc = subprocess.Popen(['./mat2', '--help'], stdout=subprocess.PIPE)
        proc = subprocess.Popen(mat2_binary + ['--help'], stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'usage: mat2 [-h] [-v] [-l] [-c] [-s | -L] [files [files ...]]', stdout)
        self.assertIn(b'usage: mat2 [-h] [-v] [-l] [--check-dependencies] [-V]',
                      stdout)
        self.assertIn(b'[--unknown-members policy] [-s | -L]', stdout)

    def test_no_arg(self):
        proc = subprocess.Popen(['./mat2'], stdout=subprocess.PIPE)
        proc = subprocess.Popen(mat2_binary, stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'usage: mat2 [-h] [-v] [-l] [-c] [-s | -L] [files [files ...]]', stdout)
        self.assertIn(b'usage: mat2 [-h] [-v] [-l] [--check-dependencies] [-V]',
                      stdout)
        self.assertIn(b'[--unknown-members policy] [-s | -L]', stdout)


class TestVersion(unittest.TestCase):
    def test_version(self):
        proc = subprocess.Popen(['./mat2', '--version'], stdout=subprocess.PIPE)
        proc = subprocess.Popen(mat2_binary + ['--version'], stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertTrue(stdout.startswith(b'MAT2 '))

class TestDependencies(unittest.TestCase):
    def test_dependencies(self):
        proc = subprocess.Popen(['./mat2', '--check-dependencies'], stdout=subprocess.PIPE)
        proc = subprocess.Popen(mat2_binary + ['--check-dependencies'], stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertTrue(b'MAT2' in stdout)

class TestReturnValue(unittest.TestCase):
    def test_nonzero(self):
        ret = subprocess.call(['./mat2', './mat2'], stdout=subprocess.DEVNULL)
        ret = subprocess.call(mat2_binary + ['mat2'], stdout=subprocess.DEVNULL)
        self.assertEqual(255, ret)

        ret = subprocess.call(['./mat2', '--whololo'], stderr=subprocess.DEVNULL)
        ret = subprocess.call(mat2_binary + ['--whololo'], stderr=subprocess.DEVNULL)
        self.assertEqual(2, ret)

    def test_zero(self):
        ret = subprocess.call(['./mat2'], stdout=subprocess.DEVNULL)
        ret = subprocess.call(mat2_binary, stdout=subprocess.DEVNULL)
        self.assertEqual(0, ret)

        ret = subprocess.call(['./mat2', '--show', './mat2'], stdout=subprocess.DEVNULL)
        ret = subprocess.call(mat2_binary + ['--show', 'mat2'], stdout=subprocess.DEVNULL)
        self.assertEqual(0, ret)


class TestCleanFolder(unittest.TestCase):
    def test_jpg(self):
        os.mkdir('./tests/data/folder/')
        try:
            os.mkdir('./tests/data/folder/')
        except FileExistsError:
            pass
        shutil.copy('./tests/data/dirty.jpg', './tests/data/folder/clean1.jpg')
        shutil.copy('./tests/data/dirty.jpg', './tests/data/folder/clean2.jpg')

        proc = subprocess.Popen(['./mat2', '--show', './tests/data/folder/'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/folder/'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'Comment: Created with GIMP', stdout)

        proc = subprocess.Popen(['./mat2', './tests/data/folder/'],
        proc = subprocess.Popen(mat2_binary + ['./tests/data/folder/'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()

        os.remove('./tests/data/folder/clean1.jpg')
        os.remove('./tests/data/folder/clean2.jpg')

        proc = subprocess.Popen(['./mat2', '--show', './tests/data/folder/'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/folder/'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertNotIn(b'Comment: Created with GIMP', stdout)
        self.assertIn(b'No metadata found', stdout)

        shutil.rmtree('./tests/data/folder/')


class TestCleanMeta(unittest.TestCase):
    def test_jpg(self):
        shutil.copy('./tests/data/dirty.jpg', './tests/data/clean.jpg')

        proc = subprocess.Popen(['./mat2', '--show', './tests/data/clean.jpg'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/clean.jpg'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'Comment: Created with GIMP', stdout)

        proc = subprocess.Popen(['./mat2', './tests/data/clean.jpg'],
        proc = subprocess.Popen(mat2_binary + ['./tests/data/clean.jpg'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()

        proc = subprocess.Popen(['./mat2', '--show', './tests/data/clean.cleaned.jpg'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/clean.cleaned.jpg'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertNotIn(b'Comment: Created with GIMP', stdout)
@@ -94,32 +109,34 @@ class TestCleanMeta(unittest.TestCase):

class TestIsSupported(unittest.TestCase):
    def test_pdf(self):
        proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.pdf'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.pdf'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertNotIn(b"isn't supported", stdout)

class TestGetMeta(unittest.TestCase):
    maxDiff = None

    def test_pdf(self):
        proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.pdf'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.pdf'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'producer: pdfTeX-1.40.14', stdout)
        self.assertIn(b'Producer: pdfTeX-1.40.14', stdout)

    def test_png(self):
        proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.png'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.png'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'Comment: This is a comment, be careful!', stdout)

    def test_jpg(self):
        proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.jpg'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.jpg'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'Comment: Created with GIMP', stdout)

    def test_docx(self):
        proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.docx'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.docx'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'Application: LibreOffice/5.4.5.1$Linux_X86_64', stdout)
@@ -127,7 +144,7 @@ class TestGetMeta(unittest.TestCase):
        self.assertIn(b'revision: 1', stdout)

    def test_odt(self):
        proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.odt'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.odt'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'generator: LibreOffice/3.3$Unix', stdout)
@@ -135,25 +152,32 @@ class TestGetMeta(unittest.TestCase):
        self.assertIn(b'date_time: 2011-07-26 02:40:16', stdout)

    def test_mp3(self):
        proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.mp3'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.mp3'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'TALB: harmfull', stdout)
        self.assertIn(b'COMM::: Thank you for using MAT !', stdout)

    def test_flac(self):
        proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.flac'],
                                stdout=subprocess.PIPE)
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.flac'],
                                stdout=subprocess.PIPE, bufsize=0)
        stdout, _ = proc.communicate()
        self.assertIn(b'comments: Thank you for using MAT !', stdout)
        self.assertIn(b'genre: Python', stdout)
        self.assertIn(b'title: I am so', stdout)

    def test_ogg(self):
        proc = subprocess.Popen(['./mat2', '--show', './tests/data/dirty.ogg'],
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.ogg'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'comments: Thank you for using MAT !', stdout)
        self.assertIn(b'genre: Python', stdout)
        self.assertIn(b'i am a : various comment', stdout)
        self.assertIn(b'artist: jvoisin', stdout)

class TestControlCharInjection(unittest.TestCase):
    def test_jpg(self):
        proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/control_chars.jpg'],
                                stdout=subprocess.PIPE)
        stdout, _ = proc.communicate()
        self.assertIn(b'Comment: GQ\n', stdout)
@@ -1,10 +1,18 @@
#!/usr/bin/python3
#!/usr/bin/env python3

import unittest
import shutil
import os
import logging
import zipfile

from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video

# No need to log messages: should something go wrong,
# the testsuite _will_ fail.
logger = logging.getLogger()
logger.setLevel(logging.FATAL)


class TestInexistentFiles(unittest.TestCase):
@@ -53,16 +61,21 @@ class TestUnsupportedFiles(unittest.TestCase):
class TestCorruptedEmbedded(unittest.TestCase):
    def test_docx(self):
        shutil.copy('./tests/data/embedded_corrupted.docx', './tests/data/clean.docx')
        parser, mimetype = parser_factory.get_parser('./tests/data/clean.docx')
        parser, _ = parser_factory.get_parser('./tests/data/clean.docx')
        self.assertFalse(parser.remove_all())
        self.assertIsNotNone(parser.get_meta())
        os.remove('./tests/data/clean.docx')

    def test_odt(self):
        expected = {
            'create_system': 'Weird',
            'date_time': '2018-06-10 17:18:18',
            'meta.xml': 'harmful content'
        }
        shutil.copy('./tests/data/embedded_corrupted.odt', './tests/data/clean.odt')
        parser, mimetype = parser_factory.get_parser('./tests/data/clean.odt')
        parser, _ = parser_factory.get_parser('./tests/data/clean.odt')
        self.assertFalse(parser.remove_all())
        self.assertEqual(parser.get_meta(), {'create_system': 'Weird', 'date_time': '2018-06-10 17:18:18', 'meta.xml': 'harmful content'})
        self.assertEqual(parser.get_meta(), expected)
        os.remove('./tests/data/clean.odt')


@@ -75,6 +88,26 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
        os.remove('./tests/data/clean.py')


class TestWrongContentTypesFileOffice(unittest.TestCase):
    def test_office_incomplete(self):
        shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')
        self.assertIsNotNone(p)
        self.assertFalse(p.remove_all())
        os.remove('./tests/data/clean.docx')

    def test_office_broken(self):
        shutil.copy('./tests/data/broken_xml_content_types.docx', './tests/data/clean.docx')
        with self.assertRaises(ValueError):
            office.MSOfficeParser('./tests/data/clean.docx')
        os.remove('./tests/data/clean.docx')

    def test_office_absent(self):
        shutil.copy('./tests/data/no_content_types.docx', './tests/data/clean.docx')
        with self.assertRaises(ValueError):
            office.MSOfficeParser('./tests/data/clean.docx')
        os.remove('./tests/data/clean.docx')

class TestCorruptedFiles(unittest.TestCase):
    def test_pdf(self):
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
@@ -90,7 +123,7 @@ class TestCorruptedFiles(unittest.TestCase):

    def test_png2(self):
        shutil.copy('./tests/test_libmat2.py', './tests/clean.png')
        parser, mimetype = parser_factory.get_parser('./tests/clean.png')
        parser, _ = parser_factory.get_parser('./tests/clean.png')
        self.assertIsNone(parser)
        os.remove('./tests/clean.png')

@@ -134,25 +167,26 @@ class TestCorruptedFiles(unittest.TestCase):

    def test_bmp(self):
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.bmp')
        harmless.HarmlessParser('./tests/data/clean.bmp')
        ret = harmless.HarmlessParser('./tests/data/clean.bmp')
        self.assertIsNotNone(ret)
        os.remove('./tests/data/clean.bmp')

    def test_docx(self):
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.docx')
        with self.assertRaises(ValueError):
            office.MSOfficeParser('./tests/data/clean.docx')
            office.MSOfficeParser('./tests/data/clean.docx')
        os.remove('./tests/data/clean.docx')

    def test_flac(self):
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.flac')
        with self.assertRaises(ValueError):
            audio.FLACParser('./tests/data/clean.flac')
            audio.FLACParser('./tests/data/clean.flac')
        os.remove('./tests/data/clean.flac')

    def test_mp3(self):
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.mp3')
        with self.assertRaises(ValueError):
            audio.MP3Parser('./tests/data/clean.mp3')
            audio.MP3Parser('./tests/data/clean.mp3')
        os.remove('./tests/data/clean.mp3')

    def test_jpg(self):
@@ -160,3 +194,46 @@ class TestCorruptedFiles(unittest.TestCase):
        with self.assertRaises(ValueError):
            images.JPGParser('./tests/data/clean.jpg')
        os.remove('./tests/data/clean.jpg')

    def test_png_lightweight(self):
        return
        shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.png')
        p = images.PNGParser('./tests/data/clean.png')
        self.assertTrue(p.remove_all())
        os.remove('./tests/data/clean.png')

    def test_avi(self):
        try:
            video._get_ffmpeg_path()
        except RuntimeError:
            raise unittest.SkipTest

        shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.avi')
        p = video.AVIParser('./tests/data/clean.avi')
        self.assertFalse(p.remove_all())
        os.remove('./tests/data/clean.avi')

    def test_avi_injection(self):
        try:
            video._get_ffmpeg_path()
        except RuntimeError:
            raise unittest.SkipTest

        shutil.copy('./tests/data/dirty.torrent', './tests/data/--output.avi')
        p = video.AVIParser('./tests/data/--output.avi')
        self.assertFalse(p.remove_all())
        os.remove('./tests/data/--output.avi')

    def test_zip(self):
        with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
            zout.write('./tests/data/dirty.flac')
            zout.write('./tests/data/dirty.docx')
            zout.write('./tests/data/dirty.jpg')
            zout.write('./tests/data/embedded_corrupted.docx')
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip')
        self.assertEqual(mimetype, 'application/zip')
        meta = p.get_meta()
        self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
        self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
        self.assertFalse(p.remove_all())
        os.remove('./tests/data/dirty.zip')
tests/test_deep_cleaning.py (new file)
@@ -0,0 +1,135 @@
#!/usr/bin/env python3

import unittest
import shutil
import os
import zipfile
import tempfile

from libmat2 import office, parser_factory

class TestZipMetadata(unittest.TestCase):
    def __check_deep_meta(self, p):
        tempdir = tempfile.mkdtemp()
        zipin = zipfile.ZipFile(p.filename)
        zipin.extractall(tempdir)

        for subdir, dirs, files in os.walk(tempdir):
            for f in files:
                complete_path = os.path.join(subdir, f)
                inside_p, _ = parser_factory.get_parser(complete_path)
                if inside_p is None:
                    continue
                self.assertEqual(inside_p.get_meta(), {})
        shutil.rmtree(tempdir)

    def __check_zip_meta(self, p):
        zipin = zipfile.ZipFile(p.filename)
        for item in zipin.infolist():
            self.assertEqual(item.comment, b'')
            self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0))
            self.assertEqual(item.create_system, 3)  # 3 is UNIX

    def test_office(self):
        shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')

        meta = p.get_meta()
        self.assertIsNotNone(meta)
        self.assertEqual(meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

        ret = p.remove_all()
        self.assertTrue(ret)

        p = office.MSOfficeParser('./tests/data/clean.cleaned.docx')
        self.assertEqual(p.get_meta(), {})

        self.__check_zip_meta(p)
        self.__check_deep_meta(p)

        os.remove('./tests/data/clean.docx')
        os.remove('./tests/data/clean.cleaned.docx')

    def test_libreoffice(self):
        shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt')
        p = office.LibreOfficeParser('./tests/data/clean.odt')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        ret = p.remove_all()
        self.assertTrue(ret)

        p = office.LibreOfficeParser('./tests/data/clean.cleaned.odt')
        self.assertEqual(p.get_meta(), {})

        self.__check_zip_meta(p)
        self.__check_deep_meta(p)

        os.remove('./tests/data/clean.odt')
        os.remove('./tests/data/clean.cleaned.odt')


class TestZipOrder(unittest.TestCase):
    def test_libreoffice(self):
        shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt')
        p = office.LibreOfficeParser('./tests/data/clean.odt')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        is_unordered = False
        with zipfile.ZipFile('./tests/data/clean.odt') as zin:
            previous_name = ''
            for item in zin.infolist():
                if previous_name == '':
                    previous_name = item.filename
                    continue
                elif item.filename < previous_name:
                    is_unordered = True
                    break
        self.assertTrue(is_unordered)

        ret = p.remove_all()
        self.assertTrue(ret)

        with zipfile.ZipFile('./tests/data/clean.cleaned.odt') as zin:
            previous_name = ''
            for item in zin.infolist():
                if previous_name == '':
                    previous_name = item.filename
                    continue
                self.assertGreaterEqual(item.filename, previous_name)

        os.remove('./tests/data/clean.odt')
        os.remove('./tests/data/clean.cleaned.odt')

class TestRsidRemoval(unittest.TestCase):
    def test_office(self):
        shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        how_many_rsid = False
        with zipfile.ZipFile('./tests/data/clean.docx') as zin:
            for item in zin.infolist():
                if not item.filename.endswith('.xml'):
                    continue
                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
                how_many_rsid += num
        self.assertEqual(how_many_rsid, 11)

        ret = p.remove_all()
        self.assertTrue(ret)

        with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
            for item in zin.infolist():
                if not item.filename.endswith('.xml'):
                    continue
                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
                self.assertEqual(num, 0)

        os.remove('./tests/data/clean.docx')
        os.remove('./tests/data/clean.cleaned.docx')
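The two private checks in this test file encode the contract of the cleaning step: every member of a cleaned archive has an empty comment, the earliest date zip can store, and a UNIX create_system. A sketch of the per-member normalization they imply, as an illustrative function rather than mat2's actual implementation:

```python
import zipfile

def normalize(zinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
    """ Illustrative: scrub the per-member zip metadata the tests check. """
    zinfo.comment = b''
    zinfo.date_time = (1980, 1, 1, 0, 0, 0)  # earliest date zip can store
    zinfo.create_system = 3                  # 3 means UNIX
    return zinfo
```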
@@ -1,19 +1,22 @@
#!/usr/bin/python3
#!/usr/bin/env python3

import unittest
import shutil
import os
import zipfile
import tempfile

from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies
from libmat2 import check_dependencies, video, archive


class TestCheckDependencies(unittest.TestCase):
    def test_deps(self):
        ret = check_dependencies()
        for key, value in ret.items():
        try:
            ret = check_dependencies()
        except RuntimeError:
            return  # this happens if not every dependency is installed

        for value in ret.values():
            self.assertTrue(value)

@@ -34,6 +37,32 @@ class TestParameterInjection(unittest.TestCase):
        self.assertEqual(meta['ModifyDate'], "2018:03:20 21:59:25")
        os.remove('-ver')

    def test_ffmpeg_injection(self):
        try:
            video._get_ffmpeg_path()
        except RuntimeError:
            raise unittest.SkipTest

        shutil.copy('./tests/data/dirty.avi', './--output')
        p = video.AVIParser('--output')
        meta = p.get_meta()
        self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')
        os.remove('--output')

    def test_ffmpeg_injection_complete_path(self):
        try:
            video._get_ffmpeg_path()
        except RuntimeError:
            raise unittest.SkipTest

        shutil.copy('./tests/data/dirty.avi', './tests/data/ --output.avi')
        p = video.AVIParser('./tests/data/ --output.avi')
        meta = p.get_meta()
        self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')
        self.assertTrue(p.remove_all())
        os.remove('./tests/data/ --output.avi')
        os.remove('./tests/data/ --output.cleaned.avi')


class TestUnsupportedEmbeddedFiles(unittest.TestCase):
    def test_odt_with_svg(self):
@@ -56,8 +85,8 @@ class TestGetMeta(unittest.TestCase):
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
        self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
        self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61")
        self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version " \
                         "3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea " \
        self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version "
                         "3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea "
                         "version 6.1.1")

    def test_torrent(self):
@@ -97,6 +126,7 @@ class TestGetMeta(unittest.TestCase):
        p = audio.FLACParser('./tests/data/dirty.flac')
        meta = p.get_meta()
        self.assertEqual(meta['title'], 'I am so')
        self.assertEqual(meta['Cover 0'], {'Comment': 'Created with GIMP'})

    def test_docx(self):
        p = office.MSOfficeParser('./tests/data/dirty.docx')
@@ -123,6 +153,18 @@ class TestGetMeta(unittest.TestCase):
        meta = p.get_meta()
        self.assertEqual(meta, {})

    def test_zip(self):
        with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
            zout.write('./tests/data/dirty.flac')
            zout.write('./tests/data/dirty.docx')
            zout.write('./tests/data/dirty.jpg')
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip')
        self.assertEqual(mimetype, 'application/zip')
        meta = p.get_meta()
        self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
        self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
        os.remove('./tests/data/dirty.zip')


class TestRemovingThumbnails(unittest.TestCase):
    def test_odt(self):
@@ -182,104 +224,6 @@ class TestRevisionsCleaning(unittest.TestCase):
        os.remove('./tests/data/revision_clean.docx')
        os.remove('./tests/data/revision_clean.cleaned.docx')

class TestDeepCleaning(unittest.TestCase):
    def __check_deep_meta(self, p):
        tempdir = tempfile.mkdtemp()
        zipin = zipfile.ZipFile(p.filename)
        zipin.extractall(tempdir)

        for subdir, dirs, files in os.walk(tempdir):
            for f in files:
                complete_path = os.path.join(subdir, f)
                inside_p, _ = parser_factory.get_parser(complete_path)
                if inside_p is None:
                    continue
                self.assertEqual(inside_p.get_meta(), {})
        shutil.rmtree(tempdir)


    def __check_zip_meta(self, p):
        zipin = zipfile.ZipFile(p.filename)
        for item in zipin.infolist():
            self.assertEqual(item.comment, b'')
            self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0))
            self.assertEqual(item.create_system, 3)  # 3 is UNIX


    def test_office(self):
        shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        ret = p.remove_all()
        self.assertTrue(ret)

        p = office.MSOfficeParser('./tests/data/clean.cleaned.docx')
        self.assertEqual(p.get_meta(), {})

        self.__check_zip_meta(p)
        self.__check_deep_meta(p)

        os.remove('./tests/data/clean.docx')
        os.remove('./tests/data/clean.cleaned.docx')


    def test_libreoffice(self):
        shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt')
        p = office.LibreOfficeParser('./tests/data/clean.odt')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        ret = p.remove_all()
        self.assertTrue(ret)

        p = office.LibreOfficeParser('./tests/data/clean.cleaned.odt')
        self.assertEqual(p.get_meta(), {})

        self.__check_zip_meta(p)
        self.__check_deep_meta(p)

        os.remove('./tests/data/clean.odt')
        os.remove('./tests/data/clean.cleaned.odt')

class TestLightWeightCleaning(unittest.TestCase):
    def test_pdf(self):
        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
        p = pdf.PDFParser('./tests/data/clean.pdf')

        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

        ret = p.remove_all_lightweight()
        self.assertTrue(ret)

        p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
        self.assertEqual(p.get_meta(), expected_meta)

        os.remove('./tests/data/clean.pdf')
        os.remove('./tests/data/clean.cleaned.pdf')

    def test_png(self):
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
        p = images.PNGParser('./tests/data/clean.png')

        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')

        ret = p.remove_all_lightweight()
        self.assertTrue(ret)

        p = images.PNGParser('./tests/data/clean.cleaned.png')
        self.assertEqual(p.get_meta(), {})

        os.remove('./tests/data/clean.png')
        os.remove('./tests/data/clean.cleaned.png')

class TestCleaning(unittest.TestCase):
    def test_pdf(self):
        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
@@ -294,9 +238,11 @@ class TestCleaning(unittest.TestCase):
|
||||
p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
|
||||
expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
|
||||
self.assertEqual(p.get_meta(), expected_meta)
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
os.remove('./tests/data/clean.pdf')
|
||||
os.remove('./tests/data/clean.cleaned.pdf')
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.pdf')
|
||||
|
||||
def test_png(self):
|
||||
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
|
||||
@@ -310,9 +256,11 @@ class TestCleaning(unittest.TestCase):
|
||||
|
||||
p = images.PNGParser('./tests/data/clean.cleaned.png')
|
||||
self.assertEqual(p.get_meta(), {})
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
os.remove('./tests/data/clean.png')
|
||||
os.remove('./tests/data/clean.cleaned.png')
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.png')
|
||||
|
||||
def test_jpg(self):
|
||||
shutil.copy('./tests/data/dirty.jpg', './tests/data/clean.jpg')
|
||||
@@ -326,9 +274,11 @@ class TestCleaning(unittest.TestCase):
|
||||
|
||||
p = images.JPGParser('./tests/data/clean.cleaned.jpg')
|
||||
self.assertEqual(p.get_meta(), {})
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
os.remove('./tests/data/clean.jpg')
|
||||
os.remove('./tests/data/clean.cleaned.jpg')
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.jpg')
|
||||
|
||||
def test_mp3(self):
|
||||
shutil.copy('./tests/data/dirty.mp3', './tests/data/clean.mp3')
|
||||
@@ -342,9 +292,11 @@ class TestCleaning(unittest.TestCase):
|
||||
|
||||
p = audio.MP3Parser('./tests/data/clean.cleaned.mp3')
|
||||
self.assertEqual(p.get_meta(), {})
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
os.remove('./tests/data/clean.mp3')
|
||||
os.remove('./tests/data/clean.cleaned.mp3')
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.mp3')
|
||||
|
||||
def test_ogg(self):
|
||||
shutil.copy('./tests/data/dirty.ogg', './tests/data/clean.ogg')
|
||||
@@ -358,9 +310,11 @@ class TestCleaning(unittest.TestCase):
|
||||
|
||||
p = audio.OGGParser('./tests/data/clean.cleaned.ogg')
|
||||
self.assertEqual(p.get_meta(), {})
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
os.remove('./tests/data/clean.ogg')
|
||||
os.remove('./tests/data/clean.cleaned.ogg')
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.ogg')
|
||||
|
||||
def test_flac(self):
|
||||
shutil.copy('./tests/data/dirty.flac', './tests/data/clean.flac')
|
||||
@@ -374,9 +328,11 @@ class TestCleaning(unittest.TestCase):
|
||||
|
||||
p = audio.FLACParser('./tests/data/clean.cleaned.flac')
|
||||
self.assertEqual(p.get_meta(), {})
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
os.remove('./tests/data/clean.flac')
|
||||
os.remove('./tests/data/clean.cleaned.flac')
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.flac')
|
||||
|
||||
def test_office(self):
|
||||
shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
|
||||
@@ -390,10 +346,11 @@ class TestCleaning(unittest.TestCase):
|
||||
|
||||
p = office.MSOfficeParser('./tests/data/clean.cleaned.docx')
|
||||
self.assertEqual(p.get_meta(), {})
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
os.remove('./tests/data/clean.docx')
|
||||
os.remove('./tests/data/clean.cleaned.docx')
|
||||
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.docx')
|
||||
|
||||
def test_libreoffice(self):
|
||||
shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt')
|
||||
@@ -407,9 +364,11 @@ class TestCleaning(unittest.TestCase):
|
||||
|
||||
p = office.LibreOfficeParser('./tests/data/clean.cleaned.odt')
|
||||
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.odt')
        os.remove('./tests/data/clean.cleaned.odt')
        os.remove('./tests/data/clean.cleaned.cleaned.odt')

    def test_tiff(self):
        shutil.copy('./tests/data/dirty.tiff', './tests/data/clean.tiff')
@@ -423,9 +382,11 @@ class TestCleaning(unittest.TestCase):

        p = images.TiffParser('./tests/data/clean.cleaned.tiff')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.tiff')
        os.remove('./tests/data/clean.cleaned.tiff')
        os.remove('./tests/data/clean.cleaned.cleaned.tiff')

    def test_bmp(self):
        shutil.copy('./tests/data/dirty.bmp', './tests/data/clean.bmp')
@@ -439,9 +400,11 @@ class TestCleaning(unittest.TestCase):

        p = harmless.HarmlessParser('./tests/data/clean.cleaned.bmp')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.bmp')
        os.remove('./tests/data/clean.cleaned.bmp')
        os.remove('./tests/data/clean.cleaned.cleaned.bmp')

    def test_torrent(self):
        shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.torrent')
@@ -455,9 +418,11 @@ class TestCleaning(unittest.TestCase):

        p = torrent.TorrentParser('./tests/data/clean.cleaned.torrent')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.torrent')
        os.remove('./tests/data/clean.cleaned.torrent')
        os.remove('./tests/data/clean.cleaned.cleaned.torrent')

    def test_odf(self):
        shutil.copy('./tests/data/dirty.odf', './tests/data/clean.odf')
@@ -471,10 +436,11 @@ class TestCleaning(unittest.TestCase):

        p = office.LibreOfficeParser('./tests/data/clean.cleaned.odf')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.odf')
        os.remove('./tests/data/clean.cleaned.odf')
        os.remove('./tests/data/clean.cleaned.cleaned.odf')

    def test_odg(self):
        shutil.copy('./tests/data/dirty.odg', './tests/data/clean.odg')
@@ -488,9 +454,11 @@ class TestCleaning(unittest.TestCase):

        p = office.LibreOfficeParser('./tests/data/clean.cleaned.odg')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.odg')
        os.remove('./tests/data/clean.cleaned.odg')
        os.remove('./tests/data/clean.cleaned.cleaned.odg')

    def test_txt(self):
        shutil.copy('./tests/data/dirty.txt', './tests/data/clean.txt')
@@ -504,6 +472,75 @@ class TestCleaning(unittest.TestCase):

        p = harmless.HarmlessParser('./tests/data/clean.cleaned.txt')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.txt')
        os.remove('./tests/data/clean.cleaned.txt')
        os.remove('./tests/data/clean.cleaned.cleaned.txt')

    def test_avi(self):
        try:
            video._get_ffmpeg_path()
        except RuntimeError:
            raise unittest.SkipTest

        shutil.copy('./tests/data/dirty.avi', './tests/data/clean.avi')
        p = video.AVIParser('./tests/data/clean.avi')

        meta = p.get_meta()
        self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')

        ret = p.remove_all()
        self.assertTrue(ret)

        p = video.AVIParser('./tests/data/clean.cleaned.avi')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.avi')
        os.remove('./tests/data/clean.cleaned.avi')
        os.remove('./tests/data/clean.cleaned.cleaned.avi')

    def test_zip(self):
        with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
            zout.write('./tests/data/dirty.flac')
            zout.write('./tests/data/dirty.docx')
            zout.write('./tests/data/dirty.jpg')
        p = archive.ZipParser('./tests/data/dirty.zip')
        meta = p.get_meta()
        self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

        ret = p.remove_all()
        self.assertTrue(ret)

        p = archive.ZipParser('./tests/data/dirty.cleaned.zip')
        self.assertEqual(p.get_meta(), {})
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/dirty.zip')
        os.remove('./tests/data/dirty.cleaned.zip')
        os.remove('./tests/data/dirty.cleaned.cleaned.zip')

    def test_mp4(self):
        try:
            video._get_ffmpeg_path()
        except RuntimeError:
            raise unittest.SkipTest

        shutil.copy('./tests/data/dirty.mp4', './tests/data/clean.mp4')
        p = video.MP4Parser('./tests/data/clean.mp4')

        meta = p.get_meta()
        self.assertEqual(meta['Encoder'], 'HandBrake 0.9.4 2009112300')

        ret = p.remove_all()
        self.assertTrue(ret)

        p = video.MP4Parser('./tests/data/clean.cleaned.mp4')
        self.assertNotIn('Encoder', p.get_meta())
        self.assertTrue(p.remove_all())

        os.remove('./tests/data/clean.mp4')
        os.remove('./tests/data/clean.cleaned.mp4')
        os.remove('./tests/data/clean.cleaned.cleaned.mp4')
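Every cleaning test above follows the same round-trip, sketched here with an illustrative PNG (the file name is hypothetical; the calls are exactly the ones the tests exercise): a parser reads the file, get_meta() lists its metadata, remove_all() writes a sanitised copy next to the input with a .cleaned. infix, and re-parsing that copy yields an empty metadata dict.

    from libmat2 import images

    p = images.PNGParser('input.png')   # hypothetical input file
    print(p.get_meta())                 # dict of metadata found in the file
    if p.remove_all():                  # writes input.cleaned.png alongside
        cleaned = images.PNGParser('input.cleaned.png')
        assert cleaned.get_meta() == {} # a fully cleaned file exposes no metadata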
106
tests/test_lightweigh_cleaning.py
Normal file
@@ -0,0 +1,106 @@
#!/usr/bin/env python3

import unittest
import shutil
import os

from libmat2 import pdf, images, torrent


class TestLightWeightCleaning(unittest.TestCase):
    def test_pdf(self):
        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
        p = pdf.PDFParser('./tests/data/clean.pdf')

        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
        self.assertEqual(p.get_meta(), expected_meta)

        os.remove('./tests/data/clean.pdf')
        os.remove('./tests/data/clean.cleaned.pdf')

    def test_png(self):
        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
        p = images.PNGParser('./tests/data/clean.png')

        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = images.PNGParser('./tests/data/clean.cleaned.png')
        self.assertEqual(p.get_meta(), {})

        p = images.PNGParser('./tests/data/clean.png')
        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        os.remove('./tests/data/clean.png')
        os.remove('./tests/data/clean.cleaned.png')

    def test_jpg(self):
        shutil.copy('./tests/data/dirty.jpg', './tests/data/clean.jpg')
        p = images.JPGParser('./tests/data/clean.jpg')

        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'Created with GIMP')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = images.JPGParser('./tests/data/clean.cleaned.jpg')
        self.assertEqual(p.get_meta(), {})

        os.remove('./tests/data/clean.jpg')
        os.remove('./tests/data/clean.cleaned.jpg')

    def test_torrent(self):
        shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.torrent')
        p = torrent.TorrentParser('./tests/data/clean.torrent')

        meta = p.get_meta()
        self.assertEqual(meta['created by'], b'mktorrent 1.0')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = torrent.TorrentParser('./tests/data/clean.cleaned.torrent')
        self.assertEqual(p.get_meta(), {})

        os.remove('./tests/data/clean.torrent')
        os.remove('./tests/data/clean.cleaned.torrent')

    def test_tiff(self):
        shutil.copy('./tests/data/dirty.tiff', './tests/data/clean.tiff')
        p = images.TiffParser('./tests/data/clean.tiff')

        meta = p.get_meta()
        self.assertEqual(meta['ImageDescription'], 'OLYMPUS DIGITAL CAMERA ')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = images.TiffParser('./tests/data/clean.cleaned.tiff')
        self.assertEqual(p.get_meta(),
                         {
                             'Orientation': 'Horizontal (normal)',
                             'ResolutionUnit': 'inches',
                             'XResolution': 72,
                             'YResolution': 72
                         })

        os.remove('./tests/data/clean.tiff')
        os.remove('./tests/data/clean.cleaned.tiff')
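The lightweight mode exercised by this new file trades cleaning depth for fidelity: it leaves more of the original file structure intact, so a handful of harmless keys can survive (the PDF test expects residual creation-date, format and mod-date entries; the TIFF test expects orientation and resolution). A minimal sketch of using it from caller code, assuming only the API shown above and a hypothetical file name:

    from libmat2 import pdf

    p = pdf.PDFParser('report.pdf')   # hypothetical input file
    p.lightweight_cleaning = True     # shallower cleaning, preserves the original rendering
    if p.remove_all():                # writes report.cleaned.pdf
        leftover = pdf.PDFParser('report.cleaned.pdf').get_meta()
        # Per the test above, expect something like:
        # {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
        print(leftover)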
31
tests/test_policy.py
Normal file
@@ -0,0 +1,31 @@
#!/usr/bin/env python3

import unittest
import shutil
import os

from libmat2 import office, UnknownMemberPolicy


class TestPolicy(unittest.TestCase):
    def test_policy_omit(self):
        shutil.copy('./tests/data/embedded.docx', './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')
        p.unknown_member_policy = UnknownMemberPolicy.OMIT
        self.assertTrue(p.remove_all())
        os.remove('./tests/data/clean.docx')
        os.remove('./tests/data/clean.cleaned.docx')

    def test_policy_keep(self):
        shutil.copy('./tests/data/embedded.docx', './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')
        p.unknown_member_policy = UnknownMemberPolicy.KEEP
        self.assertTrue(p.remove_all())
        os.remove('./tests/data/clean.docx')
        os.remove('./tests/data/clean.cleaned.docx')

    def test_policy_unknown(self):
        shutil.copy('./tests/data/embedded.docx', './tests/data/clean.docx')
        p = office.MSOfficeParser('./tests/data/clean.docx')
        with self.assertRaises(ValueError):
            p.unknown_member_policy = UnknownMemberPolicy('unknown_policy_name_totally_invalid')
        os.remove('./tests/data/clean.docx')
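UnknownMemberPolicy governs what the office parser does with archive members it has no dedicated handler for. These tests prove exactly three behaviours: OMIT drops such members, KEEP copies them through, and constructing the enum from an unrecognised string raises ValueError. A short sketch of caller-side use, with a hypothetical file name:

    from libmat2 import office, UnknownMemberPolicy

    p = office.MSOfficeParser('embedded.docx')          # hypothetical input file
    p.unknown_member_policy = UnknownMemberPolicy.OMIT  # or UnknownMemberPolicy.KEEP
    if not p.remove_all():
        print('cleaning failed')

    # Unrecognised policy names are rejected at construction time:
    # UnknownMemberPolicy('no_such_policy')  -> ValueError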