mirror of
https://0xacab.org/jvoisin/mat2
synced 2025-10-06 16:42:57 +02:00
Compare commits
185 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
235403bc11 | ||
|
102f08cd28 | ||
|
7a8ea224bc | ||
|
504efb2448 | ||
|
f07344444d | ||
|
473903b70e | ||
|
1438cf7bd4 | ||
|
e740a9559f | ||
|
2b58eece50 | ||
|
29f404bce3 | ||
|
6c966f2afa | ||
|
70d236a062 | ||
|
d61fb7f77a | ||
|
1aed4ff2a5 | ||
|
75c0a750c1 | ||
|
a47ac01eb6 | ||
|
156855ab7e | ||
|
09672a2dcc | ||
|
f2c898c92d | ||
|
f931a0ecee | ||
|
61f39c4bd0 | ||
|
1b9ce34e2c | ||
|
17e76ab6f0 | ||
|
94ef57c994 | ||
|
05d1ca5841 | ||
|
55b468ded7 | ||
|
0fcafa2edd | ||
|
7405955ab5 | ||
|
e6564509e1 | ||
|
bbd5b2817c | ||
|
73f2a87aa0 | ||
|
abcdf07ef4 | ||
|
a3081bce47 | ||
|
47d5529840 | ||
|
fa44794dfd | ||
|
04786d75da | ||
|
cb7b5747a8 | ||
|
8c26020f67 | ||
|
a0c97b25c4 | ||
|
1bcb945360 | ||
|
9159fe8705 | ||
|
1b9608aecf | ||
|
2ac8c24dac | ||
|
71ecac85b0 | ||
|
b9677d8655 | ||
|
6fde80d3e3 | ||
|
6c05360afa | ||
|
596696dfbc | ||
|
daa17a3e9c | ||
|
6061f47231 | ||
|
8b41764a3e | ||
|
ed0ffa5693 | ||
|
b1c03bce72 | ||
|
a63011b3f6 | ||
|
e41390eb64 | ||
|
66a36f6b15 | ||
|
3cb3f58084 | ||
|
39fb254e01 | ||
|
1f73a16ef3 | ||
|
e8b38f1101 | ||
|
8d7230ba16 | ||
|
2b02c82e7f | ||
|
b00e221675 | ||
|
62a45c29df | ||
|
6479d869e4 | ||
|
29057d6cdf | ||
|
180ea24e5a | ||
|
618e0a8e39 | ||
|
6d93cf9397 | ||
|
b1a16b334f | ||
|
0501359600 | ||
|
cc5be8608b | ||
|
292f44c086 | ||
|
2dd196c2c7 | ||
|
34eb878aae | ||
|
eec5c33a6b | ||
|
beebca4bf1 | ||
|
e2c4dbf721 | ||
|
704367f91e | ||
|
2639713709 | ||
|
b18e6e11f0 | ||
|
62dc8c71c1 | ||
|
697e9583b9 | ||
|
1b37604d3a | ||
|
1c3e2afa1e | ||
|
05b8e97b68 | ||
|
2a74a400e2 | ||
|
5ccddae7f5 | ||
|
12582ba2f5 | ||
|
35092562e6 | ||
|
e5dcd39225 | ||
|
660f0dea73 | ||
|
cd2b9af902 | ||
|
3378f3ab8c | ||
|
48680b9852 | ||
|
d555a02c90 | ||
|
143bb0a5f3 | ||
|
a1a7c76dc9 | ||
|
01b39aa68c | ||
|
e312868c4e | ||
|
b71bafd2cf | ||
|
22199df4d0 | ||
|
1703ed6ebb | ||
|
541b3c83b2 | ||
|
6afb0cb9d8 | ||
|
1c4e98425a | ||
|
fb7440ab5d | ||
|
0c91ac7367 | ||
|
708841f9f5 | ||
|
d4479d9baa | ||
|
08a5792a9a | ||
|
3b094ae449 | ||
|
0b094b594b | ||
|
8c1107c358 | ||
|
6df615281b | ||
|
49c8b14e59 | ||
|
bf0c777cb9 | ||
|
682552d152 | ||
|
c9be50f968 | ||
|
2eec653e99 | ||
|
85c08c5b68 | ||
|
c5841a241d | ||
|
d00ca800b2 | ||
|
8b42b28b70 | ||
|
e2362b8620 | ||
|
626669f95f | ||
|
497f5f71fc | ||
|
cd5f2eb71c | ||
|
ec082d6483 | ||
|
f8111547ae | ||
|
88fa71fbde | ||
|
6cd28ed46c | ||
|
92dcc8175d | ||
|
7131aa6fd7 | ||
|
7ce2b5121b | ||
|
a517f8d36e | ||
|
61dce89fbd | ||
|
88b7ec2c48 | ||
|
8bea98911e | ||
|
62ec8f6c1e | ||
|
148bcbba52 | ||
|
b3def8b5de | ||
|
77dde8a049 | ||
|
1b361ec27e | ||
|
58a1563a99 | ||
|
f638168033 | ||
|
b84f73c5c3 | ||
|
96e639dfd3 | ||
|
46b3ae1672 | ||
|
d0bc79442b | ||
|
17919c73a9 | ||
|
60d820b053 | ||
|
461534a966 | ||
|
d8b68ef68e | ||
|
c8dc020dc5 | ||
|
599909a760 | ||
|
d008b1e2f0 | ||
|
d7a03d907b | ||
|
a23dc001cd | ||
|
f93df85d03 | ||
|
e5b1068ed6 | ||
|
843c0d8cc5 | ||
|
56d2c4aa5f | ||
|
12f23e0150 | ||
|
72f41c5e05 | ||
|
5270071b94 | ||
|
5312603a88 | ||
|
ebe06cb8a9 | ||
|
6dd48de4ef | ||
|
e0f4f0e302 | ||
|
4acf3af002 | ||
|
ee704db2ff | ||
|
693408f1a6 | ||
|
0902e9e330 | ||
|
b2efffdaa4 | ||
|
7465cedee7 | ||
|
f5aef1b391 | ||
|
2e3496d3d4 | ||
|
be24c681ff | ||
|
efa525c102 | ||
|
f67cd9d7dc | ||
|
615997be38 | ||
|
4ba4b143e6 | ||
|
8c7b23be90 | ||
|
db797e3a52 |
45
.github/workflows/builds.yaml
vendored
Normal file
45
.github/workflows/builds.yaml
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
name: CI for Python versions
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
schedule:
|
||||
- cron: '0 16 * * 5'
|
||||
|
||||
jobs:
|
||||
linting:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/setup-python@v5
|
||||
- run: pip install ruff
|
||||
- run: |
|
||||
ruff check .
|
||||
build:
|
||||
needs: linting
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14.0-rc.2"]
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get install --no-install-recommends --no-install-suggests --yes \
|
||||
ffmpeg \
|
||||
gir1.2-gdkpixbuf-2.0 \
|
||||
gir1.2-poppler-0.18 \
|
||||
gir1.2-rsvg-2.0 \
|
||||
libimage-exiftool-perl \
|
||||
python3-gi-cairo \
|
||||
libcairo2-dev \
|
||||
libgirepository-2.0-dev \
|
||||
libgirepository1.0-dev \
|
||||
gobject-introspection \
|
||||
python3-mutagen
|
||||
pip install .
|
||||
- name: Build and run the testsuite
|
||||
run: python3 -m unittest discover -v
|
@@ -1,82 +0,0 @@
|
||||
variables:
|
||||
CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images
|
||||
|
||||
stages:
|
||||
- linting
|
||||
- test
|
||||
|
||||
.prepare_env: &prepare_env
|
||||
before_script: # This is needed to not run the testsuite as root
|
||||
- useradd --home-dir ${CI_PROJECT_DIR} mat2
|
||||
- chown -R mat2 .
|
||||
|
||||
linting:bandit:
|
||||
image: $CONTAINER_REGISTRY:linting
|
||||
stage: linting
|
||||
script: # TODO: remove B405 and B314
|
||||
- bandit ./mat2 --format txt --skip B101
|
||||
- bandit -r ./nautilus/ --format txt --skip B101
|
||||
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108
|
||||
|
||||
linting:codespell:
|
||||
image: $CONTAINER_REGISTRY:linting
|
||||
stage: linting
|
||||
script:
|
||||
# Run codespell to check for spelling errors; ignore errors about binary
|
||||
# files, use a config with ignored words and exclude the git directory,
|
||||
# which might contain false positives
|
||||
- codespell -q 2 -I utils/ci/codespell/ignored_words.txt -S .git
|
||||
|
||||
linting:pylint:
|
||||
image: $CONTAINER_REGISTRY:linting
|
||||
stage: linting
|
||||
script:
|
||||
- pylint --disable=no-else-return,no-else-raise,no-else-continue,unnecessary-comprehension --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
|
||||
# Once nautilus-python is in Debian, decomment it form the line below
|
||||
- pylint --disable=no-else-return,no-else-raise,no-else-continue,unnecessary-comprehension --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py
|
||||
|
||||
linting:pyflakes:
|
||||
image: $CONTAINER_REGISTRY:linting
|
||||
stage: linting
|
||||
script:
|
||||
- pyflakes3 ./libmat2 ./mat2 ./tests/ ./nautilus
|
||||
|
||||
linting:mypy:
|
||||
image: $CONTAINER_REGISTRY:linting
|
||||
stage: linting
|
||||
script:
|
||||
- mypy --ignore-missing-imports mat2 libmat2/*.py ./nautilus/mat2.py
|
||||
|
||||
tests:archlinux:
|
||||
image: $CONTAINER_REGISTRY:archlinux
|
||||
stage: test
|
||||
script:
|
||||
- python3 setup.py test
|
||||
|
||||
tests:debian:
|
||||
image: $CONTAINER_REGISTRY:debian
|
||||
stage: test
|
||||
script:
|
||||
- apt-get -qqy purge bubblewrap
|
||||
- python3 setup.py test
|
||||
|
||||
tests:debian_with_bubblewrap:
|
||||
image: $CONTAINER_REGISTRY:debian
|
||||
stage: test
|
||||
<<: *prepare_env
|
||||
script:
|
||||
- su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/"
|
||||
- su - mat2 -c "python3-coverage report --fail-under=100 -m --include 'libmat2/*'"
|
||||
|
||||
tests:fedora:
|
||||
image: $CONTAINER_REGISTRY:fedora
|
||||
stage: test
|
||||
script:
|
||||
- python3 setup.py test
|
||||
|
||||
tests:gentoo:
|
||||
image: $CONTAINER_REGISTRY:gentoo
|
||||
stage: test
|
||||
<<: *prepare_env
|
||||
script:
|
||||
- su - mat2 -c "python3 -m unittest discover -v"
|
17
.pylintrc
17
.pylintrc
@@ -1,17 +0,0 @@
|
||||
[FORMAT]
|
||||
good-names=e,f,i,x,s
|
||||
max-locals=20
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
disable=
|
||||
fixme,
|
||||
invalid-name,
|
||||
duplicate-code,
|
||||
missing-docstring,
|
||||
protected-access,
|
||||
abstract-method,
|
||||
wrong-import-position,
|
||||
catching-non-exception,
|
||||
cell-var-from-loop,
|
||||
locally-disabled,
|
||||
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
|
88
CHANGELOG.md
88
CHANGELOG.md
@@ -1,3 +1,91 @@
|
||||
# 0.13.5 - 2025-01-09
|
||||
- Keep orientation metadata on jpeg and tiff files
|
||||
- Improve cairo-related error/exceptions handling
|
||||
- Improve the logging
|
||||
- Improve the sandboxing
|
||||
- Improve Python3.12 support
|
||||
- Improve MSOffice documents handling
|
||||
|
||||
# 0.13.4 - 2023-08-02
|
||||
|
||||
- Add documentation about mat2 on OSX
|
||||
- Make use of python3.7 constructs to simplify code
|
||||
- Use more modern type annotations
|
||||
- Harden get_meta in archive.py against variants of CVE-2022-35410
|
||||
- Improve MSOffice document support
|
||||
- Package the manpage on pypi
|
||||
|
||||
# 0.13.3 - 2023-02-23
|
||||
|
||||
- Fix a decorator argument
|
||||
|
||||
# 0.13.2 - 2023-01-28
|
||||
|
||||
- Fix a crash on some python versions
|
||||
|
||||
# 0.13.1 - 2023-01-07
|
||||
|
||||
- Improve xlsx support
|
||||
- Remove the Nautilus extension
|
||||
|
||||
# 0.13.0 - 2022-07-06
|
||||
|
||||
- Fix an arbitrary file read (CVE-2022-35410)
|
||||
- Add support for heic files
|
||||
|
||||
# 0.12.4 - 2022-04-30
|
||||
|
||||
- Fix possible errors/crashes when processing multiple files
|
||||
via the command line interface
|
||||
- Use a fixed PDF version for the output
|
||||
- Improve compatibility with modern versions of rsvg
|
||||
- Improve the robustness of the command line interface with
|
||||
regard to control characters
|
||||
|
||||
# 0.12.3 - 2022-01-06
|
||||
|
||||
- Implement code for internationalization
|
||||
- Keep individual files compression type in zip files
|
||||
- Increase the robustness of mat2 against weird/corrupted files
|
||||
- Fix the dolphin integration
|
||||
- Add a fuzzer
|
||||
|
||||
# 0.12.2 - 2021-08-29
|
||||
|
||||
- Add support for aiff files
|
||||
- Improve MS Office support
|
||||
- Improve compatibility with newer/older version of mat2's dependencies
|
||||
- Fix possible issues with the resolution of processed pdf
|
||||
|
||||
# 0.12.1 - 2021-03-19
|
||||
|
||||
- Improve epub support
|
||||
- Improve MS Office support
|
||||
|
||||
# 0.12.0 - 2020-12-18
|
||||
|
||||
- Improve significantly MS Office formats support
|
||||
- Fix some typos in the Nautilus extension
|
||||
- Improve reliability of the mp3, pdf and svg parsers
|
||||
- Improve compatibility with ffmpeg when sandboxing is used
|
||||
- Improve the dolphin extension usability
|
||||
- libmat2 now raises a ValueError on malformed files while trying to
|
||||
find the right parser, instead of returning None
|
||||
|
||||
# 0.11.0 - 2020-03-29
|
||||
|
||||
- Improve significantly MS Office formats support
|
||||
- Refactor how mat2 looks for executables
|
||||
|
||||
# 0.10.1 - 2020-02-09
|
||||
|
||||
- Improve the documentation and the manpage
|
||||
- Improve the robustness of css, html, png, gdk-based, exiftool-based parsers
|
||||
- Future-proof a bit the testsuite
|
||||
- Handle tiff files with a .tif extension
|
||||
- Improve the sandbox's usability
|
||||
- Add support for wav files
|
||||
|
||||
# 0.10.0 - 2019-11-30
|
||||
|
||||
- Make mat2 work on Python3.8
|
||||
|
@@ -1,11 +1,17 @@
|
||||
# Contributing to mat2
|
||||
|
||||
The main repository for mat2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
|
||||
The main repository for mat2 is on [github]( https://github.com/jvoisin/mat2 ),
|
||||
but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer.
|
||||
|
||||
Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
|
||||
and to send a pull-request. Please do check that everything is fine by running the
|
||||
testsuite with `python3 -m unittest discover -v` before submitting one :)
|
||||
Do feel free to pick up [an issue]( https://github.com/jvoisin/mat2/issues )
|
||||
and to send a pull-request.
|
||||
|
||||
Before sending the pull-request, please do check that everything is fine by
|
||||
running the full test suite in GitLab. To do that, after forking mat2 in GitLab,
|
||||
you need to go in Settings -> CI/CD -> Runner and there enable shared runners.
|
||||
|
||||
Mat2 also has unit tests (that are also run in the full test suite). You can run
|
||||
them with `python3 -m unittest discover -v`.
|
||||
|
||||
If you're fixing a bug or adding a new feature, please add tests accordingly,
|
||||
this will greatly improve the odds of your merge-request getting merged.
|
||||
@@ -21,18 +27,19 @@ Since mat2 is written in Python3, please conform as much as possible to the
|
||||
|
||||
# Doing a release
|
||||
|
||||
1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md)
|
||||
2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file
|
||||
3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file
|
||||
4. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1)
|
||||
5. Commit the changelog, man page, mat2 and setup.py files
|
||||
6. Create a tag with `git tag -s $VERSION`
|
||||
7. Push the commit with `git push origin master`
|
||||
8. Push the tag with `git push --tags`
|
||||
9. Download the gitlab archive of the release
|
||||
10. Diff it against the local copy
|
||||
11. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
|
||||
12. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
|
||||
13. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
|
||||
14. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
|
||||
15. Do the secret release dance
|
||||
1. Update the [changelog](https://github.com/jvoisin/mat2/blob/master/CHANGELOG.md)
|
||||
2. Update the version in the [mat2](https://github.com/jvoisin/mat2/blob/master/mat2) file
|
||||
3. Update the version in the [setup.py](https://github.com/jvoisin/mat2/blob/master/setup.py) file
|
||||
4. Update the version in the [pyproject.toml](https://github.com/jvoisin/mat2/blob/master/pyproject.toml) file
|
||||
5. Update the version and date in the [man page](https://github.com/jvoisin/mat2/blob/master/doc/mat2.1)
|
||||
6. Commit the modified files
|
||||
7. Create a tag with `git tag -s $VERSION`
|
||||
8. Push the commit with `git push origin master`
|
||||
9. Push the tag with `git push --tags`
|
||||
10. Download the gitlab archive of the release
|
||||
11. Diff it against the local copy
|
||||
12. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
|
||||
13. Upload the signature on Gitlab's [tag page](https://github.com/jvoisin/mat2/tags) and add the changelog there
|
||||
14. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
|
||||
15. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
|
||||
16. Do the secret release dance
|
||||
|
39
INSTALL.md
39
INSTALL.md
@@ -18,34 +18,53 @@ installed, mat2 uses it to sandbox any external processes it invokes.
|
||||
|
||||
## Arch Linux
|
||||
|
||||
Thanks to [Francois_B](https://www.sciunto.org/), there is an package available on
|
||||
[Arch linux's AUR](https://aur.archlinux.org/packages/mat2/).
|
||||
Thanks to [kpcyrd](https://archlinux.org/packages/?maintainer=kpcyrd), there is a package available on
|
||||
[Arch Linux's official repositories](https://archlinux.org/packages/extra/any/mat2/).
|
||||
|
||||
## Debian
|
||||
|
||||
There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all).
|
||||
There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all) and you can install mat2 with:
|
||||
|
||||
```
|
||||
apt install mat2
|
||||
```
|
||||
|
||||
## Fedora
|
||||
|
||||
Thanks to [atenart](https://ack.tf/), there is a package available on
|
||||
[Fedora's copr]( https://copr.fedorainfracloud.org/coprs/atenart/mat2/ ).
|
||||
|
||||
We use copr (cool other packages repo) as the Mat2 Nautilus plugin depends on
|
||||
python3-nautilus, which isn't available yet in Fedora (but is distributed
|
||||
through this copr).
|
||||
|
||||
First you need to enable Mat2's copr:
|
||||
First you need to enable mat2's copr:
|
||||
|
||||
```
|
||||
dnf -y copr enable atenart/mat2
|
||||
```
|
||||
|
||||
Then you can install both the Mat2 command and Nautilus extension:
|
||||
Then you can install mat2:
|
||||
|
||||
```
|
||||
dnf -y install mat2 mat2-nautilus
|
||||
dnf -y install mat2
|
||||
```
|
||||
|
||||
## Gentoo
|
||||
|
||||
mat2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay).
|
||||
|
||||
|
||||
# OSX
|
||||
|
||||
## Homebrew
|
||||
|
||||
mat2 is [available on homebrew](https://formulae.brew.sh/formula/mat2):
|
||||
|
||||
```
|
||||
brew install mat2
|
||||
```
|
||||
|
||||
## MacPorts
|
||||
|
||||
mat2 is [available on MacPorts](https://ports.macports.org/port/mat2/):
|
||||
|
||||
```
|
||||
port install mat2
|
||||
```
|
||||
|
156
README.md
156
README.md
@@ -1,155 +1 @@
|
||||
```
|
||||
_____ _____ _____ ___
|
||||
| | _ |_ _|_ | Keep your data,
|
||||
| | | | | | | | _| trash your meta!
|
||||
|_|_|_|__|__| |_| |___|
|
||||
|
||||
```
|
||||
|
||||
This software is currently in **beta**, please don't use it for anything
|
||||
critical.
|
||||
|
||||
# Metadata and privacy
|
||||
|
||||
Metadata consist of information that characterizes data.
|
||||
Metadata are used to provide documentation for data products.
|
||||
In essence, metadata answer who, what, when, where, why, and how about
|
||||
every facet of the data that are being documented.
|
||||
|
||||
Metadata within a file can tell a lot about you.
|
||||
Cameras record data about when a picture was taken and what
|
||||
camera was used. Office documents like PDF or Office automatically adds
|
||||
author and company information to documents and spreadsheets.
|
||||
Maybe you don't want to disclose those information.
|
||||
|
||||
This is precisely the job of mat2: getting rid, as much as possible, of
|
||||
metadata.
|
||||
|
||||
mat2 provides both a command line tool, and a graphical user interface
|
||||
via an extension for Nautilus, the default file manager of GNOME.
|
||||
|
||||
# Requirements
|
||||
|
||||
- `python3-mutagen` for audio support
|
||||
- `python3-gi-cairo` and `gir1.2-poppler-0.18` for PDF support
|
||||
- `gir1.2-gdkpixbuf-2.0` for images support
|
||||
- `gir1.2-rsvg-2.0` for svg support
|
||||
- `FFmpeg`, optionally, for video support
|
||||
- `libimage-exiftool-perl` for everything else
|
||||
- `bubblewrap`, optionally, for sandboxing
|
||||
|
||||
Please note that mat2 requires at least Python3.5.
|
||||
|
||||
# Running the test suite
|
||||
|
||||
```bash
|
||||
$ python3 -m unittest discover -v
|
||||
```
|
||||
|
||||
And if you want to see the coverage:
|
||||
|
||||
```bash
|
||||
$ python3-coverage run --branch -m unittest discover -s tests/
|
||||
$ python3-coverage report --include -m --include /libmat2/*'
|
||||
```
|
||||
|
||||
# How to use mat2
|
||||
|
||||
```bash
|
||||
usage: mat2 [-h] [-v] [-l] [--check-dependencies] [-V]
|
||||
[--unknown-members policy] [-s | -L]
|
||||
[files [files ...]]
|
||||
|
||||
Metadata anonymisation toolkit 2
|
||||
|
||||
positional arguments:
|
||||
files the files to process
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-v, --version show program's version number and exit
|
||||
-l, --list list all supported fileformats
|
||||
--check-dependencies check if mat2 has all the dependencies it needs
|
||||
-V, --verbose show more verbose status information
|
||||
--unknown-members policy
|
||||
how to handle unknown members of archive-style files
|
||||
(policy should be one of: abort, omit, keep) [Default:
|
||||
abort]
|
||||
-s, --show list harmful metadata detectable by mat2 without
|
||||
removing them
|
||||
-L, --lightweight remove SOME metadata
|
||||
```
|
||||
|
||||
Note that mat2 **will not** clean files in-place, but will produce, for
|
||||
example, with a file named "myfile.png" a cleaned version named
|
||||
"myfile.cleaned.png".
|
||||
|
||||
# Notes about detecting metadata
|
||||
|
||||
While mat2 is doing its very best to display metadata when the `--show` flag is
|
||||
passed, it doesn't mean that a file is clean from any metadata if mat2 doesn't
|
||||
show any. There is no reliable way to detect every single possible metadata for
|
||||
complex file formats.
|
||||
|
||||
This is why you shouldn't rely on metadata's presence to decide if your file must
|
||||
be cleaned or not.
|
||||
|
||||
# Notes about the lightweight mode
|
||||
|
||||
By default, mat2 might alter a bit the data of your files, in order to remove
|
||||
as much metadata as possible. For example, texts in PDF might not be selectable anymore,
|
||||
compressed images might get compressed again, …
|
||||
Since some users might be willing to trade some metadata's presence in exchange
|
||||
of the guarantee that mat2 won't modify the data of their files, there is the
|
||||
`-L` flag that precisely does that.
|
||||
|
||||
# Related software
|
||||
|
||||
- The first iteration of [MAT](https://mat.boum.org)
|
||||
- [Exiftool](https://sno.phy.queensu.ca/~phil/exiftool/mat)
|
||||
- [pdf-redact-tools](https://github.com/firstlookmedia/pdf-redact-tools), that
|
||||
tries to deal with *printer dots* too.
|
||||
- [pdfparanoia](https://github.com/kanzure/pdfparanoia), that removes
|
||||
watermarks from PDF.
|
||||
- [Scrambled Exif](https://f-droid.org/packages/com.jarsilio.android.scrambledeggsif/),
|
||||
an open-source Android application to remove metadata from pictures.
|
||||
|
||||
# Contact
|
||||
|
||||
If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues)
|
||||
or the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
|
||||
Should a more private contact be needed (eg. for reporting security issues),
|
||||
you can email Julien (jvoisin) Voisin at `julien.voisin+mat2@dustri.org`,
|
||||
using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`.
|
||||
|
||||
# License
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Copyright 2018 Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org>
|
||||
Copyright 2016 Marie-Rose for mat2's logo
|
||||
|
||||
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
|
||||
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
|
||||
|
||||
# Thanks
|
||||
|
||||
mat2 wouldn't exist without:
|
||||
|
||||
- the [Google Summer of Code](https://summerofcode.withgoogle.com/);
|
||||
- the fine people from [Tails]( https://tails.boum.org);
|
||||
- friends
|
||||
|
||||
Many thanks to them!
|
||||
|
||||
# This repository is deprecated, please use https://github.com/jvoisin/mat2 instead
|
@@ -19,14 +19,14 @@ details.
|
||||
# jpegoptim, optipng, …
|
||||
|
||||
While designed to reduce as much as possible the size of pictures,
|
||||
those software can be used to remove metadata. They usually have very good
|
||||
those software can be used to remove metadata. They usually have excellent
|
||||
support for a single picture format, and can be used in place of mat2 for them.
|
||||
|
||||
|
||||
# PDF Redact Tools
|
||||
|
||||
[PDF Redact Tools](https://github.com/firstlookmedia/pdf-redact-tools) is
|
||||
a software developed by the people from [First Look
|
||||
software developed by the people from [First Look
|
||||
Media](https://firstlook.media/), the entity behind, amongst other things,
|
||||
[The Intercept](https://theintercept.com/).
|
||||
|
||||
@@ -34,13 +34,13 @@ The tool uses roughly the same approach than mat2 to deal with PDF,
|
||||
which is unfortunately the only fileformat that it does support.
|
||||
It's interesting to note that it has counter-measures against
|
||||
[yellow dots](https://en.wikipedia.org/wiki/Machine_Identification_Code),
|
||||
a capacity that mat2 [doesn't possess yet](https://0xacab.org/jvoisin/mat2/issues/43).
|
||||
a capacity that mat2 doesn't have.
|
||||
|
||||
|
||||
# Exiv2
|
||||
|
||||
[Exiv2](https://www.exiv2.org/) was considered for mat2,
|
||||
but it currently [misses a lot of metadata](https://0xacab.org/jvoisin/mat2/issues/85)
|
||||
but it currently misses a lot of metadata.
|
||||
|
||||
|
||||
# Others non open source software/online service
|
||||
|
22
doc/mat2.1
22
doc/mat2.1
@@ -1,4 +1,4 @@
|
||||
.TH mat2 "1" "November 2019" "mat2 0.10.0" "User Commands"
|
||||
.TH mat2 "1" "January 2025" "mat2 0.13.5" "User Commands"
|
||||
|
||||
.SH NAME
|
||||
mat2 \- the metadata anonymisation toolkit 2
|
||||
@@ -62,11 +62,29 @@ mat2 ./myfile.pdf
|
||||
.fi
|
||||
.PP
|
||||
|
||||
.SH NOTES ABOUT METADATA
|
||||
|
||||
While mat2 is doing its very best to display metadata when the --show flag is
|
||||
passed, it doesn't mean that a file is clean from any metadata if mat2 doesn't
|
||||
show any. There is no reliable way to detect every single possible metadata for
|
||||
complex file formats.
|
||||
.PP
|
||||
This is why you shouldn't rely on metadata's presence to decide if your file must
|
||||
be cleaned or not.
|
||||
.PP
|
||||
Moreover, mat2 goes to great lengths to make sure that as much metadata as
|
||||
possible are removed. This might sometimes result in a loss of quality of the
|
||||
processed files. For example, a text-based pdf file converted into an image-based
|
||||
one means that it will no longer be possible to select text in it. If you're
|
||||
experiencing this, you might want to give the lightweight cleaning mode a try,
|
||||
but keep in mind by doing so, some metadata \fBwon't be cleaned\fR.
|
||||
|
||||
|
||||
.SH BUGS
|
||||
|
||||
While mat2 does its very best to remove every single metadata,
|
||||
it's still in beta, and \fBsome\fR might remain. Should you encounter
|
||||
some issues, check the bugtracker: https://0xacab.org/jvoisin/mat2/issues
|
||||
some issues, check the bugtracker: https://github.com/jvoisin/mat2/issues
|
||||
.PP
|
||||
Please use accordingly and be careful.
|
||||
|
||||
|
@@ -5,7 +5,7 @@ Thanks to [Miguel Marco](https://riemann.unizar.es/~mmarco/), here is an neat
|
||||
integration for [Dolphin](https://kde.org/applications/system/org.kde.dolphin),
|
||||
the KDE file manager:
|
||||
|
||||
1. Add the `mat.desktop` file either in
|
||||
1. Add the `mat2.desktop` file either in
|
||||
- `/usr/share/kservices5/ServiceMenus/` to install it globally
|
||||
- `~/.local/share/kservices5/ServiceMenus/` for a specific user
|
||||
2. Run `kbuildsycoca5` to update the corresponding database
|
||||
|
@@ -1,11 +1,13 @@
|
||||
[Desktop Entry]
|
||||
X-KDE-ServiceTypes=KonqPopupMenu/Plugin
|
||||
MimeType=application/pdf;application/vnd.oasis.opendocument.chart ;application/vnd.oasis.opendocument.formula ;application/vnd.oasis.opendocument.graphics ;application/vnd.oasis.opendocument.image ;application/vnd.oasis.opendocument.presentation ;application/vnd.oasis.opendocument.spreadsheet ;application/vnd.oasis.opendocument.text ;application/vnd.openxmlformats-officedocument.presentationml.presentation ;application/vnd.openxmlformats-officedocument.spreadsheetml.sheet ;application/vnd.openxmlformats-officedocument.wordprocessingml.document ;application/x-bittorrent ;application/zip ;audio/flac ;audio/mpeg ;audio/ogg ;audio/x-flac ;image/jpeg ;image/png ;image/tiff ;image/x-ms-bmp ;text/plain ;video/mp4 ;video/x-msvideo;
|
||||
MimeType=application/pdf;application/vnd.oasis.opendocument.chart;application/vnd.oasis.opendocument.formula;application/vnd.oasis.opendocument.graphics;application/vnd.oasis.opendocument.image;application/vnd.oasis.opendocument.presentation;application/vnd.oasis.opendocument.spreadsheet;application/vnd.oasis.opendocument.text;application/vnd.openxmlformats-officedocument.presentationml.presentation;application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;application/vnd.openxmlformats-officedocument.wordprocessingml.document;application/x-bittorrent;application/zip;audio/flac;audio/mpeg;audio/ogg;audio/x-flac;image/jpeg;image/png;image/tiff;image/x-ms-bmp;text/plain;video/mp4;video/x-msvideo;
|
||||
Actions=cleanMetadata;
|
||||
Type=Service
|
||||
|
||||
[Desktop Action cleanMetadata]
|
||||
Name=Clean metadata
|
||||
Name[de]=Metadaten löschen
|
||||
Name[es]=Limpiar metadatos
|
||||
Icon=/usr/share/icons/hicolor/scalable/apps/mat2.svg
|
||||
Exec=kdialog --yesno "$( mat2 -s %U )" --title "Clean Metadata?" && mat2 %U
|
||||
Exec=kdialog --yesno "$( mat2 -s %F )" --title "Clean Metadata?" && mat2 %U
|
||||
Exec[de]=kdialog --yesno "$( mat2 -s %F )" --title "Metadaten löschen?" && mat2 %U
|
||||
|
@@ -2,15 +2,10 @@
|
||||
|
||||
import enum
|
||||
import importlib
|
||||
from typing import Dict, Optional, Union
|
||||
from typing import Dict
|
||||
|
||||
from . import exiftool, video
|
||||
|
||||
# make pyflakes happy
|
||||
assert Dict
|
||||
assert Optional
|
||||
assert Union
|
||||
|
||||
# A set of extension that aren't supported, despite matching a supported mimetype
|
||||
UNSUPPORTED_EXTENSIONS = {
|
||||
'.asc',
|
||||
@@ -67,8 +62,9 @@ CMD_DEPENDENCIES = {
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def check_dependencies() -> Dict[str, Dict[str, bool]]:
|
||||
ret = dict() # type: Dict[str, dict]
|
||||
ret: Dict[str, Dict] = dict()
|
||||
|
||||
for key, value in DEPENDENCIES.items():
|
||||
ret[key] = {
|
||||
|
@@ -1,9 +1,7 @@
|
||||
import abc
|
||||
import os
|
||||
import re
|
||||
from typing import Set, Dict, Union
|
||||
|
||||
assert Set # make pyflakes happy
|
||||
from typing import Union, Set, Dict
|
||||
|
||||
|
||||
class AbstractParser(abc.ABC):
|
||||
@@ -11,8 +9,8 @@ class AbstractParser(abc.ABC):
|
||||
It might yield `ValueError` on instantiation on invalid files,
|
||||
and `RuntimeError` when something went wrong in `remove_all`.
|
||||
"""
|
||||
meta_list = set() # type: Set[str]
|
||||
mimetypes = set() # type: Set[str]
|
||||
meta_list: Set[str] = set()
|
||||
mimetypes: Set[str] = set()
|
||||
|
||||
def __init__(self, filename: str) -> None:
|
||||
"""
|
||||
@@ -35,8 +33,11 @@ class AbstractParser(abc.ABC):
|
||||
self.sandbox = True
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
"""Return all the metadata of the current file"""
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
"""Return all the metadata of the current file
|
||||
|
||||
:raises RuntimeError: Raised if the cleaning process went wrong.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def remove_all(self) -> bool:
|
||||
|
@@ -7,14 +7,10 @@ import tempfile
|
||||
import os
|
||||
import logging
|
||||
import shutil
|
||||
from typing import Dict, Set, Pattern, Union, Any, List
|
||||
from typing import Pattern, Union, Any, Set, Dict, List
|
||||
|
||||
from . import abstract, UnknownMemberPolicy, parser_factory
|
||||
|
||||
# Make pyflakes happy
|
||||
assert Set
|
||||
assert Pattern
|
||||
|
||||
# pylint: disable=not-callable,assignment-from-no-return,too-many-branches
|
||||
|
||||
# An ArchiveClass is a class representing an archive,
|
||||
@@ -48,20 +44,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
# We ignore typing here because mypy is too stupid
|
||||
self.archive_class = None # type: ignore
|
||||
self.member_class = None # type: ignore
|
||||
self.archive_class = None # type: ignore
|
||||
self.member_class = None # type: ignore
|
||||
|
||||
# Those are the files that have a format that _isn't_
|
||||
# supported by mat2, but that we want to keep anyway.
|
||||
self.files_to_keep = set() # type: Set[Pattern]
|
||||
self.files_to_keep: Set[Pattern] = set()
|
||||
|
||||
# Those are the files that we _do not_ want to keep,
|
||||
# no matter if they are supported or not.
|
||||
self.files_to_omit = set() # type: Set[Pattern]
|
||||
self.files_to_omit: Set[Pattern] = set()
|
||||
|
||||
# what should the parser do if it encounters an unknown file in
|
||||
# the archive?
|
||||
self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
|
||||
self.unknown_member_policy: UnknownMemberPolicy = UnknownMemberPolicy.ABORT
|
||||
|
||||
# The LGTM comment is to mask a false-positive,
|
||||
# see https://lgtm.com/projects/g/jvoisin/mat2/
|
||||
@@ -73,15 +69,22 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
def _specific_cleanup(self, full_path: str) -> bool:
|
||||
""" This method can be used to apply specific treatment
|
||||
to files present in the archive."""
|
||||
# pylint: disable=unused-argument,no-self-use
|
||||
# pylint: disable=unused-argument
|
||||
return True # pragma: no cover
|
||||
|
||||
def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
|
||||
""" This method can be used to extract specific metadata
|
||||
from files present in the archive."""
|
||||
# pylint: disable=unused-argument,no-self-use
|
||||
# pylint: disable=unused-argument
|
||||
return {} # pragma: no cover
|
||||
|
||||
def _final_checks(self) -> bool:
|
||||
""" This method is invoked after the file has been cleaned,
|
||||
allowing final verifications to be run.
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
@abc.abstractmethod
|
||||
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
|
||||
@@ -102,6 +105,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
def _get_member_name(member: ArchiveMember) -> str:
|
||||
"""Return the name of the given member."""
|
||||
|
||||
@staticmethod
|
||||
@abc.abstractmethod
|
||||
def _is_dir(member: ArchiveMember) -> bool:
|
||||
"""Return true if the given member is a directory."""
|
||||
|
||||
@abc.abstractmethod
|
||||
def _add_file_to_archive(self, archive: ArchiveClass, member: ArchiveMember,
|
||||
full_path: str):
|
||||
@@ -113,8 +121,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
# pylint: disable=unused-argument
|
||||
return member
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
meta = dict() # type: Dict[str, Union[str, dict]]
|
||||
@staticmethod
|
||||
def _get_member_compression(member: ArchiveMember):
|
||||
"""Get the compression of the archive member."""
|
||||
# pylint: disable=unused-argument
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _set_member_compression(member: ArchiveMember, compression) -> ArchiveMember:
|
||||
"""Set the compression of the archive member."""
|
||||
# pylint: disable=unused-argument
|
||||
return member
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
meta: Dict[str, Union[str, Dict]] = dict()
|
||||
|
||||
with self.archive_class(self.filename) as zin:
|
||||
temp_folder = tempfile.mkdtemp()
|
||||
@@ -123,12 +143,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
local_meta = self._get_member_meta(item)
|
||||
member_name = self._get_member_name(item)
|
||||
|
||||
if member_name[-1] == '/': # pragma: no cover
|
||||
# `is_dir` is added in Python3.6
|
||||
if self._is_dir(item): # pragma: no cover
|
||||
continue # don't keep empty folders
|
||||
|
||||
zin.extract(member=item, path=temp_folder)
|
||||
full_path = os.path.join(temp_folder, member_name)
|
||||
if not os.path.abspath(full_path).startswith(temp_folder):
|
||||
logging.error("%s contains a file (%s) pointing outside (%s) of its root.",
|
||||
self.filename, member_name, full_path)
|
||||
break
|
||||
|
||||
try:
|
||||
zin.extract(member=item, path=temp_folder)
|
||||
except OSError as e:
|
||||
logging.error("Unable to extraxt %s from %s: %s", item, self.filename, e)
|
||||
|
||||
os.chmod(full_path, stat.S_IRUSR)
|
||||
|
||||
specific_meta = self._specific_get_meta(full_path, member_name)
|
||||
@@ -136,6 +164,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
|
||||
member_parser, _ = parser_factory.get_parser(full_path) # type: ignore
|
||||
if member_parser:
|
||||
member_parser.sandbox = self.sandbox
|
||||
local_meta = {**local_meta, **member_parser.get_meta()}
|
||||
|
||||
if local_meta:
|
||||
@@ -155,12 +184,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
|
||||
# Sort the items to process, to reduce fingerprinting,
|
||||
# and keep them in the `items` variable.
|
||||
items = list() # type: List[ArchiveMember]
|
||||
items: List[ArchiveMember] = list()
|
||||
for item in sorted(self._get_all_members(zin), key=self._get_member_name):
|
||||
# Some file formats require having the `mimetype` file
|
||||
# as the first file in the archive.
|
||||
if self._get_member_name(item) == 'mimetype':
|
||||
items = [item] + items
|
||||
items.insert(0, item)
|
||||
else:
|
||||
items.append(item)
|
||||
|
||||
@@ -168,18 +197,36 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
# we're iterating (and thus inserting) them in lexicographic order.
|
||||
for item in items:
|
||||
member_name = self._get_member_name(item)
|
||||
if member_name[-1] == '/': # `is_dir` is added in Python3.6
|
||||
if self._is_dir(item):
|
||||
continue # don't keep empty folders
|
||||
|
||||
zin.extract(member=item, path=temp_folder)
|
||||
full_path = os.path.join(temp_folder, member_name)
|
||||
if not os.path.abspath(full_path).startswith(temp_folder):
|
||||
logging.error("%s contains a file (%s) pointing outside (%s) of its root.",
|
||||
self.filename, member_name, full_path)
|
||||
abort = True
|
||||
break
|
||||
|
||||
zin.extract(member=item, path=temp_folder)
|
||||
|
||||
try:
|
||||
original_permissions = os.stat(full_path).st_mode
|
||||
except FileNotFoundError:
|
||||
logging.error("Something went wrong during processing of "
|
||||
"%s in %s, likely a path traversal attack.",
|
||||
member_name, self.filename)
|
||||
abort = True
|
||||
# we're breaking instead of continuing, because this exception
|
||||
# is raised in case of weird path-traversal-like attacks.
|
||||
break
|
||||
|
||||
original_permissions = os.stat(full_path).st_mode
|
||||
os.chmod(full_path, original_permissions | stat.S_IWUSR | stat.S_IRUSR)
|
||||
|
||||
original_compression = self._get_member_compression(item)
|
||||
|
||||
if self._specific_cleanup(full_path) is False:
|
||||
logging.warning("Something went wrong during deep cleaning of %s",
|
||||
member_name)
|
||||
logging.warning("Something went wrong during deep cleaning of %s in %s",
|
||||
member_name, self.filename)
|
||||
abort = True
|
||||
continue
|
||||
|
||||
@@ -205,6 +252,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
abort = True
|
||||
continue
|
||||
else:
|
||||
member_parser.sandbox = self.sandbox
|
||||
if member_parser.remove_all() is False:
|
||||
logging.warning("In file %s, something went wrong \
|
||||
with the cleaning of %s \
|
||||
@@ -216,6 +264,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
|
||||
zinfo = self.member_class(member_name) # type: ignore
|
||||
zinfo = self._set_member_permissions(zinfo, original_permissions)
|
||||
zinfo = self._set_member_compression(zinfo, original_compression)
|
||||
clean_zinfo = self._clean_member(zinfo)
|
||||
self._add_file_to_archive(zout, clean_zinfo, full_path)
|
||||
|
||||
@@ -223,11 +272,14 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
if abort:
|
||||
os.remove(self.output_filename)
|
||||
return False
|
||||
if not self._final_checks():
|
||||
return False # pragma: no cover
|
||||
return True
|
||||
|
||||
|
||||
class TarParser(ArchiveBasedAbstractParser):
|
||||
mimetypes = {'application/x-tar'}
|
||||
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
# yes, it's tarfile.open and not tarfile.TarFile,
|
||||
@@ -337,6 +389,11 @@ class TarParser(ArchiveBasedAbstractParser):
|
||||
member.mode = permissions
|
||||
return member
|
||||
|
||||
@staticmethod
|
||||
def _is_dir(member: ArchiveMember) -> bool:
|
||||
assert isinstance(member, tarfile.TarInfo) # please mypy
|
||||
return member.isdir()
|
||||
|
||||
|
||||
class TarGzParser(TarParser):
|
||||
compression = ':gz'
|
||||
@@ -355,16 +412,17 @@ class TarXzParser(TarParser):
|
||||
|
||||
class ZipParser(ArchiveBasedAbstractParser):
|
||||
mimetypes = {'application/zip'}
|
||||
def __init__(self, filename):
|
||||
|
||||
def __init__(self, filename: str):
|
||||
super().__init__(filename)
|
||||
self.archive_class = zipfile.ZipFile
|
||||
self.member_class = zipfile.ZipInfo
|
||||
self.zip_compression_type = zipfile.ZIP_DEFLATED
|
||||
|
||||
def is_archive_valid(self):
|
||||
try:
|
||||
zipfile.ZipFile(self.filename)
|
||||
except zipfile.BadZipFile:
|
||||
with zipfile.ZipFile(self.filename):
|
||||
pass
|
||||
except (zipfile.BadZipFile, OSError):
|
||||
raise ValueError
|
||||
|
||||
@staticmethod
|
||||
@@ -400,7 +458,7 @@ class ZipParser(ArchiveBasedAbstractParser):
|
||||
assert isinstance(member, zipfile.ZipInfo) # please mypy
|
||||
with open(full_path, 'rb') as f:
|
||||
archive.writestr(member, f.read(),
|
||||
compress_type=self.zip_compression_type)
|
||||
compress_type=member.compress_type)
|
||||
|
||||
@staticmethod
|
||||
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
|
||||
@@ -411,3 +469,19 @@ class ZipParser(ArchiveBasedAbstractParser):
|
||||
def _get_member_name(member: ArchiveMember) -> str:
|
||||
assert isinstance(member, zipfile.ZipInfo) # please mypy
|
||||
return member.filename
|
||||
|
||||
@staticmethod
|
||||
def _get_member_compression(member: ArchiveMember):
|
||||
assert isinstance(member, zipfile.ZipInfo) # please mypy
|
||||
return member.compress_type
|
||||
|
||||
@staticmethod
|
||||
def _set_member_compression(member: ArchiveMember, compression) -> ArchiveMember:
|
||||
assert isinstance(member, zipfile.ZipInfo) # please mypy
|
||||
member.compress_type = compression
|
||||
return member
|
||||
|
||||
@staticmethod
|
||||
def _is_dir(member: ArchiveMember) -> bool:
|
||||
assert isinstance(member, zipfile.ZipInfo) # please mypy
|
||||
return member.is_dir()
|
||||
|
@@ -2,42 +2,51 @@ import mimetypes
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from typing import Dict, Union
|
||||
from typing import Union, Dict
|
||||
|
||||
import mutagen
|
||||
|
||||
from . import abstract, parser_factory
|
||||
from . import abstract, parser_factory, video
|
||||
|
||||
|
||||
class MutagenParser(abstract.AbstractParser):
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
try:
|
||||
mutagen.File(self.filename)
|
||||
if mutagen.File(self.filename) is None:
|
||||
raise ValueError
|
||||
except mutagen.MutagenError:
|
||||
raise ValueError
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
f = mutagen.File(self.filename)
|
||||
if f.tags:
|
||||
return {k:', '.join(v) for k, v in f.tags.items()}
|
||||
return {k: ', '.join(map(str, v)) for k, v in f.tags.items()}
|
||||
return {}
|
||||
|
||||
def remove_all(self) -> bool:
|
||||
shutil.copy(self.filename, self.output_filename)
|
||||
f = mutagen.File(self.output_filename)
|
||||
f.delete()
|
||||
f.save()
|
||||
try:
|
||||
f.delete()
|
||||
f.save()
|
||||
except mutagen.MutagenError:
|
||||
raise ValueError
|
||||
return True
|
||||
|
||||
|
||||
class MP3Parser(MutagenParser):
|
||||
mimetypes = {'audio/mpeg', }
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
metadata = {} # type: Dict[str, Union[str, dict]]
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
metadata: Dict[str, Union[str, Dict]] = dict()
|
||||
meta = mutagen.File(self.filename).tags
|
||||
if not meta:
|
||||
return metadata
|
||||
for key in meta:
|
||||
if isinstance(key, tuple):
|
||||
metadata[key[0]] = key[1]
|
||||
continue
|
||||
if not hasattr(meta[key], 'text'): # pragma: no cover
|
||||
continue
|
||||
metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))
|
||||
@@ -59,12 +68,12 @@ class FLACParser(MutagenParser):
|
||||
f.save(deleteid3=True)
|
||||
return True
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
meta = super().get_meta()
|
||||
for num, picture in enumerate(mutagen.File(self.filename).pictures):
|
||||
name = picture.desc if picture.desc else 'Cover %d' % num
|
||||
extension = mimetypes.guess_extension(picture.mime)
|
||||
if extension is None: # pragma: no cover
|
||||
if extension is None: # pragma: no cover
|
||||
meta[name] = 'harmful data'
|
||||
continue
|
||||
|
||||
@@ -73,7 +82,33 @@ class FLACParser(MutagenParser):
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(picture.data)
|
||||
p, _ = parser_factory.get_parser(fname) # type: ignore
|
||||
if p is None:
|
||||
raise ValueError
|
||||
p.sandbox = self.sandbox
|
||||
# Mypy chokes on ternaries :/
|
||||
meta[name] = p.get_meta() if p else 'harmful data' # type: ignore
|
||||
os.remove(fname)
|
||||
return meta
|
||||
|
||||
|
||||
class WAVParser(video.AbstractFFmpegParser):
|
||||
mimetypes = {'audio/x-wav', }
|
||||
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
|
||||
'Duration', 'Encoding', 'ExifToolVersion',
|
||||
'FileAccessDate', 'FileInodeChangeDate',
|
||||
'FileModifyDate', 'FileName', 'FilePermissions',
|
||||
'FileSize', 'FileType', 'FileTypeExtension',
|
||||
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
|
||||
}
|
||||
|
||||
|
||||
class AIFFParser(video.AbstractFFmpegParser):
|
||||
mimetypes = {'audio/aiff', 'audio/x-aiff'}
|
||||
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
|
||||
'Duration', 'Encoding', 'ExifToolVersion',
|
||||
'FileAccessDate', 'FileInodeChangeDate',
|
||||
'FileModifyDate', 'FileName', 'FilePermissions',
|
||||
'FileSize', 'FileType', 'FileTypeExtension',
|
||||
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
|
||||
'NumSampleFrames', 'SampleSize',
|
||||
}
|
||||
|
@@ -11,7 +11,8 @@ import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from typing import List, Optional
|
||||
import functools
|
||||
from typing import Optional, List
|
||||
|
||||
|
||||
__all__ = ['PIPE', 'run', 'CalledProcessError']
|
||||
@@ -21,16 +22,15 @@ CalledProcessError = subprocess.CalledProcessError
|
||||
# pylint: disable=subprocess-run-check
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=None)
|
||||
def _get_bwrap_path() -> str:
|
||||
bwrap_path = '/usr/bin/bwrap'
|
||||
if os.path.isfile(bwrap_path):
|
||||
if os.access(bwrap_path, os.X_OK):
|
||||
return bwrap_path
|
||||
which_path = shutil.which('bwrap')
|
||||
if which_path:
|
||||
return which_path
|
||||
|
||||
raise RuntimeError("Unable to find bwrap") # pragma: no cover
|
||||
|
||||
|
||||
# pylint: disable=bad-whitespace
|
||||
def _get_bwrap_args(tempdir: str,
|
||||
input_filename: str,
|
||||
output_filename: Optional[str] = None) -> List[str]:
|
||||
@@ -39,7 +39,7 @@ def _get_bwrap_args(tempdir: str,
|
||||
|
||||
# XXX: use --ro-bind-try once all supported platforms
|
||||
# have a bubblewrap recent enough to support it.
|
||||
ro_bind_dirs = ['/usr', '/lib', '/lib64', '/bin', '/sbin', cwd]
|
||||
ro_bind_dirs = ['/usr', '/lib', '/lib64', '/bin', '/sbin', '/etc/alternatives', cwd]
|
||||
for bind_dir in ro_bind_dirs:
|
||||
if os.path.isdir(bind_dir): # pragma: no cover
|
||||
ro_bind_args.extend(['--ro-bind', bind_dir, bind_dir])
|
||||
@@ -53,7 +53,6 @@ def _get_bwrap_args(tempdir: str,
|
||||
['--dev', '/dev',
|
||||
'--proc', '/proc',
|
||||
'--chdir', cwd,
|
||||
'--tmpfs', '/tmp',
|
||||
'--unshare-user-try',
|
||||
'--unshare-ipc',
|
||||
'--unshare-pid',
|
||||
@@ -79,7 +78,6 @@ def _get_bwrap_args(tempdir: str,
|
||||
return args
|
||||
|
||||
|
||||
# pylint: disable=bad-whitespace
|
||||
def run(args: List[str],
|
||||
input_filename: str,
|
||||
output_filename: Optional[str] = None,
|
||||
|
@@ -1,10 +1,13 @@
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
import zipfile
|
||||
import xml.etree.ElementTree as ET # type: ignore
|
||||
from typing import Any, Dict
|
||||
|
||||
from . import archive, office
|
||||
|
||||
|
||||
class EPUBParser(archive.ZipParser):
|
||||
mimetypes = {'application/epub+zip', }
|
||||
metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
|
||||
@@ -15,11 +18,27 @@ class EPUBParser(archive.ZipParser):
|
||||
'META-INF/container.xml',
|
||||
'mimetype',
|
||||
'OEBPS/content.opf',
|
||||
'content.opf',
|
||||
'hmh.opf',
|
||||
'OPS/.+.xml'
|
||||
}))
|
||||
self.files_to_omit = set(map(re.compile, { # type: ignore
|
||||
'iTunesMetadata.plist',
|
||||
'META-INF/calibre_bookmarks.txt',
|
||||
'OEBPS/package.opf',
|
||||
}))
|
||||
self.uniqid = uuid.uuid4()
|
||||
|
||||
def _specific_get_meta(self, full_path, file_path):
|
||||
if file_path != 'OEBPS/content.opf':
|
||||
def is_archive_valid(self):
|
||||
super().is_archive_valid()
|
||||
with zipfile.ZipFile(self.filename) as zin:
|
||||
for item in self._get_all_members(zin):
|
||||
member_name = self._get_member_name(item)
|
||||
if member_name.endswith('META-INF/encryption.xml'):
|
||||
raise ValueError('the file contains encrypted fonts')
|
||||
|
||||
def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
|
||||
if not file_path.endswith('.opf'):
|
||||
return {}
|
||||
|
||||
with open(full_path, encoding='utf-8') as f:
|
||||
@@ -30,14 +49,31 @@ class EPUBParser(archive.ZipParser):
|
||||
except (TypeError, UnicodeDecodeError):
|
||||
return {file_path: 'harmful content', }
|
||||
|
||||
def _specific_cleanup(self, full_path: str):
|
||||
if full_path.endswith('OEBPS/content.opf'):
|
||||
def _specific_cleanup(self, full_path: str) -> bool:
|
||||
if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
|
||||
return self.__handle_contentopf(full_path)
|
||||
elif full_path.endswith('OEBPS/toc.ncx'):
|
||||
return self.__handle_tocncx(full_path)
|
||||
elif re.search('/OPS/[^/]+.xml$', full_path):
|
||||
return self.__handle_ops_xml(full_path)
|
||||
return True
|
||||
|
||||
def __handle_tocncx(self, full_path: str):
|
||||
def __handle_ops_xml(self, full_path: str) -> bool:
|
||||
try:
|
||||
tree, namespace = office._parse_xml(full_path)
|
||||
except ET.ParseError: # pragma: nocover
|
||||
logging.error("Unable to parse %s in %s.", full_path, self.filename)
|
||||
return False
|
||||
|
||||
for item in tree.iterfind('.//', namespace): # pragma: nocover
|
||||
if item.tag.strip().lower().endswith('head'):
|
||||
item.clear()
|
||||
break
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8',
|
||||
short_empty_elements=False)
|
||||
return True
|
||||
|
||||
def __handle_tocncx(self, full_path: str) -> bool:
|
||||
try:
|
||||
tree, namespace = office._parse_xml(full_path)
|
||||
except ET.ParseError: # pragma: nocover
|
||||
@@ -53,7 +89,7 @@ class EPUBParser(archive.ZipParser):
|
||||
short_empty_elements=False)
|
||||
return True
|
||||
|
||||
def __handle_contentopf(self, full_path: str):
|
||||
def __handle_contentopf(self, full_path: str) -> bool:
|
||||
try:
|
||||
tree, namespace = office._parse_xml(full_path)
|
||||
except ET.ParseError:
|
||||
@@ -71,7 +107,7 @@ class EPUBParser(archive.ZipParser):
|
||||
item.append(uniqid)
|
||||
|
||||
# items without mandatory content
|
||||
for name in {'language', 'title'}:
|
||||
for name in ['language', 'title']:
|
||||
uniqid = ET.Element(self.metadata_namespace + name)
|
||||
item.append(uniqid)
|
||||
break # there is only a single <metadata> block
|
||||
|
@@ -2,31 +2,34 @@ import functools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from typing import Dict, Union, Set
|
||||
from typing import Union, Set, Dict
|
||||
|
||||
from . import abstract
|
||||
from . import bubblewrap
|
||||
|
||||
# Make pyflakes happy
|
||||
assert Set
|
||||
|
||||
|
||||
class ExiftoolParser(abstract.AbstractParser):
|
||||
""" Exiftool is often the easiest way to get all the metadata
|
||||
from a file, hence why several parsers are re-using its `get_meta`
|
||||
method.
|
||||
"""
|
||||
meta_allowlist = set() # type: Set[str]
|
||||
meta_allowlist: Set[str] = set()
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
if self.sandbox:
|
||||
out = bubblewrap.run([_get_exiftool_path(), '-json', self.filename],
|
||||
input_filename=self.filename,
|
||||
check=True, stdout=subprocess.PIPE).stdout
|
||||
else:
|
||||
out = subprocess.run([_get_exiftool_path(), '-json', self.filename],
|
||||
check=True, stdout=subprocess.PIPE).stdout
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
try:
|
||||
if self.sandbox:
|
||||
out = bubblewrap.run([_get_exiftool_path(), '-json',
|
||||
self.filename],
|
||||
input_filename=self.filename,
|
||||
check=True, stdout=subprocess.PIPE).stdout
|
||||
else:
|
||||
out = subprocess.run([_get_exiftool_path(), '-json',
|
||||
self.filename],
|
||||
check=True, stdout=subprocess.PIPE).stdout
|
||||
except subprocess.CalledProcessError: # pragma: no cover
|
||||
raise ValueError
|
||||
meta = json.loads(out.decode('utf-8'))[0]
|
||||
for key in self.meta_allowlist:
|
||||
meta.pop(key, None)
|
||||
@@ -64,16 +67,14 @@ class ExiftoolParser(abstract.AbstractParser):
|
||||
return False
|
||||
return True
|
||||
|
||||
@functools.lru_cache()
|
||||
@functools.lru_cache(maxsize=None)
|
||||
def _get_exiftool_path() -> str: # pragma: no cover
|
||||
possible_pathes = {
|
||||
'/usr/bin/exiftool', # debian/fedora
|
||||
'/usr/bin/vendor_perl/exiftool', # archlinux
|
||||
}
|
||||
which_path = shutil.which('exiftool')
|
||||
if which_path:
|
||||
return which_path
|
||||
|
||||
for possible_path in possible_pathes:
|
||||
if os.path.isfile(possible_path):
|
||||
if os.access(possible_path, os.X_OK):
|
||||
return possible_path
|
||||
# Exiftool on Arch Linux has a weird path
|
||||
if os.access('/usr/bin/vendor_perl/exiftool', os.X_OK):
|
||||
return '/usr/bin/vendor_perl/exiftool'
|
||||
|
||||
raise RuntimeError("Unable to find exiftool")
|
||||
|
@@ -1,13 +1,13 @@
|
||||
import shutil
|
||||
from typing import Dict, Union
|
||||
from typing import Union, Dict
|
||||
from . import abstract
|
||||
|
||||
|
||||
class HarmlessParser(abstract.AbstractParser):
|
||||
""" This is the parser for filetypes that can not contain metadata. """
|
||||
mimetypes = {'text/plain', 'image/x-ms-bmp'}
|
||||
mimetypes = {'text/plain', 'image/x-ms-bmp', 'image/bmp'}
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
return dict()
|
||||
|
||||
def remove_all(self) -> bool:
|
||||
|
@@ -1,7 +1,6 @@
|
||||
import imghdr
|
||||
import os
|
||||
import re
|
||||
from typing import Set, Dict, Union, Any
|
||||
from typing import Union, Any, Dict
|
||||
|
||||
import cairo
|
||||
|
||||
@@ -12,10 +11,6 @@ from gi.repository import GdkPixbuf, GLib, Rsvg
|
||||
|
||||
from . import exiftool, abstract
|
||||
|
||||
# Make pyflakes happy
|
||||
assert Set
|
||||
assert Any
|
||||
|
||||
class SVGParser(exiftool.ExiftoolParser):
|
||||
mimetypes = {'image/svg+xml', }
|
||||
meta_allowlist = {'Directory', 'ExifToolVersion', 'FileAccessDate',
|
||||
@@ -26,25 +21,40 @@ class SVGParser(exiftool.ExiftoolParser):
|
||||
}
|
||||
|
||||
def remove_all(self) -> bool:
|
||||
svg = Rsvg.Handle.new_from_file(self.filename)
|
||||
dimensions = svg.get_dimensions()
|
||||
surface = cairo.SVGSurface(self.output_filename,
|
||||
dimensions.height,
|
||||
dimensions.width)
|
||||
try:
|
||||
svg = Rsvg.Handle.new_from_file(self.filename)
|
||||
except GLib.GError:
|
||||
raise ValueError
|
||||
|
||||
try:
|
||||
_, _, _, _, has_viewbox, viewbox = svg.get_intrinsic_dimensions()
|
||||
if has_viewbox is False:
|
||||
raise ValueError
|
||||
_, width, height = svg.get_intrinsic_size_in_pixels()
|
||||
except AttributeError:
|
||||
dimensions = svg.get_dimensions()
|
||||
height, width = dimensions.height, dimensions.width
|
||||
|
||||
surface = cairo.SVGSurface(self.output_filename, height, width)
|
||||
context = cairo.Context(surface)
|
||||
svg.render_cairo(context)
|
||||
try:
|
||||
svg.render_document(context, viewbox)
|
||||
except AttributeError:
|
||||
svg.render_cairo(context)
|
||||
|
||||
surface.finish()
|
||||
return True
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
meta = super().get_meta()
|
||||
|
||||
# The namespace is mandatory, but only the …/2000/svg is valid.
|
||||
ns = 'http://www.w3.org/2000/svg'
|
||||
if meta.get('Xmlns', ns) == ns:
|
||||
if meta.get('Xmlns') == ns:
|
||||
meta.pop('Xmlns')
|
||||
return meta
|
||||
|
||||
|
||||
class PNGParser(exiftool.ExiftoolParser):
|
||||
mimetypes = {'image/png', }
|
||||
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
|
||||
@@ -58,12 +68,10 @@ class PNGParser(exiftool.ExiftoolParser):
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
|
||||
if imghdr.what(filename) != 'png':
|
||||
raise ValueError
|
||||
|
||||
try: # better fail here than later
|
||||
cairo.ImageSurface.create_from_png(self.filename)
|
||||
except MemoryError: # pragma: no cover
|
||||
except: # pragma: no cover
|
||||
# Cairo is returning some weird exceptions :/
|
||||
raise ValueError
|
||||
|
||||
def remove_all(self) -> bool:
|
||||
@@ -97,7 +105,6 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
|
||||
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
# we can't use imghdr here because of https://bugs.python.org/issue28591
|
||||
try:
|
||||
GdkPixbuf.Pixbuf.new_from_file(self.filename)
|
||||
except GLib.GError:
|
||||
@@ -109,9 +116,16 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
|
||||
|
||||
_, extension = os.path.splitext(self.filename)
|
||||
pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
|
||||
pixbuf = GdkPixbuf.Pixbuf.apply_embedded_orientation(pixbuf)
|
||||
if extension.lower() == '.jpg':
|
||||
extension = '.jpeg' # gdk is picky
|
||||
pixbuf.savev(self.output_filename, type=extension[1:], option_keys=[], option_values=[])
|
||||
elif extension.lower() == '.tif':
|
||||
extension = '.tiff' # gdk is picky
|
||||
try:
|
||||
pixbuf.savev(self.output_filename, type=extension[1:],
|
||||
option_keys=[], option_values=[])
|
||||
except GLib.GError: # pragma: no cover
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
@@ -125,7 +139,7 @@ class JPGParser(GdkPixbufAbstractParser):
|
||||
'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
|
||||
'ColorComponents', 'EncodingProcess', 'JFIFVersion',
|
||||
'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
|
||||
'YResolution', 'Megapixels', 'ImageHeight'}
|
||||
'YResolution', 'Megapixels', 'ImageHeight', 'Orientation'}
|
||||
|
||||
|
||||
class TiffParser(GdkPixbufAbstractParser):
|
||||
@@ -139,13 +153,14 @@ class TiffParser(GdkPixbufAbstractParser):
|
||||
'FileInodeChangeDate', 'FileModifyDate', 'FileName',
|
||||
'FilePermissions', 'FileSize', 'FileType',
|
||||
'FileTypeExtension', 'ImageHeight', 'ImageSize',
|
||||
'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'}
|
||||
'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile', 'Orientation'}
|
||||
|
||||
|
||||
class PPMParser(abstract.AbstractParser):
|
||||
mimetypes = {'image/x-portable-pixmap'}
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
meta = {} # type: Dict[str, Union[str, Dict[Any, Any]]]
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
meta: Dict[str, Union[str, Dict[Any, Any]]] = dict()
|
||||
with open(self.filename) as f:
|
||||
for idx, line in enumerate(f):
|
||||
if line.lstrip().startswith('#'):
|
||||
@@ -160,3 +175,36 @@ class PPMParser(abstract.AbstractParser):
|
||||
line = re.sub(r"\s+", "", line, flags=re.UNICODE)
|
||||
fout.write(line)
|
||||
return True
|
||||
|
||||
|
||||
class HEICParser(exiftool.ExiftoolParser):
|
||||
mimetypes = {'image/heic'}
|
||||
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
|
||||
'FileSize', 'FileModifyDate', 'FileAccessDate',
|
||||
'FileInodeChangeDate', 'FilePermissions', 'FileType',
|
||||
'FileTypeExtension', 'MIMEType', 'MajorBrand', 'MinorVersion',
|
||||
'CompatibleBrands','HandlerType', 'PrimaryItemReference',
|
||||
'HEVCConfigurationVersion', 'GeneralProfileSpace',
|
||||
'GeneralTierFlag', 'GeneralProfileIDC',
|
||||
'GenProfileCompatibilityFlags', 'ConstraintIndicatorFlags',
|
||||
'GeneralLevelIDC', 'MinSpatialSegmentationIDC',
|
||||
'ParallelismType','ChromaFormat', 'BitDepthLuma', 'BitDepthChroma',
|
||||
'NumTemporalLayers', 'TemporalIDNested', 'ImageWidth',
|
||||
'ImageHeight', 'ImageSpatialExtent', 'ImagePixelDepth',
|
||||
'AverageFrameRate', 'ConstantFrameRate', 'MediaDataSize',
|
||||
'MediaDataOffset','ImageSize', 'Megapixels'}
|
||||
|
||||
def remove_all(self) -> bool:
|
||||
return self._lightweight_cleanup()
|
||||
|
||||
class WEBPParser(GdkPixbufAbstractParser):
|
||||
mimetypes = {'image/webp'}
|
||||
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
|
||||
'Directory', 'FileSize', 'FileModifyDate',
|
||||
'FileAccessDate', "FileInodeChangeDate",
|
||||
'FilePermissions', 'FileType', 'FileTypeExtension',
|
||||
'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
|
||||
'ColorComponents', 'EncodingProcess', 'JFIFVersion',
|
||||
'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
|
||||
'YResolution', 'Megapixels', 'ImageHeight', 'Orientation',
|
||||
'HorizontalScale', 'VerticalScale', 'VP8Version'}
|
||||
|
@@ -1,8 +1,10 @@
|
||||
import random
|
||||
import uuid
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from typing import Dict, Set, Pattern, Tuple, Any
|
||||
from typing import Pattern, Any, Tuple, Dict
|
||||
|
||||
import xml.etree.ElementTree as ET # type: ignore
|
||||
|
||||
@@ -10,9 +12,6 @@ from .archive import ZipParser
|
||||
|
||||
# pylint: disable=line-too-long
|
||||
|
||||
# Make pyflakes happy
|
||||
assert Set
|
||||
assert Pattern
|
||||
|
||||
def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]:
|
||||
""" This function parses XML, with namespace support. """
|
||||
@@ -39,7 +38,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
|
||||
for c in tree.getroot():
|
||||
c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
|
||||
@@ -64,44 +63,99 @@ class MSOfficeParser(ZipParser):
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml (used for bullet point formatting)
|
||||
'application/vnd.openxmlformats-officedocument.theme+xml', # /word/theme/theme[0-9].xml (used for font and background coloring, etc.)
|
||||
'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml
|
||||
|
||||
# for more complicated powerpoints
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.notesMaster+xml',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.handoutMaster+xml',
|
||||
'application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml',
|
||||
'application/vnd.openxmlformats-officedocument.drawingml.diagramLayout+xml',
|
||||
'application/vnd.openxmlformats-officedocument.drawingml.diagramStyle+xml',
|
||||
'application/vnd.openxmlformats-officedocument.drawingml.diagramColors+xml',
|
||||
'application/vnd.ms-office.drawingml.diagramDrawing+xml',
|
||||
|
||||
# Do we want to keep the following ones?
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
|
||||
}
|
||||
|
||||
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
|
||||
# MSOffice documents are using various counters for cross-references,
|
||||
# we collect them all, to make sure that they're effectively counters,
|
||||
# and not unique id used for fingerprinting.
|
||||
self.__counters = {
|
||||
'cNvPr': set(),
|
||||
'rid': set(),
|
||||
}
|
||||
|
||||
self.files_to_keep = set(map(re.compile, { # type: ignore
|
||||
r'^\[Content_Types\]\.xml$',
|
||||
r'^_rels/\.rels$',
|
||||
r'^(?:word|ppt)/_rels/document\.xml\.rels$',
|
||||
r'^(?:word|ppt)/_rels/footer[0-9]*\.xml\.rels$',
|
||||
r'^(?:word|ppt)/_rels/header[0-9]*\.xml\.rels$',
|
||||
r'^xl/sharedStrings\.xml$', # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table
|
||||
r'^xl/calcChain\.xml$',
|
||||
r'^(?:word|ppt|xl)/_rels/(document|workbook|presentation)\.xml\.rels$',
|
||||
r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$',
|
||||
r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$',
|
||||
r'^(?:word|ppt|xl)/charts/_rels/chart[0-9]+\.xml\.rels$',
|
||||
r'^(?:word|ppt|xl)/charts/colors[0-9]+\.xml$',
|
||||
r'^(?:word|ppt|xl)/charts/style[0-9]+\.xml$',
|
||||
r'^(?:word|ppt|xl)/drawings/_rels/drawing[0-9]+\.xml\.rels$',
|
||||
r'^(?:word|ppt|xl)/styles\.xml$',
|
||||
# TODO: randomize axId ( https://docs.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/089f849f-fcd6-4fa0-a281-35aa6a432a16 )
|
||||
r'^(?:word|ppt|xl)/charts/chart[0-9]*\.xml$',
|
||||
r'^xl/workbook\.xml$',
|
||||
r'^xl/worksheets/sheet[0-9]+\.xml$',
|
||||
r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
|
||||
|
||||
r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
|
||||
r'^(?:word|ppt|xl)/tableStyles\.xml$',
|
||||
r'^(?:word|ppt|xl)/tables/table[0-9]+\.xml$',
|
||||
r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
|
||||
r'^ppt/slides/slide[0-9]*\.xml$',
|
||||
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
|
||||
r'^(?:word|ppt)/stylesWithEffects\.xml$',
|
||||
r'^(?:word|ppt|xl)/stylesWithEffects\.xml$',
|
||||
r'^ppt/presentation\.xml$',
|
||||
# TODO: check if p:bgRef can be randomized
|
||||
r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
|
||||
r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
|
||||
r'^xl/worksheets/_rels/sheet[0-9]+\.xml\.rels',
|
||||
r'^(?:word|ppt|xl)/drawings/vmlDrawing[0-9]+\.vml',
|
||||
r'^(?:word|ppt|xl)/drawings/drawing[0-9]+\.xml',
|
||||
r'^(?:word|ppt|xl)/embeddings/Microsoft_Excel_Worksheet[0-9]+\.xlsx',
|
||||
# rels for complicated powerpoints
|
||||
r'^ppt/notesSlides/_rels/notesSlide[0-9]+\.xml\.rels',
|
||||
r'^ppt/notesMasters/_rels/notesMaster[0-9]+\.xml\.rels',
|
||||
r'^ppt/handoutMasters/_rels/handoutMaster[0-9]+\.xml\.rels',
|
||||
}))
|
||||
self.files_to_omit = set(map(re.compile, { # type: ignore
|
||||
r'^\[trash\]/',
|
||||
r'^customXml/',
|
||||
r'webSettings\.xml$',
|
||||
r'^docProps/custom\.xml$',
|
||||
r'^(?:word|ppt)/printerSettings/',
|
||||
r'^(?:word|ppt)/theme',
|
||||
r'^(?:word|ppt)/people\.xml$',
|
||||
r'^(?:word|ppt)/numbering\.xml$',
|
||||
r'^(?:word|ppt|xl)/printerSettings/',
|
||||
r'^(?:word|ppt|xl)/theme',
|
||||
r'^(?:word|ppt|xl)/people\.xml$',
|
||||
r'^(?:word|ppt|xl)/persons/person\.xml$',
|
||||
r'^(?:word|ppt|xl)/numbering\.xml$',
|
||||
r'^(?:word|ppt|xl)/tags/',
|
||||
r'^(?:word|ppt|xl)/glossary/',
|
||||
# View properties like view mode, last viewed slide etc
|
||||
r'^(?:word|ppt)/viewProps\.xml$',
|
||||
r'^(?:word|ppt|xl)/viewProps\.xml$',
|
||||
# Additional presentation-wide properties like printing properties,
|
||||
# presentation show properties etc.
|
||||
r'^(?:word|ppt)/presProps\.xml$',
|
||||
|
||||
r'^(?:word|ppt|xl)/presProps\.xml$',
|
||||
r'^(?:word|ppt|xl)/comments[0-9]*\.xml$',
|
||||
r'^(?:word|ppt|xl)/threadedComments/threadedComment[0-9]*\.xml$',
|
||||
r'^(?:word|ppt|xl)/commentsExtended\.xml$',
|
||||
r'^(?:word|ppt|xl)/commentsExtensible\.xml$',
|
||||
r'^(?:word|ppt|xl)/commentsIds\.xml$',
|
||||
# we have an allowlist in self.files_to_keep,
|
||||
# so we can trash everything else
|
||||
r'^(?:word|ppt)/_rels/',
|
||||
r'^(?:word|ppt|xl)/_rels/',
|
||||
r'docMetadata/LabelInfo\.xml$'
|
||||
}))
|
||||
|
||||
if self.__fill_files_to_keep_via_content_types() is False:
|
||||
@@ -118,13 +172,13 @@ class MSOfficeParser(ZipParser):
|
||||
return False
|
||||
xml_data = zin.read('[Content_Types].xml')
|
||||
|
||||
self.content_types = dict() # type: Dict[str, str]
|
||||
self.content_types: Dict[str, str] = dict()
|
||||
try:
|
||||
tree = ET.fromstring(xml_data)
|
||||
except ET.ParseError:
|
||||
return False
|
||||
for c in tree:
|
||||
if 'PartName' not in c.attrib or 'ContentType' not in c.attrib:
|
||||
if 'PartName' not in c.attrib or 'ContentType' not in c.attrib: # pragma: no cover
|
||||
continue
|
||||
elif c.attrib['ContentType'] in self.content_types_to_keep:
|
||||
fname = c.attrib['PartName'][1:] # remove leading `/`
|
||||
@@ -144,12 +198,12 @@ class MSOfficeParser(ZipParser):
|
||||
"""
|
||||
try:
|
||||
tree, namespace = _parse_xml(full_path)
|
||||
except ET.ParseError as e:
|
||||
except ET.ParseError as e: # pragma: no cover
|
||||
logging.error("Unable to parse %s: %s", full_path, e)
|
||||
return False
|
||||
|
||||
# rsid, tags or attributes, are always under the `w` namespace
|
||||
if 'w' not in namespace.keys():
|
||||
if 'w' not in namespace:
|
||||
return True
|
||||
|
||||
parent_map = {c:p for p in tree.iter() for c in p}
|
||||
@@ -166,7 +220,7 @@ class MSOfficeParser(ZipParser):
|
||||
for element in elements_to_remove:
|
||||
parent_map[element].remove(element)
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
@@ -185,10 +239,10 @@ class MSOfficeParser(ZipParser):
|
||||
return False
|
||||
|
||||
# The nsid tag is always under the `w` namespace
|
||||
if 'w' not in namespace.keys():
|
||||
if 'w' not in namespace:
|
||||
return True
|
||||
|
||||
parent_map = {c:p for p in tree.iter() for c in p}
|
||||
parent_map = {c: p for p in tree.iter() for c in p}
|
||||
|
||||
elements_to_remove = list()
|
||||
for element in tree.iterfind('.//w:nsid', namespace):
|
||||
@@ -196,15 +250,14 @@ class MSOfficeParser(ZipParser):
|
||||
for element in elements_to_remove:
|
||||
parent_map[element].remove(element)
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
|
||||
@staticmethod
|
||||
def __remove_revisions(full_path: str) -> bool:
|
||||
try:
|
||||
tree, namespace = _parse_xml(full_path)
|
||||
except ET.ParseError as e:
|
||||
except ET.ParseError as e: # pragma: no cover
|
||||
logging.error("Unable to parse %s: %s", full_path, e)
|
||||
return False
|
||||
|
||||
@@ -230,11 +283,82 @@ class MSOfficeParser(ZipParser):
|
||||
for children in element.iterfind('./*'):
|
||||
elements_ins.append((element, position, children))
|
||||
break
|
||||
|
||||
for (element, position, children) in elements_ins:
|
||||
parent_map[element].insert(position, children)
|
||||
|
||||
# the list can sometimes contain duplicate elements, so don't remove
|
||||
# until all children have been processed
|
||||
for (element, position, children) in elements_ins:
|
||||
if element in parent_map[element]:
|
||||
parent_map[element].remove(element)
|
||||
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def __remove_document_comment_meta(full_path: str) -> bool:
    """ Remove the comment anchors from a document.

    Every `w:commentRangeStart`, `w:commentRangeEnd` and
    `w:commentReference` element is removed from the XML file at
    `full_path`, which is then rewritten in place. The file is left
    untouched if it doesn't contain any of them.

    :param full_path: path of the XML file to process.
    :return: False if the file can't be parsed, True otherwise.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    # The comment tags are always under the `w` namespace. Looking them up
    # with an unknown prefix makes ElementTree raise, so bail out early,
    # like the rsid/nsid removal functions do.
    if 'w' not in namespace:
        return True

    comment_tags = (
        'w:commentRangeStart',
        'w:commentRangeEnd',
        'w:commentReference',
    )

    # Collect everything first: removing elements while iterating over
    # the tree would make the iteration skip some of them.
    elements_del = [element
                    for tag in comment_tags
                    for element in tree.iterfind('.//' + tag, namespace)]
    if not elements_del:
        return True  # No comment meta tags are present

    # ElementTree has no parent pointers, so build a child -> parent map.
    parent_map = {c: p for p in tree.iter() for c in p}
    for element in elements_del:
        parent_map[element].remove(element)

    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
|
||||
|
||||
def __remove_document_xml_rels_members(self, full_path: str) -> bool:
    """ Remove the dangling references from the word/_rels/document.xml.rels file, since MS office doesn't like them.

    A relationship is dropped when its target is an archive member that
    matches `self.files_to_omit` without matching `self.files_to_keep`.

    :param full_path: path of the extracted `document.xml.rels` file.
    :return: False if the file can't be parsed or has no default
        namespace, True otherwise.
    """
    # Zip member names always use `/`, whatever the platform is.
    import posixpath

    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    if len(namespace.items()) != 1:  # pragma: no cover
        logging.debug("Got several namespaces for Relationships: %s", namespace.items())

    default_namespace = namespace.get('')
    if default_namespace is None:  # pragma: no cover
        logging.error("Unable to find the default namespace of %s", full_path)
        return False

    # A member is removed if it's in the omit list, unless the allowlist
    # says otherwise.
    with zipfile.ZipFile(self.filename) as zin:
        removed_fnames = {
            item.filename for item in zin.infolist()
            if any(r.search(item.filename) for r in self.files_to_omit)
            and not any(r.search(item.filename) for r in self.files_to_keep)
        }

    root = tree.getroot()
    for item in root.findall('{%s}Relationship' % default_namespace):
        target = item.attrib.get('Target')
        if target is None or item.attrib.get('TargetMode') == 'External':
            # Nothing inside of the archive is referenced.
            continue
        if target.startswith('/'):
            # Absolute targets are relative to the root of the package.
            name = target[1:]
        else:
            # add the word/ prefix to the path, since all document rels are in the word/ directory
            name = posixpath.join('word', target)
        # Resolve things like `word/../customXml/item1.xml`.
        name = posixpath.normpath(name)
        if name in removed_fnames:
            root.remove(item)

    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
|
||||
|
||||
def __remove_content_type_members(self, full_path: str) -> bool:
|
||||
@@ -247,8 +371,8 @@ class MSOfficeParser(ZipParser):
|
||||
logging.error("Unable to parse %s: %s", full_path, e)
|
||||
return False
|
||||
|
||||
if len(namespace.items()) != 1:
|
||||
return False # there should be only one namespace for Types
|
||||
if len(namespace.items()) != 1: # pragma: no cover
|
||||
logging.debug("Got several namespaces for Types: %s", namespace.items())
|
||||
|
||||
removed_fnames = set()
|
||||
with zipfile.ZipFile(self.filename) as zin:
|
||||
@@ -267,25 +391,88 @@ class MSOfficeParser(ZipParser):
|
||||
if name in removed_fnames:
|
||||
root.remove(item)
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
def _final_checks(self) -> bool:
    """ Check that the collected counters look like real counters.

    A set of counter values is considered consistent when its size is
    equal to its biggest member. Every inconsistent set is reported with
    a warning, not only the first one.

    :return: always True, since inconsistencies are only warned about.
    """
    for k, v in self.__counters.items():
        if v and len(v) != max(v):
            # TODO: make this an error and return False
            # once the ability to correct the counters is implemented
            logging.warning("%s contains invalid %s: %s", self.filename, k, v)
    return True
|
||||
|
||||
def __collect_counters(self, full_path: str):
    """ Add the numeric ids found in the file at `full_path` to the counters.

    :param full_path: path of a UTF-8 encoded file to scan.
    """
    with open(full_path, encoding='utf-8') as handle:
        xml_text = handle.read()

    patterns = (
        # "relationship Id"
        ('rid', r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)'),
        # "connector for Non-visual property"
        ('cNvPr', r'<p:cNvPr id="([0-9]+)"'),
    )
    for counter_name, pattern in patterns:
        found = re.findall(pattern, xml_text)
        self.__counters[counter_name].update(map(int, found))
|
||||
|
||||
@staticmethod
|
||||
def __randomize_creationId(full_path: str) -> bool:
    """ Replace the value of every `p14:creationId` element by a random one.

    The file at `full_path` is rewritten in place.

    :param full_path: path of the XML file to process.
    :return: False if the file can't be parsed, True otherwise.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    if 'p14' not in namespace:
        return True  # pragma: no cover

    for item in tree.iterfind('.//p14:creationId', namespace):
        # `random.randint` includes both of its bounds, so the upper one
        # has to be 2**32 - 1 for the value to fit in 32 unsigned bits.
        item.set('val', '%s' % random.randint(0, 2**32 - 1))
    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
|
||||
|
||||
@staticmethod
|
||||
def __randomize_sldMasterId(full_path: str) -> bool:
    """ Replace the `id` of every `p:sldMasterId` element by a random one.

    The file at `full_path` is rewritten in place.

    :param full_path: path of the XML file to process.
    :return: False if the file can't be parsed, True otherwise.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    if 'p' not in namespace:
        return True  # pragma: no cover

    # Slide master ids are unsigned 32-bit values with a minimum of 2**31
    # (ST_SlideMasterId in ECMA-376), and `random.randint` includes both
    # of its bounds.
    # NOTE(review): ids are only kept unique within this file; whether
    # they can clash with ids stored in other parts isn't checked here.
    used_ids = set()
    for item in tree.iterfind('.//p:sldMasterId', namespace):
        new_id = random.randint(2**31, 2**32 - 1)
        while new_id in used_ids:
            new_id = random.randint(2**31, 2**32 - 1)
        used_ids.add(new_id)
        item.set('id', '%s' % new_id)
    tree.write(full_path, xml_declaration=True, encoding='utf-8')
    return True
|
||||
|
||||
def _specific_cleanup(self, full_path: str) -> bool:
|
||||
# pylint: disable=too-many-return-statements
|
||||
# pylint: disable=too-many-return-statements,too-many-branches
|
||||
if os.stat(full_path).st_size == 0: # Don't process empty files
|
||||
return True
|
||||
|
||||
if not full_path.endswith('.xml'):
|
||||
if not full_path.endswith(('.xml', '.xml.rels')):
|
||||
return True
|
||||
|
||||
if self.__randomize_creationId(full_path) is False:
|
||||
return False
|
||||
|
||||
self.__collect_counters(full_path)
|
||||
|
||||
if full_path.endswith('/[Content_Types].xml'):
|
||||
# this file contains references to files that we might
|
||||
# remove, and MS Office doesn't like dangling references
|
||||
if self.__remove_content_type_members(full_path) is False:
|
||||
if self.__remove_content_type_members(full_path) is False: # pragma: no cover
|
||||
return False
|
||||
elif full_path.endswith('/word/document.xml'):
|
||||
# this file contains the revisions
|
||||
if self.__remove_revisions(full_path) is False:
|
||||
return False # pragma: no cover
|
||||
# remove comment references and ranges
|
||||
if self.__remove_document_comment_meta(full_path) is False:
|
||||
return False # pragma: no cover
|
||||
elif full_path.endswith('/word/_rels/document.xml.rels'):
|
||||
# similar to the above, but for the document.xml.rels file
|
||||
if self.__remove_document_xml_rels_members(full_path) is False: # pragma: no cover
|
||||
return False
|
||||
elif full_path.endswith('/docProps/app.xml'):
|
||||
# This file must be present and valid,
|
||||
@@ -301,9 +488,19 @@ class MSOfficeParser(ZipParser):
|
||||
f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
|
||||
f.write(b'<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties">')
|
||||
f.write(b'</cp:coreProperties>')
|
||||
elif full_path.endswith('/ppt/tableStyles.xml'): # pragma: no cover
|
||||
# This file must be present and valid,
|
||||
# so we're removing as much as we can.
|
||||
with open(full_path, 'wb') as f:
|
||||
f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
|
||||
uid = str(uuid.uuid4()).encode('utf-8')
|
||||
f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)
|
||||
elif full_path.endswith('ppt/presentation.xml'):
|
||||
if self.__randomize_sldMasterId(full_path) is False:
|
||||
return False # pragma: no cover
|
||||
|
||||
if self.__remove_rsid(full_path) is False:
|
||||
return False
|
||||
return False # pragma: no cover
|
||||
|
||||
if self.__remove_nsid(full_path) is False:
|
||||
return False # pragma: no cover
|
||||
@@ -328,7 +525,7 @@ class MSOfficeParser(ZipParser):
|
||||
# see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute
|
||||
with open(full_path, 'rb') as f:
|
||||
text = f.read()
|
||||
out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, 1)
|
||||
out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, count=1)
|
||||
with open(full_path, 'wb') as f:
|
||||
f.write(out)
|
||||
|
||||
@@ -344,8 +541,8 @@ class MSOfficeParser(ZipParser):
|
||||
|
||||
with open(full_path, encoding='utf-8') as f:
|
||||
try:
|
||||
results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
|
||||
return {k:v for (k, v) in results}
|
||||
results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I | re.M)
|
||||
return {k: v for (k, v) in results}
|
||||
except (TypeError, UnicodeDecodeError):
|
||||
# We didn't manage to parse the xml file
|
||||
return {file_path: 'harmful content', }
|
||||
@@ -362,7 +559,6 @@ class LibreOfficeParser(ZipParser):
|
||||
'application/vnd.oasis.opendocument.image',
|
||||
}
|
||||
|
||||
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
|
||||
@@ -389,14 +585,14 @@ class LibreOfficeParser(ZipParser):
|
||||
logging.error("Unable to parse %s: %s", full_path, e)
|
||||
return False
|
||||
|
||||
if 'office' not in namespace.keys(): # no revisions in the current file
|
||||
if 'office' not in namespace: # no revisions in the current file
|
||||
return True
|
||||
|
||||
for text in tree.getroot().iterfind('.//office:text', namespace):
|
||||
for changes in text.iterfind('.//text:tracked-changes', namespace):
|
||||
text.remove(changes)
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
def _specific_cleanup(self, full_path: str) -> bool:
|
||||
|
@@ -1,9 +1,8 @@
|
||||
import logging
|
||||
import glob
|
||||
import os
|
||||
import mimetypes
|
||||
import importlib
|
||||
from typing import TypeVar, List, Tuple, Optional
|
||||
from typing import TypeVar, Optional, List, Tuple
|
||||
|
||||
from . import abstract, UNSUPPORTED_EXTENSIONS
|
||||
|
||||
@@ -12,6 +11,10 @@ T = TypeVar('T', bound='abstract.AbstractParser')
|
||||
mimetypes.add_type('application/epub+zip', '.epub')
|
||||
mimetypes.add_type('application/x-dtbncx+xml', '.ncx') # EPUB Navigation Control XML File
|
||||
|
||||
# This should be removed after we move to python3.10
|
||||
# https://github.com/python/cpython/commit/20a5b7e986377bdfd929d7e8c4e3db5847dfdb2d
|
||||
mimetypes.add_type('image/heic', '.heic')
|
||||
|
||||
|
||||
def __load_all_parsers():
|
||||
""" Loads every parser in a dynamic way """
|
||||
@@ -40,7 +43,10 @@ def _get_parsers() -> List[T]:
|
||||
|
||||
|
||||
def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
|
||||
""" Return the appropriate parser for a given filename. """
|
||||
""" Return the appropriate parser for a given filename.
|
||||
|
||||
:raises ValueError: Raised if the instantiation of the parser went wrong.
|
||||
"""
|
||||
mtype, _ = mimetypes.guess_type(filename)
|
||||
|
||||
_, extension = os.path.splitext(filename)
|
||||
@@ -53,10 +59,6 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
|
||||
|
||||
for parser_class in _get_parsers(): # type: ignore
|
||||
if mtype in parser_class.mimetypes:
|
||||
try:
|
||||
return parser_class(filename), mtype
|
||||
except ValueError as e:
|
||||
logging.info("Got an exception when trying to instantiate "
|
||||
"%s for %s: %s", parser_class, filename, e)
|
||||
return None, mtype
|
||||
# This instantiation might raise a ValueError on malformed files
|
||||
return parser_class(filename), mtype
|
||||
return None, mtype
|
||||
|
@@ -7,8 +7,7 @@ import re
|
||||
import logging
|
||||
import tempfile
|
||||
import io
|
||||
from typing import Dict, Union
|
||||
from distutils.version import LooseVersion
|
||||
from typing import Union, Dict
|
||||
|
||||
import cairo
|
||||
import gi
|
||||
@@ -17,10 +16,7 @@ from gi.repository import Poppler, GLib
|
||||
|
||||
from . import abstract
|
||||
|
||||
poppler_version = Poppler.get_version()
|
||||
if LooseVersion(poppler_version) < LooseVersion('0.46'): # pragma: no cover
|
||||
raise ValueError("mat2 needs at least Poppler version 0.46 to work. \
|
||||
The installed version is %s." % poppler_version) # pragma: no cover
|
||||
FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5
|
||||
|
||||
|
||||
class PDFParser(abstract.AbstractParser):
|
||||
@@ -32,7 +28,7 @@ class PDFParser(abstract.AbstractParser):
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
self.uri = 'file://' + os.path.abspath(self.filename)
|
||||
self.__scale = 2 # how much precision do we want for the render
|
||||
self.__scale = 200 / 72.0 # how much precision do we want for the render
|
||||
try: # Check now that the file is valid, to avoid surprises later
|
||||
Poppler.Document.new_from_file(self.uri, None)
|
||||
except GLib.GError: # Invalid PDF
|
||||
@@ -40,7 +36,10 @@ class PDFParser(abstract.AbstractParser):
|
||||
|
||||
def remove_all(self) -> bool:
    """ Clean the file, using the lightweight mode if it was requested.

    :return: the result of the cleaning method that was used.
    :raises RuntimeError: if the lightweight cleaning fails with a
        `cairo.Error` or a `MemoryError`.
    """
    if self.lightweight_cleaning is True:
        try:
            return self.__remove_all_lightweight()
        except (cairo.Error, MemoryError) as e:
            # Chain explicitly, to keep the original error as the cause.
            raise RuntimeError(e) from e
    return self.__remove_all_thorough()
|
||||
|
||||
def __remove_all_lightweight(self) -> bool:
|
||||
@@ -52,6 +51,7 @@ class PDFParser(abstract.AbstractParser):
|
||||
|
||||
tmp_path = tempfile.mkstemp()[1]
|
||||
pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway
|
||||
pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
|
||||
pdf_context = cairo.Context(pdf_surface) # context draws on the surface
|
||||
|
||||
for pagenum in range(pages_count):
|
||||
@@ -80,15 +80,19 @@ class PDFParser(abstract.AbstractParser):
|
||||
|
||||
_, tmp_path = tempfile.mkstemp()
|
||||
pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway
|
||||
pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
|
||||
pdf_context = cairo.Context(pdf_surface)
|
||||
|
||||
for pagenum in range(pages_count):
|
||||
page = document.get_page(pagenum)
|
||||
if page is None: # pragma: no cover
|
||||
logging.error("Unable to get PDF pages")
|
||||
return False
|
||||
page_width, page_height = page.get_size()
|
||||
logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
|
||||
|
||||
width = int(page_width) * self.__scale
|
||||
height = int(page_height) * self.__scale
|
||||
width = int(page_width * self.__scale)
|
||||
height = int(page_height * self.__scale)
|
||||
img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
|
||||
img_context = cairo.Context(img_surface)
|
||||
|
||||
@@ -102,7 +106,11 @@ class PDFParser(abstract.AbstractParser):
|
||||
buf.seek(0)
|
||||
|
||||
img = cairo.ImageSurface.create_from_png(buf)
|
||||
pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
|
||||
if cairo.version_info < (1, 12, 0):
|
||||
pdf_surface.set_size(width, height)
|
||||
else:
|
||||
pdf_surface.set_size(page_width, page_height)
|
||||
pdf_surface.set_device_scale(1 / self.__scale, 1 / self.__scale)
|
||||
pdf_context.set_source_surface(img, 0, 0)
|
||||
pdf_context.paint()
|
||||
pdf_context.show_page() # draw pdf_context on pdf_surface
|
||||
@@ -122,6 +130,17 @@ class PDFParser(abstract.AbstractParser):
|
||||
document.set_creator('')
|
||||
document.set_creation_date(-1)
|
||||
document.save('file://' + os.path.abspath(out_file))
|
||||
|
||||
# Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
|
||||
# fails to remove them, we have to use this terrible regex.
|
||||
# It should(tm) be alright though, because cairo's output format
|
||||
# for metadata is fixed.
|
||||
with open(out_file, 'rb') as f:
|
||||
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(),
|
||||
count=0, flags=re.DOTALL | re.IGNORECASE)
|
||||
with open(out_file, 'wb') as f:
|
||||
f.write(out)
|
||||
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
@@ -131,7 +150,7 @@ class PDFParser(abstract.AbstractParser):
|
||||
metadata[key] = value
|
||||
return metadata
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
""" Return a dict with all the meta of the file
|
||||
"""
|
||||
metadata = {}
|
||||
|
@@ -1,5 +1,5 @@
|
||||
import logging
|
||||
from typing import Union, Tuple, Dict
|
||||
from typing import Union, Dict, List, Tuple
|
||||
|
||||
from . import abstract
|
||||
|
||||
@@ -15,7 +15,7 @@ class TorrentParser(abstract.AbstractParser):
|
||||
if self.dict_repr is None:
|
||||
raise ValueError
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
metadata = {}
|
||||
for key, value in self.dict_repr.items():
|
||||
if key not in self.allowlist:
|
||||
@@ -76,7 +76,7 @@ class _BencodeHandler:
|
||||
s = s[1:]
|
||||
return s[colon:colon+str_len], s[colon+str_len:]
|
||||
|
||||
def __decode_list(self, s: bytes) -> Tuple[list, bytes]:
|
||||
def __decode_list(self, s: bytes) -> Tuple[List, bytes]:
|
||||
ret = list()
|
||||
s = s[1:] # skip leading `l`
|
||||
while s[0] != ord('e'):
|
||||
@@ -84,7 +84,7 @@ class _BencodeHandler:
|
||||
ret.append(value)
|
||||
return ret, s[1:]
|
||||
|
||||
def __decode_dict(self, s: bytes) -> Tuple[dict, bytes]:
|
||||
def __decode_dict(self, s: bytes) -> Tuple[Dict, bytes]:
|
||||
ret = dict()
|
||||
s = s[1:] # skip leading `d`
|
||||
while s[0] != ord(b'e'):
|
||||
@@ -113,10 +113,10 @@ class _BencodeHandler:
|
||||
ret += self.__encode_func[type(value)](value)
|
||||
return b'd' + ret + b'e'
|
||||
|
||||
def bencode(self, s: Union[dict, list, bytes, int]) -> bytes:
|
||||
def bencode(self, s: Union[Dict, List, bytes, int]) -> bytes:
|
||||
return self.__encode_func[type(s)](s)
|
||||
|
||||
def bdecode(self, s: bytes) -> Union[dict, None]:
|
||||
def bdecode(self, s: bytes) -> Union[Dict, None]:
|
||||
try:
|
||||
ret, trail = self.__decode_func[s[0]](s)
|
||||
except (IndexError, KeyError, ValueError) as e:
|
||||
|
@@ -1,9 +1,9 @@
|
||||
import subprocess
|
||||
import functools
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
|
||||
from typing import Dict, Union
|
||||
from typing import Union, Dict
|
||||
|
||||
from . import exiftool
|
||||
from . import bubblewrap
|
||||
@@ -12,7 +12,7 @@ from . import bubblewrap
|
||||
class AbstractFFmpegParser(exiftool.ExiftoolParser):
|
||||
""" Abstract parser for all FFmpeg-based ones, mainly for video. """
|
||||
# Some fileformats have mandatory metadata fields
|
||||
meta_key_value_allowlist = {} # type: Dict[str, Union[str, int]]
|
||||
meta_key_value_allowlist: Dict[str, Union[str, int]] = dict()
|
||||
|
||||
def remove_all(self) -> bool:
|
||||
if self.meta_key_value_allowlist:
|
||||
@@ -45,12 +45,12 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):
|
||||
return False
|
||||
return True
|
||||
|
||||
def get_meta(self) -> Dict[str, Union[str, dict]]:
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
meta = super().get_meta()
|
||||
|
||||
ret = dict() # type: Dict[str, Union[str, dict]]
|
||||
ret: Dict[str, Union[str, Dict]] = dict()
|
||||
for key, value in meta.items():
|
||||
if key in self.meta_key_value_allowlist.keys():
|
||||
if key in self.meta_key_value_allowlist:
|
||||
if value == self.meta_key_value_allowlist[key]:
|
||||
continue
|
||||
ret[key] = value
|
||||
@@ -91,11 +91,11 @@ class AVIParser(AbstractFFmpegParser):
|
||||
'VideoFrameRate', 'VideoFrameCount', 'Quality',
|
||||
'SampleSize', 'BMPVersion', 'ImageWidth', 'ImageHeight',
|
||||
'Planes', 'BitDepth', 'Compression', 'ImageLength',
|
||||
'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors',
|
||||
'NumImportantColors', 'NumColors', 'NumImportantColors',
|
||||
'PixelsPerMeterX', 'PixelsPerMeterY',
|
||||
'NumImportantColors', 'NumColors',
|
||||
'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask',
|
||||
'ColorSpace', 'AudioCodec', 'AudioCodecRate',
|
||||
'AudioSampleCount', 'AudioSampleCount',
|
||||
'AudioSampleCount',
|
||||
'AudioSampleRate', 'Encoding', 'NumChannels',
|
||||
'SampleRate', 'AvgBytesPerSec', 'BitsPerSample',
|
||||
'Duration', 'ImageSize', 'Megapixels'}
|
||||
@@ -135,11 +135,10 @@ class MP4Parser(AbstractFFmpegParser):
|
||||
}
|
||||
|
||||
|
||||
@functools.lru_cache()
|
||||
@functools.lru_cache(maxsize=None)
|
||||
def _get_ffmpeg_path() -> str:  # pragma: no cover
    """ Return the path of the `ffmpeg` executable, looked up via the PATH.

    :raises RuntimeError: if no `ffmpeg` executable is found.
    """
    located = shutil.which('ffmpeg')
    if not located:
        raise RuntimeError("Unable to find ffmpeg")
    return located
|
||||
|
@@ -1,11 +1,10 @@
|
||||
from html import parser, escape
|
||||
from typing import Dict, Any, List, Tuple, Set, Optional
|
||||
from typing import Any, Optional, Dict, List, Tuple, Set
|
||||
import re
|
||||
import string
|
||||
|
||||
from . import abstract
|
||||
|
||||
assert Set
|
||||
|
||||
# pylint: disable=too-many-instance-attributes
|
||||
|
||||
@@ -17,7 +16,11 @@ class CSSParser(abstract.AbstractParser):
|
||||
|
||||
def remove_all(self) -> bool:
    """ Write a copy of the stylesheet without its `/* ... */` comments.

    The input is read from `self.filename`, and the result is written
    to `self.output_filename`.

    :return: True once the cleaned file is written.
    :raises ValueError: if the input file isn't valid UTF-8.
    """
    with open(self.filename, encoding='utf-8') as f:
        try:
            content = f.read()
        except UnicodeDecodeError as e:  # pragma: no cover
            # Keep the decoding error as the cause, to ease debugging.
            raise ValueError("%s isn't valid UTF-8" % self.filename) from e
    cleaned = re.sub(r'/\*.*?\*/', '', content, count=0, flags=self.flags)
    with open(self.output_filename, 'w', encoding='utf-8') as f:
        f.write(cleaned)
    return True
|
||||
@@ -25,7 +28,11 @@ class CSSParser(abstract.AbstractParser):
|
||||
def get_meta(self) -> Dict[str, Any]:
|
||||
metadata = {}
|
||||
with open(self.filename, encoding='utf-8') as f:
|
||||
cssdoc = re.findall(r'/\*(.*?)\*/', f.read(), self.flags)
|
||||
try:
|
||||
content = f.read()
|
||||
except UnicodeDecodeError: # pragma: no cover
|
||||
raise ValueError
|
||||
cssdoc = re.findall(r'/\*(.*?)\*/', content, self.flags)
|
||||
for match in cssdoc:
|
||||
for line in match.splitlines():
|
||||
try:
|
||||
@@ -37,10 +44,10 @@ class CSSParser(abstract.AbstractParser):
|
||||
|
||||
|
||||
class AbstractHTMLParser(abstract.AbstractParser):
|
||||
tags_blocklist = set() # type: Set[str]
|
||||
tags_blocklist: Set[str] = set()
|
||||
# In some html/xml-based formats some tags are mandatory,
|
||||
# so we're keeping them, but are discarding their content
|
||||
tags_required_blocklist = set() # type: Set[str]
|
||||
tags_required_blocklist: Set[str] = set()
|
||||
|
||||
def __init__(self, filename):
|
||||
super().__init__(filename)
|
||||
@@ -84,7 +91,7 @@ class _HTMLParser(parser.HTMLParser):
|
||||
self.filename = filename
|
||||
self.__textrepr = ''
|
||||
self.__meta = {}
|
||||
self.__validation_queue = [] # type: List[str]
|
||||
self.__validation_queue: List[str] = list()
|
||||
|
||||
# We're using counters instead of booleans, to handle nested tags
|
||||
self.__in_dangerous_but_required_tag = 0
|
||||
@@ -96,6 +103,15 @@ class _HTMLParser(parser.HTMLParser):
|
||||
self.tag_required_blocklist = required_blocklisted_tags
|
||||
self.tag_blocklist = blocklisted_tags
|
||||
|
||||
def error(self, message): # pragma: no cover
|
||||
""" Amusingly, Python's documentation doesn't mention that this
|
||||
function needs to be implemented in subclasses of the parent class
|
||||
of parser.HTMLParser. This was found by fuzzing,
|
||||
triggering the following exception:
|
||||
NotImplementedError: subclasses of ParserBase must override error()
|
||||
"""
|
||||
raise ValueError(message)
|
||||
|
||||
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
|
||||
# Ignore the type, because mypy is too stupid to infer
|
||||
# that get_starttag_text() can't return None.
|
||||
@@ -142,7 +158,8 @@ class _HTMLParser(parser.HTMLParser):
|
||||
if data.strip():
|
||||
self.__textrepr += escape(data)
|
||||
|
||||
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
|
||||
def handle_startendtag(self, tag: str,
|
||||
attrs: List[Tuple[str, Optional[str]]]):
|
||||
if tag in self.tag_required_blocklist | self.tag_blocklist:
|
||||
meta = {k:v for k, v in attrs}
|
||||
name = meta.get('name', 'harmful metadata')
|
||||
|
83
mat2
83
mat2
@@ -2,7 +2,7 @@
|
||||
|
||||
import os
|
||||
import shutil
|
||||
from typing import Tuple, List, Union, Set
|
||||
from typing import List, Set, Dict
|
||||
import sys
|
||||
import mimetypes
|
||||
import argparse
|
||||
@@ -13,34 +13,35 @@ import concurrent.futures
|
||||
try:
|
||||
from libmat2 import parser_factory, UNSUPPORTED_EXTENSIONS
|
||||
from libmat2 import check_dependencies, UnknownMemberPolicy
|
||||
except ValueError as e:
|
||||
print(e)
|
||||
except ValueError as ex:
|
||||
print(ex)
|
||||
sys.exit(1)
|
||||
|
||||
__version__ = '0.10.0'
|
||||
|
||||
# Make pyflakes happy
|
||||
assert Set
|
||||
assert Tuple
|
||||
assert Union
|
||||
__version__ = '0.13.5'
|
||||
|
||||
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)
|
||||
|
||||
def __print_without_chars(s: str):
    """ Print `s` with every character of a Unicode 'C*' category removed.

    Filtering on 'Cc' only might be enough, but better safe than sorry:
    anything whose general category starts with 'C' is dropped.
    https://www.unicode.org/reports/tr44/#GC_Values_Table
    """
    kept = []
    for char in s:
        if unicodedata.category(char).startswith('C'):
            continue
        kept.append(char)
    print(''.join(kept))
|
||||
|
||||
def __check_file(filename: str, mode: int = os.R_OK) -> bool:
    """ Check that `filename` is an existing regular file accessible in `mode`.

    :param filename: path of the file to check
    :param mode: bitmask of `os.R_OK` / `os.W_OK` that the file must satisfy
    :return: True if every check passes; otherwise a diagnostic is printed
        (with control characters stripped) and False is returned.
    """
    if not os.path.exists(filename):
        __print_without_chars("[-] %s doesn't exist." % filename)
        return False
    if not os.path.isfile(filename):
        __print_without_chars("[-] %s is not a regular file." % filename)
        return False
    if not os.access(filename, mode):
        # Collect whole words: `mode_str += 'readable'` would extend the
        # list one character at a time and garble the message.
        mode_str: List[str] = []
        if mode & os.R_OK:
            mode_str.append('readable')
        if mode & os.W_OK:
            mode_str.append('writeable')
        __print_without_chars("[-] %s is not %s." % (filename, ' nor '.join(mode_str)))
        return False
    return True
|
||||
|
||||
@@ -56,8 +57,8 @@ def create_arg_parser() -> argparse.ArgumentParser:
|
||||
', '.join(p.value for p in UnknownMemberPolicy))
|
||||
parser.add_argument('--inplace', action='store_true',
|
||||
help='clean in place, without backup')
|
||||
parser.add_argument('--no-sandbox', dest='sandbox', action='store_true',
|
||||
default=False, help='Disable bubblewrap\'s sandboxing.')
|
||||
parser.add_argument('--no-sandbox', dest='sandbox', action='store_false',
|
||||
default=True, help='Disable bubblewrap\'s sandboxing')
|
||||
|
||||
excl_group = parser.add_mutually_exclusive_group()
|
||||
excl_group.add_argument('files', nargs='*', help='the files to process',
|
||||
@@ -85,40 +86,38 @@ def show_meta(filename: str, sandbox: bool):
|
||||
if not __check_file(filename):
|
||||
return
|
||||
|
||||
p, mtype = parser_factory.get_parser(filename) # type: ignore
|
||||
try:
|
||||
p, mtype = parser_factory.get_parser(filename) # type: ignore
|
||||
except ValueError as e:
|
||||
__print_without_chars("[-] something went wrong when processing %s: %s" % (filename, e))
|
||||
return
|
||||
if p is None:
|
||||
print("[-] %s's format (%s) is not supported" % (filename, mtype))
|
||||
__print_without_chars("[-] %s's format (%s) is not supported" % (filename, mtype))
|
||||
return
|
||||
p.sandbox = sandbox
|
||||
__print_meta(filename, p.get_meta())
|
||||
|
||||
|
||||
def __print_meta(filename: str, metadata: dict, depth: int = 1):
|
||||
def __print_meta(filename: str, metadata: Dict, depth: int = 1):
|
||||
padding = " " * depth*2
|
||||
if not metadata:
|
||||
print(padding + "No metadata found in %s." % filename)
|
||||
__print_without_chars(padding + "No metadata found in %s." % filename)
|
||||
return
|
||||
|
||||
print("[%s] Metadata for %s:" % ('+'*depth, filename))
|
||||
__print_without_chars("[%s] Metadata for %s:" % ('+'*depth, filename))
|
||||
|
||||
for (k, v) in sorted(metadata.items()):
|
||||
if isinstance(v, dict):
|
||||
__print_meta(k, v, depth+1)
|
||||
continue
|
||||
|
||||
# Remove control characters
|
||||
# We might use 'Cc' instead of 'C', but better safe than sorry
|
||||
# https://www.unicode.org/reports/tr44/#GC_Values_Table
|
||||
try:
|
||||
v = ''.join(ch for ch in v if not unicodedata.category(ch).startswith('C'))
|
||||
try: # FIXME this is ugly.
|
||||
__print_without_chars(padding + " %s: %s" % (k, v))
|
||||
except UnicodeEncodeError:
|
||||
__print_without_chars(padding + " %s: harmful content" % k)
|
||||
except TypeError:
|
||||
pass # for things that aren't iterable
|
||||
|
||||
try: # FIXME this is ugly.
|
||||
print(padding + " %s: %s" % (k, v))
|
||||
except UnicodeEncodeError:
|
||||
print(padding + " %s: harmful content" % k)
|
||||
|
||||
|
||||
def clean_meta(filename: str, is_lightweight: bool, inplace: bool, sandbox: bool,
|
||||
policy: UnknownMemberPolicy) -> bool:
|
||||
@@ -126,9 +125,13 @@ def clean_meta(filename: str, is_lightweight: bool, inplace: bool, sandbox: bool
|
||||
if not __check_file(filename, mode):
|
||||
return False
|
||||
|
||||
p, mtype = parser_factory.get_parser(filename) # type: ignore
|
||||
try:
|
||||
p, mtype = parser_factory.get_parser(filename) # type: ignore
|
||||
except ValueError as e:
|
||||
__print_without_chars("[-] something went wrong when cleaning %s: %s" % (filename, e))
|
||||
return False
|
||||
if p is None:
|
||||
print("[-] %s's format (%s) is not supported" % (filename, mtype))
|
||||
__print_without_chars("[-] %s's format (%s) is not supported" % (filename, mtype))
|
||||
return False
|
||||
p.unknown_member_policy = policy
|
||||
p.lightweight_cleaning = is_lightweight
|
||||
@@ -143,7 +146,7 @@ def clean_meta(filename: str, is_lightweight: bool, inplace: bool, sandbox: bool
|
||||
os.rename(p.output_filename, filename)
|
||||
return ret
|
||||
except RuntimeError as e:
|
||||
print("[-] %s can't be cleaned: %s" % (filename, e))
|
||||
__print_without_chars("[-] %s can't be cleaned: %s" % (filename, e))
|
||||
return False
|
||||
|
||||
|
||||
@@ -165,7 +168,7 @@ def show_parsers():
|
||||
|
||||
|
||||
def __get_files_recursively(files: List[str]) -> List[str]:
|
||||
ret = set() # type: Set[str]
|
||||
ret: Set[str] = set()
|
||||
for f in files:
|
||||
if os.path.isdir(f):
|
||||
for path, _, _files in os.walk(f):
|
||||
@@ -183,16 +186,16 @@ def main() -> int:
|
||||
args = arg_parser.parse_args()
|
||||
|
||||
if args.verbose:
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
logging.getLogger(__name__).setLevel(logging.DEBUG)
|
||||
|
||||
if not args.files:
|
||||
if args.list:
|
||||
show_parsers()
|
||||
return 0
|
||||
elif args.check_dependencies:
|
||||
print("Dependencies for mat2 %s:" % __version__)
|
||||
__print_without_chars("Dependencies for mat2 %s:" % __version__)
|
||||
for key, value in sorted(check_dependencies().items()):
|
||||
print('- %s: %s %s' % (key, 'yes' if value['found'] else 'no',
|
||||
__print_without_chars('- %s: %s %s' % (key, 'yes' if value['found'] else 'no',
|
||||
'(optional)' if not value['required'] else ''))
|
||||
else:
|
||||
arg_parser.print_help()
|
||||
@@ -213,14 +216,14 @@ def main() -> int:
|
||||
files = __get_files_recursively(args.files)
|
||||
# We have to use Processes instead of Threads, since
|
||||
# we're using tempfile.mkdtemp, which isn't thread-safe.
|
||||
futures = list()
|
||||
with concurrent.futures.ProcessPoolExecutor() as executor:
|
||||
futures = list()
|
||||
for f in files:
|
||||
future = executor.submit(clean_meta, f, args.lightweight,
|
||||
inplace, args.sandbox, policy)
|
||||
futures.append(future)
|
||||
for future in concurrent.futures.as_completed(futures):
|
||||
no_failure &= future.result()
|
||||
for future in concurrent.futures.as_completed(futures):
|
||||
no_failure &= future.result()
|
||||
return 0 if no_failure is True else -1
|
||||
|
||||
|
||||
|
@@ -1,15 +0,0 @@
|
||||
# mat2's Nautilus extension
|
||||
|
||||
# Dependencies
|
||||
|
||||
- Nautilus (now known as [Files](https://wiki.gnome.org/action/show/Apps/Files))
|
||||
- [nautilus-python](https://gitlab.gnome.org/GNOME/nautilus-python) >= 2.10
|
||||
|
||||
# Installation
|
||||
|
||||
Simply copy the `mat2.py` file to `~/.local/share/nautilus-python/extensions`,
|
||||
and launch Nautilus; you should now have a "Remove metadata" item in the
|
||||
right-click menu on supported files.
|
||||
|
||||
Please note: This is not needed if using a distribution provided package. It
|
||||
only applies if installing from source.
|
244
nautilus/mat2.py
244
nautilus/mat2.py
@@ -1,244 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
Because writing GUI is non-trivial (cf. https://0xacab.org/jvoisin/mat2/issues/3),
|
||||
we decided to write a Nautilus extension instead
|
||||
(cf. https://0xacab.org/jvoisin/mat2/issues/2).
|
||||
|
||||
The code is a little bit convoluted because Gtk isn't thread-safe,
|
||||
so we're not allowed to call anything Gtk-related outside of the main
|
||||
thread, so we'll have to resort to using a `queue` to pass "messages" around.
|
||||
"""
|
||||
|
||||
# pylint: disable=no-name-in-module,unused-argument,no-self-use,import-error
|
||||
|
||||
import queue
|
||||
import threading
|
||||
from typing import Tuple, Optional, List
|
||||
from urllib.parse import unquote
|
||||
|
||||
import gi
|
||||
gi.require_version('Nautilus', '3.0')
|
||||
gi.require_version('Gtk', '3.0')
|
||||
gi.require_version('GdkPixbuf', '2.0')
|
||||
from gi.repository import Nautilus, GObject, Gtk, Gio, GLib, GdkPixbuf
|
||||
|
||||
from libmat2 import parser_factory
|
||||
|
||||
|
||||
def _remove_metadata(fpath) -> Tuple[bool, Optional[str]]:
    """ Thin wrapper around libmat2: fetch the parser matching `fpath`
    and ask it to clean the file.

    Returns a `(success, mimetype)` tuple; `success` is False when no
    parser is available for the file.
    """
    cleaner, mimetype = parser_factory.get_parser(fpath)
    if cleaner is not None:
        return cleaner.remove_all(), mimetype
    return False, mimetype
|
||||
|
||||
class Mat2Extension(GObject.GObject, Nautilus.MenuProvider, Nautilus.LocationWidgetProvider):
    """ This class adds an item to the right-click menu in Nautilus. """

    def __init__(self):
        super().__init__()
        self.infobar_hbox = None
        self.infobar = None
        # One `(file name, mimetype or None, reason)` tuple per file
        # that could not be cleaned.
        self.failed_items = list()

    def __infobar_failure(self):
        """ Add an hbox to the `infobar` warning about the fact that we didn't
        manage to remove the metadata from every single file.
        """
        self.infobar.set_show_close_button(True)
        self.infobar_hbox = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL)

        # Keyword arguments: positional constructor arguments are
        # deprecated by PyGObject.
        btn = Gtk.Button(label="Show")
        btn.connect("clicked", self.__cb_show_failed)
        self.infobar_hbox.pack_end(btn, False, False, 0)

        infobar_msg = Gtk.Label(label="Failed to clean some items")
        self.infobar_hbox.pack_start(infobar_msg, False, False, 0)

        self.infobar.get_content_area().pack_start(self.infobar_hbox, True, True, 0)
        self.infobar.show_all()

    def get_widget(self, uri, window) -> Gtk.Widget:
        """ This is the method that we have to implement (because we're
        a LocationWidgetProvider) in order to show our infobar.
        """
        self.infobar = Gtk.InfoBar()
        self.infobar.set_message_type(Gtk.MessageType.ERROR)
        self.infobar.connect("response", self.__cb_infobar_response)

        return self.infobar

    def __cb_infobar_response(self, infobar, response):
        """ Callback for the infobar close button. """
        if response == Gtk.ResponseType.CLOSE:
            self.infobar_hbox.destroy()
            self.infobar.hide()

    def __cb_show_failed(self, button):
        """ Callback to show a popup containing a list of files
        that we didn't manage to clean.
        """

        # FIXME this should be done only once the window is destroyed
        self.infobar_hbox.destroy()
        self.infobar.hide()

        window = Gtk.Window()
        headerbar = Gtk.HeaderBar()
        window.set_titlebar(headerbar)
        headerbar.props.title = "Metadata removal failed"

        close_button = Gtk.Button(label="Close")
        close_button.connect("clicked", lambda _: window.close())
        headerbar.pack_end(close_button)

        box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL)
        window.add(box)

        box.add(self.__create_treeview())
        window.show_all()

    @staticmethod
    def __validate(fileinfo) -> Tuple[bool, str]:
        """ Validate if a given FileInfo `fileinfo` can be processed.
        Returns a boolean, and a textual reason why (empty when valid).
        """
        if fileinfo.get_uri_scheme() != "file" or fileinfo.is_directory():
            return False, "Not a file"
        elif not fileinfo.can_write():
            return False, "Not writeable"
        return True, ""

    def __create_treeview(self) -> Gtk.TreeView:
        """ Build a treeview with one (icon, file, reason) row
        per entry of `self.failed_items`.
        """
        liststore = Gtk.ListStore(GdkPixbuf.Pixbuf, str, str)
        treeview = Gtk.TreeView(model=liststore)

        renderer_pixbuf = Gtk.CellRendererPixbuf()
        column_pixbuf = Gtk.TreeViewColumn("Icon", renderer_pixbuf, pixbuf=0)
        treeview.append_column(column_pixbuf)

        for idx, name in enumerate(['File', 'Reason']):
            renderer_text = Gtk.CellRendererText()
            column_text = Gtk.TreeViewColumn(name, renderer_text, text=idx+1)
            treeview.append_column(column_text)

        icon_theme = Gtk.IconTheme.get_default()
        for (fname, mtype, reason) in self.failed_items:
            # This part is all about adding mimetype icons to the liststore
            icon = Gio.content_type_get_icon('text/plain' if not mtype else mtype)
            # in case we don't have the corresponding icon,
            # we're adding `text/plain`, because we have this one for sure™
            names = icon.get_names() + ['text/plain', ]

            # Reset for every row: without this, a row whose icons all fail
            # to load would either raise a NameError (first row) or silently
            # reuse the icon of the previous row.
            img = None
            for icon_name in names:
                try:
                    img = icon_theme.load_icon(icon_name, Gtk.IconSize.BUTTON, 0)
                    break
                except GLib.GError:
                    pass

            liststore.append([img, fname, reason])

        treeview.show_all()
        return treeview

    def __create_progressbar(self) -> Gtk.ProgressBar:
        """ Create the progressbar used to notify that files are currently
        being processed.
        """
        self.infobar.set_show_close_button(False)
        self.infobar.set_message_type(Gtk.MessageType.INFO)
        self.infobar_hbox = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL)

        progressbar = Gtk.ProgressBar()
        self.infobar_hbox.pack_start(progressbar, True, True, 0)
        progressbar.set_show_text(True)

        self.infobar.get_content_area().pack_start(self.infobar_hbox, True, True, 0)
        self.infobar.show_all()

        return progressbar

    def __update_progressbar(self, processing_queue, progressbar) -> bool:
        """ This method is run via `GLib.idle_add` to update the progressbar.

        Returns False once the end-of-processing marker has been read from
        the queue, and True otherwise.
        """
        try:
            fname = processing_queue.get(block=False)
        except queue.Empty:
            return True

        # `None` is the marker put in the queue to signal that every selected
        # file was processed.
        if fname is None:
            self.infobar_hbox.destroy()
            self.infobar.hide()
            if self.failed_items:
                self.__infobar_failure()
            if not processing_queue.empty():
                print("Something went wrong, the queue isn't empty :/")
            return False

        progressbar.pulse()
        progressbar.set_text("Cleaning %s" % fname)
        progressbar.show_all()
        self.infobar_hbox.show_all()
        self.infobar.show_all()
        return True

    def __clean_files(self, files: list, processing_queue: queue.Queue) -> bool:
        """ This method is threaded in order to avoid blocking the GUI
        while cleaning up the files.
        """
        for fileinfo in files:
            fname = fileinfo.get_name()
            processing_queue.put(fname)

            valid, reason = self.__validate(fileinfo)
            if not valid:
                self.failed_items.append((fname, None, reason))
                continue

            fpath = unquote(fileinfo.get_uri()[7:])  # `len('file://') = 7`
            success, mtype = _remove_metadata(fpath)
            if not success:
                self.failed_items.append((fname, mtype, 'Unsupported/invalid'))
        processing_queue.put(None)  # signal that we processed all the files
        return True

    def __cb_menu_activate(self, menu, files):
        """ This method is called when the user clicked the "clean metadata"
        menu item.
        """
        self.failed_items = list()
        progressbar = self.__create_progressbar()
        # `set_pulse_step` is a method: assigning to it, as was done before,
        # only shadowed it and never changed the pulse step.
        progressbar.set_pulse_step(1.0 / len(files))
        self.infobar.show_all()

        processing_queue = queue.Queue()
        GLib.idle_add(self.__update_progressbar, processing_queue, progressbar)

        thread = threading.Thread(target=self.__clean_files, args=(files, processing_queue))
        thread.daemon = True
        thread.start()

    def get_background_items(self, window, file):
        """ https://bugzilla.gnome.org/show_bug.cgi?id=784278 """
        return None

    def get_file_items(self, window, files) -> Optional[List[Nautilus.MenuItem]]:
        """ This method is the one allowing us to create a menu item.

        Returns None when none of `files` passes `__validate`, and a
        single-item list otherwise.
        """
        # Do not show the menu item if not a single file has a chance to be
        # processed by mat2.
        if not any(is_valid for (is_valid, _) in map(self.__validate, files)):
            return None

        item = Nautilus.MenuItem(
            name="mat2::Remove_metadata",
            label="Remove metadata",
            tip="Remove metadata"
        )
        item.connect('activate', self.__cb_menu_activate, files)

        return [item, ]
|
21
pyproject.toml
Normal file
21
pyproject.toml
Normal file
@@ -0,0 +1,21 @@
|
||||
[project]
|
||||
name = "mat2"
|
||||
version = "0.13.5"
|
||||
description = "mat2 is a metadata removal tool, supporting a wide range of commonly used file formats, written in python3: at its core, it's a library, used by an eponymous command-line interface, as well as several file manager extensions."
|
||||
readme = "README.md"
|
||||
license = {file = "LICENSE"}
|
||||
requires-python = ">=3.9"
|
||||
dependencies = [
|
||||
'mutagen',
|
||||
'PyGObject',
|
||||
'pycairo',
|
||||
]
|
||||
[project.urls]
|
||||
Repository = "https://github.com/jvoisin/mat2"
|
||||
Issues = "https://github.com/jvoisin/mat2/issues"
|
||||
Changelog = "https://github.com/jvoisin/mat2/blob/master/CHANGELOG.md"
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py39"
|
||||
# E501 Line too long
|
||||
ignore = ["E501", "F401", "E402", "E722"]
|
7
setup.py
7
setup.py
@@ -5,13 +5,13 @@ with open("README.md", encoding='utf-8') as fh:
|
||||
|
||||
setuptools.setup(
|
||||
name="mat2",
|
||||
version='0.10.0',
|
||||
version='0.13.5',
|
||||
author="Julien (jvoisin) Voisin",
|
||||
author_email="julien.voisin+mat2@dustri.org",
|
||||
description="A handy tool to trash your metadata",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://0xacab.org/jvoisin/mat2",
|
||||
url="https://github.com/jvoisin/mat2",
|
||||
python_requires = '>=3.5.0',
|
||||
scripts=['mat2'],
|
||||
install_requires=[
|
||||
@@ -20,6 +20,7 @@ setuptools.setup(
|
||||
'pycairo',
|
||||
],
|
||||
packages=setuptools.find_packages(exclude=('tests', )),
|
||||
data_files = [('share/man/man1', ['doc/mat2.1'])],
|
||||
classifiers=[
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Environment :: Console",
|
||||
@@ -30,6 +31,6 @@ setuptools.setup(
|
||||
"Intended Audience :: End Users/Desktop",
|
||||
],
|
||||
project_urls={
|
||||
'bugtacker': 'https://0xacab.org/jvoisin/mat2/issues',
|
||||
'bugtacker': 'https://github.com/jvoisin/mat2/issues',
|
||||
},
|
||||
)
|
||||
|
BIN
tests/data/comment.docx
Normal file
BIN
tests/data/comment.docx
Normal file
Binary file not shown.
BIN
tests/data/dirty.aiff
Normal file
BIN
tests/data/dirty.aiff
Normal file
Binary file not shown.
BIN
tests/data/dirty.heic
Normal file
BIN
tests/data/dirty.heic
Normal file
Binary file not shown.
BIN
tests/data/dirty.wav
Normal file
BIN
tests/data/dirty.wav
Normal file
Binary file not shown.
BIN
tests/data/dirty.webp
Normal file
BIN
tests/data/dirty.webp
Normal file
Binary file not shown.
After Width: | Height: | Size: 38 KiB |
Binary file not shown.
BIN
tests/data/narrated_powerpoint_presentation.pptx
Normal file
BIN
tests/data/narrated_powerpoint_presentation.pptx
Normal file
Binary file not shown.
BIN
tests/dirty.epub
BIN
tests/dirty.epub
Binary file not shown.
54
tests/fuzz.py
Normal file
54
tests/fuzz.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import mimetypes
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.append('..')
|
||||
|
||||
import atheris
|
||||
|
||||
with atheris.instrument_imports(enable_loader_override=False):
|
||||
from libmat2 import parser_factory, UNSUPPORTED_EXTENSIONS
|
||||
|
||||
# Collect the file extensions to fuzz: every extension of every mimetype
# declared by a libmat2 parser, minus the filtered-out ones.
extensions = set()
for parser in parser_factory._get_parsers():  # type: ignore
    for mtype in parser.mimetypes:
        # Video mimetypes and aif/wav/gif ones are left out of the fuzzing
        # targets; 'aif' also matches 'aifc', so no separate check is needed.
        if mtype.startswith('video'):
            continue
        if any(part in mtype for part in ('aif', 'wav', 'gif')):
            continue
        for extension in mimetypes.guess_all_extensions(mtype):
            if extension not in UNSUPPORTED_EXTENSIONS:
                extensions.add(extension)
# Sorted rather than `list(...)`: set iteration order of strings changes
# between processes, which would make the fuzzer pick a different extension
# for the same input and prevent crashes from being reproduced.
extensions = sorted(extensions)
|
||||
|
||||
|
||||
|
||||
def TestOneInput(data):
    """ Fuzzing entry point handed to `atheris.Setup`.

    Picks one of the collected `extensions`, writes the remaining fuzzed
    bytes to a file with that extension, then runs the matching parser
    (if any) through `get_meta()` and `remove_all()`, and finally asks
    for a parser and its metadata a second time.
    `ValueError` is ignored; any other exception propagates to the fuzzer.
    """
    fdp = atheris.FuzzedDataProvider(data)
    extension = fdp.PickValueInList(extensions)
    data = fdp.ConsumeBytes(sys.maxsize)

    fname = '/tmp/mat2_fuzz' + extension

    with open(fname, 'wb') as f:
        f.write(data)
    try:
        p, _ = parser_factory.get_parser(fname)
        if p:
            p.sandbox = False
            p.get_meta()
            p.remove_all()
            p, _ = parser_factory.get_parser(fname)
            p.get_meta()
    except ValueError:
        pass
    finally:
        # Remove the input file even when an unexpected exception
        # propagates, so that it doesn't linger after a crash.
        os.remove(fname)
|
||||
|
||||
atheris.Setup(sys.argv, TestOneInput)
|
||||
atheris.Fuzz()
|
@@ -1,4 +1,3 @@
|
||||
import sys
|
||||
import random
|
||||
import os
|
||||
import shutil
|
||||
@@ -22,18 +21,24 @@ class TestHelp(unittest.TestCase):
|
||||
def test_help(self):
|
||||
proc = subprocess.Popen(mat2_binary + ['--help'], stdout=subprocess.PIPE)
|
||||
stdout, _ = proc.communicate()
|
||||
self.assertIn(b'mat2 [-h] [-V] [--unknown-members policy] [--inplace] [--no-sandbox]',
|
||||
stdout)
|
||||
self.assertIn(b' [-v] [-l] [--check-dependencies] [-L | -s]', stdout)
|
||||
self.assertIn(b'[files [files ...]]', stdout)
|
||||
self.assertIn(b'mat2 [-h] [-V]', stdout)
|
||||
self.assertIn(b'[--unknown-members policy]', stdout)
|
||||
self.assertIn(b'[--inplace]', stdout)
|
||||
self.assertIn(b'[--no-sandbox]', stdout)
|
||||
self.assertIn(b' [-v] [-l]', stdout)
|
||||
self.assertIn(b'[--check-dependencies]', stdout)
|
||||
self.assertIn(b'[-L | -s]', stdout)
|
||||
self.assertIn(b'[files ...]', stdout)
|
||||
|
||||
def test_no_arg(self):
|
||||
proc = subprocess.Popen(mat2_binary, stdout=subprocess.PIPE)
|
||||
stdout, _ = proc.communicate()
|
||||
self.assertIn(b'mat2 [-h] [-V] [--unknown-members policy] [--inplace] [--no-sandbox]',
|
||||
stdout)
|
||||
self.assertIn(b'mat2 [-h] [-V]', stdout)
|
||||
self.assertIn(b'[--unknown-members policy]', stdout)
|
||||
self.assertIn(b'[--inplace]', stdout)
|
||||
self.assertIn(b'[--no-sandbox]', stdout)
|
||||
self.assertIn(b' [-v] [-l] [--check-dependencies] [-L | -s]', stdout)
|
||||
self.assertIn(b'[files [files ...]]', stdout)
|
||||
self.assertIn(b'[files ...]', stdout)
|
||||
|
||||
|
||||
class TestVersion(unittest.TestCase):
|
||||
@@ -231,6 +236,11 @@ class TestGetMeta(unittest.TestCase):
|
||||
self.assertIn(b'i am a : various comment', stdout)
|
||||
self.assertIn(b'artist: jvoisin', stdout)
|
||||
|
||||
#def test_webp(self):
|
||||
# proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.webp'],
|
||||
# stdout=subprocess.PIPE)
|
||||
# stdout, _ = proc.communicate()
|
||||
# self.assertIn(b'Warning: [minor] Improper EXIF header', stdout)
|
||||
|
||||
class TestControlCharInjection(unittest.TestCase):
|
||||
def test_jpg(self):
|
||||
@@ -261,14 +271,7 @@ class TestCommandLineParallel(unittest.TestCase):
|
||||
def test_different(self):
|
||||
src = './tests/data/'
|
||||
dst = './tests/data/parallel'
|
||||
if sys.version_info >= (3, 8):
|
||||
with os.scandir(src) as itr:
|
||||
entries = list(itr)
|
||||
shutil._copytree(entries=entries, src=src, dst=dst, symlinks=False,
|
||||
ignore=None, copy_function=shutil.copy2,
|
||||
ignore_dangling_symlinks=False)
|
||||
else:
|
||||
shutil.copytree(src, dst)
|
||||
shutil.copytree(src, dst)
|
||||
|
||||
proc = subprocess.Popen(mat2_binary + glob.glob('./tests/data/parallel/dirty.*'),
|
||||
stdout=subprocess.PIPE)
|
||||
@@ -280,7 +283,7 @@ class TestCommandLineParallel(unittest.TestCase):
|
||||
self.assertIsNotNone(p)
|
||||
p = parser_factory.get_parser(p.output_filename)
|
||||
self.assertEqual(p.get_meta(), {})
|
||||
shutil.rmtree('./tests/data/parallel')
|
||||
shutil.rmtree('./tests/data/parallel/')
|
||||
|
||||
def test_faulty(self):
|
||||
for i in range(self.iterations):
|
||||
|
@@ -14,7 +14,7 @@ from libmat2 import harmless, video, web, archive
|
||||
|
||||
# No need for logging messages: should something go wrong,
|
||||
# the testsuite _will_ fail.
|
||||
logger = logging.getLogger()
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.FATAL)
|
||||
|
||||
|
||||
@@ -65,8 +65,10 @@ class TestCorruptedEmbedded(unittest.TestCase):
|
||||
def test_docx(self):
|
||||
shutil.copy('./tests/data/embedded_corrupted.docx', './tests/data/clean.docx')
|
||||
parser, _ = parser_factory.get_parser('./tests/data/clean.docx')
|
||||
self.assertFalse(parser.remove_all())
|
||||
self.assertIsNotNone(parser.get_meta())
|
||||
with self.assertRaises(ValueError):
|
||||
parser.remove_all()
|
||||
with self.assertRaises(ValueError):
|
||||
self.assertIsNotNone(parser.get_meta())
|
||||
os.remove('./tests/data/clean.docx')
|
||||
|
||||
def test_odt(self):
|
||||
@@ -89,9 +91,8 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
|
||||
class TestWrongContentTypesFileOffice(unittest.TestCase):
|
||||
def test_office_incomplete(self):
|
||||
shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
|
||||
p = office.MSOfficeParser('./tests/data/clean.docx')
|
||||
self.assertIsNotNone(p)
|
||||
self.assertFalse(p.remove_all())
|
||||
with self.assertRaises(ValueError):
|
||||
office.MSOfficeParser('./tests/data/clean.docx')
|
||||
os.remove('./tests/data/clean.docx')
|
||||
|
||||
def test_office_broken(self):
|
||||
@@ -121,8 +122,8 @@ class TestCorruptedFiles(unittest.TestCase):
|
||||
|
||||
def test_png2(self):
|
||||
shutil.copy('./tests/test_libmat2.py', './tests/clean.png')
|
||||
parser, _ = parser_factory.get_parser('./tests/clean.png')
|
||||
self.assertIsNone(parser)
|
||||
with self.assertRaises(ValueError):
|
||||
parser_factory.get_parser('./tests/clean.png')
|
||||
os.remove('./tests/clean.png')
|
||||
|
||||
def test_torrent(self):
|
||||
@@ -187,6 +188,15 @@ class TestCorruptedFiles(unittest.TestCase):
|
||||
audio.MP3Parser('./tests/data/clean.mp3')
|
||||
os.remove('./tests/data/clean.mp3')
|
||||
|
||||
def test_wrong_tif(self):
|
||||
shutil.copy('./tests/data/dirty.tiff', './tests/data/clean.tif')
|
||||
p = images.TiffParser('./tests/data/clean.tif')
|
||||
p.remove_all()
|
||||
p = images.TiffParser('./tests/data/clean.cleaned.tif')
|
||||
self.assertEqual(p.get_meta(), {})
|
||||
os.remove('./tests/data/clean.tif')
|
||||
os.remove('./tests/data/clean.cleaned.tif')
|
||||
|
||||
def test_jpg(self):
|
||||
shutil.copy('./tests/data/dirty.mp3', './tests/data/clean.jpg')
|
||||
with self.assertRaises(ValueError):
|
||||
@@ -229,10 +239,10 @@ class TestCorruptedFiles(unittest.TestCase):
|
||||
zout.write('./tests/data/embedded_corrupted.docx')
|
||||
p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
|
||||
self.assertEqual(mimetype, 'application/zip')
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
|
||||
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
|
||||
self.assertFalse(p.remove_all())
|
||||
with self.assertRaises(ValueError):
|
||||
p.get_meta()
|
||||
with self.assertRaises(ValueError):
|
||||
self.assertFalse(p.remove_all())
|
||||
os.remove('./tests/data/clean.zip')
|
||||
|
||||
def test_html(self):
|
||||
@@ -307,10 +317,10 @@ class TestCorruptedFiles(unittest.TestCase):
|
||||
zout.addfile(tarinfo, f)
|
||||
p, mimetype = parser_factory.get_parser('./tests/data/clean.tar')
|
||||
self.assertEqual(mimetype, 'application/x-tar')
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['./tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
|
||||
self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
|
||||
self.assertFalse(p.remove_all())
|
||||
with self.assertRaises(ValueError):
|
||||
p.get_meta()
|
||||
with self.assertRaises(ValueError):
|
||||
self.assertFalse(p.remove_all())
|
||||
os.remove('./tests/data/clean.tar')
|
||||
|
||||
shutil.copy('./tests/data/dirty.png', './tests/data/clean.tar')
|
||||
|
@@ -4,6 +4,7 @@ import unittest
|
||||
import shutil
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import tarfile
|
||||
import tempfile
|
||||
import zipfile
|
||||
@@ -113,6 +114,11 @@ class TestGetMeta(unittest.TestCase):
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['Comment'], 'Created with GIMP')
|
||||
|
||||
#def test_webp(self):
|
||||
# p = images.WEBPParser('./tests/data/dirty.webp')
|
||||
# meta = p.get_meta()
|
||||
# self.assertEqual(meta['Warning'], '[minor] Improper EXIF header')
|
||||
|
||||
def test_ppm(self):
|
||||
p = images.PPMParser('./tests/data/dirty.ppm')
|
||||
meta = p.get_meta()
|
||||
@@ -127,6 +133,11 @@ class TestGetMeta(unittest.TestCase):
|
||||
self.assertEqual(meta['Model'], 'C7070WZ')
|
||||
self.assertEqual(meta['ModifyDate'], '2005:12:26 17:09:35')
|
||||
|
||||
def test_wav(self):
|
||||
p = audio.WAVParser('./tests/data/dirty.wav')
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['Artist'], 'jvoisin')
|
||||
|
||||
def test_mp3(self):
|
||||
p = audio.MP3Parser('./tests/data/dirty.mp3')
|
||||
meta = p.get_meta()
|
||||
@@ -170,14 +181,30 @@ class TestGetMeta(unittest.TestCase):
|
||||
|
||||
def test_zip(self):
|
||||
with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
|
||||
zout.write('./tests/data/dirty.flac')
|
||||
zout.write('./tests/data/dirty.docx')
|
||||
zout.write('./tests/data/dirty.jpg')
|
||||
zout.write('./tests/data/dirty.flac',
|
||||
compress_type = zipfile.ZIP_STORED)
|
||||
zout.write('./tests/data/dirty.docx',
|
||||
compress_type = zipfile.ZIP_DEFLATED)
|
||||
zout.write('./tests/data/dirty.jpg',
|
||||
compress_type = zipfile.ZIP_BZIP2)
|
||||
zout.write('./tests/data/dirty.txt',
|
||||
compress_type = zipfile.ZIP_LZMA)
|
||||
p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip')
|
||||
self.assertEqual(mimetype, 'application/zip')
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
|
||||
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
|
||||
|
||||
with zipfile.ZipFile('./tests/data/dirty.zip') as zipin:
|
||||
members = {
|
||||
'tests/data/dirty.flac' : zipfile.ZIP_STORED,
|
||||
'tests/data/dirty.docx': zipfile.ZIP_DEFLATED,
|
||||
'tests/data/dirty.jpg' : zipfile.ZIP_BZIP2,
|
||||
'tests/data/dirty.txt' : zipfile.ZIP_LZMA,
|
||||
}
|
||||
for k, v in members.items():
|
||||
self.assertEqual(zipin.getinfo(k).compress_type, v)
|
||||
|
||||
os.remove('./tests/data/dirty.zip')
|
||||
|
||||
def test_wmv(self):
|
||||
@@ -225,6 +252,17 @@ class TestGetMeta(unittest.TestCase):
|
||||
p = images.SVGParser('./tests/data/weird.svg')
|
||||
self.assertEqual(p.get_meta()['Xmlns'], 'http://www.w3.org/1337/svg')
|
||||
|
||||
def test_aiff(self):
|
||||
p = audio.AIFFParser('./tests/data/dirty.aiff')
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['Name'], 'I am so')
|
||||
|
||||
def test_heic(self):
|
||||
p = images.HEICParser('./tests/data/dirty.heic')
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['ProfileCopyright'], 'Public Domain')
|
||||
self.assertEqual(meta['ProfileDescription'], 'GIMP built-in sRGB')
|
||||
|
||||
|
||||
class TestRemovingThumbnails(unittest.TestCase):
|
||||
def test_odt(self):
|
||||
@@ -301,7 +339,23 @@ class TestCleaning(unittest.TestCase):
|
||||
'parser': images.JPGParser,
|
||||
'meta': {'Comment': 'Created with GIMP'},
|
||||
'expected_meta': {},
|
||||
#}, {
|
||||
# 'name': 'webp',
|
||||
# 'parser': images.WEBPParser,
|
||||
# 'meta': {'Warning': '[minor] Improper EXIF header'},
|
||||
# 'expected_meta': {},
|
||||
}, {
|
||||
'name': 'wav',
|
||||
'parser': audio.WAVParser,
|
||||
'meta': {'Comment': 'Zomg, a comment!'},
|
||||
'expected_meta': {},
|
||||
}, {
|
||||
'name': 'aiff',
|
||||
'parser': audio.AIFFParser,
|
||||
'meta': {'Annotation': 'Thank you for using MAT !'},
|
||||
'expected_meta': {},
|
||||
},
|
||||
{
|
||||
'name': 'mp3',
|
||||
'parser': audio.MP3Parser,
|
||||
'meta': {'TXXX:I am a': 'various comment'},
|
||||
@@ -392,7 +446,7 @@ class TestCleaning(unittest.TestCase):
|
||||
'name': 'gif',
|
||||
'parser': images.GIFParser,
|
||||
'meta': {'Comment': 'this is a test comment'},
|
||||
'expected_meta': {},
|
||||
'expected_meta': {'TransparentColor': '5'},
|
||||
},{
|
||||
'name': 'css',
|
||||
'parser': web.CSSParser,
|
||||
@@ -408,7 +462,10 @@ class TestCleaning(unittest.TestCase):
|
||||
'meta': {
|
||||
'WorkDescription': "This is a test svg image for mat2's testsuite",
|
||||
},
|
||||
'expected_meta': {},
|
||||
'expected_meta': {
|
||||
'ImageSize': '128x128',
|
||||
'Megapixels': '0.016',
|
||||
},
|
||||
} ,{
|
||||
'name': 'ppm',
|
||||
'parser': images.PPMParser,
|
||||
@@ -431,15 +488,26 @@ class TestCleaning(unittest.TestCase):
|
||||
'meta': {
|
||||
'Encoder': 'HandBrake 0.9.4 2009112300',
|
||||
},
|
||||
'expected_meta':
|
||||
{'CompatibleBrands': ['isom', 'iso2', 'avc1', 'mp41'],
|
||||
'expected_meta': {
|
||||
'AverageBitrate': 465641,
|
||||
'BufferSize': 0,
|
||||
'ColorPrimaries': 'BT.709',
|
||||
'ColorProfiles': 'nclx',
|
||||
'ColorRepresentation': 'nclx 1 1 1',
|
||||
'CompatibleBrands': ['isom', 'iso2', 'avc1', 'mp41'],
|
||||
'CompressorID': 'avc1',
|
||||
'CompressorName': 'JVT/AVC Coding',
|
||||
'GraphicsMode': 'srcCopy',
|
||||
'HandlerDescription': 'SoundHandler',
|
||||
'HandlerType': 'Metadata',
|
||||
'HandlerVendorID': 'Apple',
|
||||
'MajorBrand': 'MP4 Base Media v1 [IS0 14496-12:2003]',
|
||||
'MajorBrand': 'Base Media v1 [IS0 14496-12:2003]',
|
||||
'MatrixCoefficients': 'BT.709',
|
||||
'MaxBitrate': 465641,
|
||||
'MediaDataOffset': 48,
|
||||
'MediaDataSize': 379872,
|
||||
'MediaHeaderVersion': 0,
|
||||
'MediaLanguageCode': 'eng',
|
||||
'MinorVersion': '0.2.0',
|
||||
'MovieDataOffset': 48,
|
||||
'MovieHeaderVersion': 0,
|
||||
@@ -449,7 +517,13 @@ class TestCleaning(unittest.TestCase):
|
||||
'TimeScale': 1000,
|
||||
'TrackHeaderVersion': 0,
|
||||
'TrackID': 1,
|
||||
'TrackLayer': 0},
|
||||
'TrackLayer': 0,
|
||||
'TransferCharacteristics': 'BT.709',
|
||||
'VideoFullRangeFlag': 'Limited',
|
||||
},
|
||||
'extra_expected_meta': {
|
||||
'VideoFullRangeFlag': 0,
|
||||
}
|
||||
},{
|
||||
'name': 'wmv',
|
||||
'ffmpeg': 1,
|
||||
@@ -458,40 +532,93 @@ class TestCleaning(unittest.TestCase):
|
||||
'EncodingSettings': 'Lavf52.103.0',
|
||||
},
|
||||
'expected_meta': {},
|
||||
},{
|
||||
'name': 'heic',
|
||||
'parser': images.HEICParser,
|
||||
'meta': {},
|
||||
'expected_meta': {
|
||||
'BlueMatrixColumn': '0.14305 0.06061 0.71393',
|
||||
'BlueTRC': '(Binary data 32 bytes, use -b option to extract)',
|
||||
'CMMFlags': 'Not Embedded, Independent',
|
||||
'ChromaticAdaptation': '1.04788 0.02292 -0.05022 0.02959 0.99048 -0.01707 -0.00925 0.01508 0.75168',
|
||||
'ChromaticityChannel1': '0.64 0.33002',
|
||||
'ChromaticityChannel2': '0.3 0.60001',
|
||||
'ChromaticityChannel3': '0.15001 0.06',
|
||||
'ChromaticityChannels': 3,
|
||||
'ChromaticityColorant': 'Unknown',
|
||||
'ColorSpaceData': 'RGB ',
|
||||
'ConnectionSpaceIlluminant': '0.9642 1 0.82491',
|
||||
'DeviceAttributes': 'Reflective, Glossy, Positive, Color',
|
||||
'DeviceManufacturer': '',
|
||||
'DeviceMfgDesc': 'GIMP',
|
||||
'DeviceModel': '',
|
||||
'DeviceModelDesc': 'sRGB',
|
||||
'ExifByteOrder': 'Big-endian (Motorola, MM)',
|
||||
'GreenMatrixColumn': '0.38512 0.7169 0.09706',
|
||||
'GreenTRC': '(Binary data 32 bytes, use -b option to extract)',
|
||||
'MediaWhitePoint': '0.9642 1 0.82491',
|
||||
'PrimaryPlatform': 'Apple Computer Inc.',
|
||||
'ProfileCMMType': 'Little CMS',
|
||||
'ProfileClass': 'Display Device Profile',
|
||||
'ProfileConnectionSpace': 'XYZ ',
|
||||
'ProfileCopyright': 'Public Domain',
|
||||
'ProfileCreator': 'Little CMS',
|
||||
'ProfileDateTime': '2022:05:15 16:29:22',
|
||||
'ProfileDescription': 'GIMP built-in sRGB',
|
||||
'ProfileFileSignature': 'acsp',
|
||||
'ProfileID': 0,
|
||||
'ProfileVersion': '4.3.0',
|
||||
'RedMatrixColumn': '0.43604 0.22249 0.01392',
|
||||
'RedTRC': '(Binary data 32 bytes, use -b option to extract)',
|
||||
'RenderingIntent': 'Perceptual',
|
||||
'Warning': 'Bad IFD0 directory',
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
def test_all_parametred(self):
|
||||
for case in self.data:
|
||||
if 'ffmpeg' in case:
|
||||
try:
|
||||
video._get_ffmpeg_path()
|
||||
except RuntimeError:
|
||||
raise unittest.SkipTest
|
||||
with self.subTest(case=case):
|
||||
if 'ffmpeg' in case:
|
||||
try:
|
||||
video._get_ffmpeg_path()
|
||||
except RuntimeError:
|
||||
raise unittest.SkipTest
|
||||
|
||||
print('[+] Testing %s' % case['name'])
|
||||
target = './tests/data/clean.' + case['name']
|
||||
shutil.copy('./tests/data/dirty.' + case['name'], target)
|
||||
p1 = case['parser'](target)
|
||||
print('[+] Testing %s' % case['name'])
|
||||
target = './tests/data/clean.' + case['name']
|
||||
shutil.copy('./tests/data/dirty.' + case['name'], target)
|
||||
p1 = case['parser'](target)
|
||||
|
||||
meta = p1.get_meta()
|
||||
for k, v in case['meta'].items():
|
||||
if isinstance(v, dict):
|
||||
for _k, _v in v.items():
|
||||
self.assertEqual(meta[k][_k], _v)
|
||||
else:
|
||||
self.assertEqual(meta[k], v)
|
||||
for k, v in p1.get_meta().items():
|
||||
if k not in case['meta']:
|
||||
continue
|
||||
if isinstance(v, dict):
|
||||
for _k, _v in v.items():
|
||||
if _k in case['meta'][k]:
|
||||
self.assertEqual(_v, case['meta'][k][_k])
|
||||
else:
|
||||
self.assertEqual(v, case['meta'][k])
|
||||
|
||||
p1.lightweight_cleaning = True
|
||||
self.assertTrue(p1.remove_all())
|
||||
p1.lightweight_cleaning = True
|
||||
self.assertTrue(p1.remove_all())
|
||||
|
||||
p2 = case['parser'](p1.output_filename)
|
||||
self.assertEqual(p2.get_meta(), case['expected_meta'])
|
||||
self.assertTrue(p2.remove_all())
|
||||
p2 = case['parser'](p1.output_filename)
|
||||
meta = p2.get_meta()
|
||||
if meta:
|
||||
for k, v in p2.get_meta().items():
|
||||
self.assertIn(k, case['expected_meta'], '"%s" is not in "%s" (%s), with all of them being %s' % (k, case['expected_meta'], case['name'], p2.get_meta().items()))
|
||||
if str(case['expected_meta'][k]) in str(v):
|
||||
continue
|
||||
if 'extra_expected_meta' in case and k in case['extra_expected_meta']:
|
||||
if str(case['extra_expected_meta'][k]) in str(v):
|
||||
continue
|
||||
self.assertTrue(False, "got a different value (%s) than excepted (%s) for %s, with all of them being %s" % (str(v), meta, k, p2.get_meta().items()))
|
||||
self.assertTrue(p2.remove_all())
|
||||
|
||||
os.remove(target)
|
||||
os.remove(p1.output_filename)
|
||||
os.remove(p2.output_filename)
|
||||
os.remove(target)
|
||||
os.remove(p1.output_filename)
|
||||
os.remove(p2.output_filename)
|
||||
|
||||
|
||||
def test_html(self):
|
||||
@@ -512,14 +639,20 @@ class TestCleaning(unittest.TestCase):
|
||||
os.remove('./tests/data/clean.cleaned.html')
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.html')
|
||||
|
||||
with open('./tests/data/clean.html', 'w') as f:
|
||||
f.write('<title><title><pouet/><meta/></title></title><test/>')
|
||||
p = web.HTMLParser('./tests/data/clean.html')
|
||||
self.assertTrue(p.remove_all())
|
||||
with open('./tests/data/clean.cleaned.html', 'r') as f:
|
||||
self.assertEqual(f.read(), '<title></title><test/>')
|
||||
if sys.version_info >= (3, 13):
|
||||
with open('./tests/data/clean.html', 'w') as f:
|
||||
f.write('<title><title><pouet/><meta/></title></title><test/>')
|
||||
with self.assertRaises(ValueError):
|
||||
p = web.HTMLParser('./tests/data/clean.html')
|
||||
else:
|
||||
with open('./tests/data/clean.html', 'w') as f:
|
||||
f.write('<title><title><pouet/><meta/></title></title><test/>')
|
||||
p = web.HTMLParser('./tests/data/clean.html')
|
||||
self.assertTrue(p.remove_all())
|
||||
with open('./tests/data/clean.cleaned.html', 'r') as f:
|
||||
self.assertEqual(f.read(), '<title></title><test/>')
|
||||
os.remove('./tests/data/clean.cleaned.html')
|
||||
os.remove('./tests/data/clean.html')
|
||||
os.remove('./tests/data/clean.cleaned.html')
|
||||
|
||||
with open('./tests/data/clean.html', 'w') as f:
|
||||
f.write('<test><title>Some<b>metadata</b><br/></title></test>')
|
||||
@@ -564,9 +697,14 @@ class TestCleaning(unittest.TestCase):
|
||||
class TestCleaningArchives(unittest.TestCase):
|
||||
def test_zip(self):
|
||||
with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout:
|
||||
zout.write('./tests/data/dirty.flac')
|
||||
zout.write('./tests/data/dirty.docx')
|
||||
zout.write('./tests/data/dirty.jpg')
|
||||
zout.write('./tests/data/dirty.flac',
|
||||
compress_type = zipfile.ZIP_STORED)
|
||||
zout.write('./tests/data/dirty.docx',
|
||||
compress_type = zipfile.ZIP_DEFLATED)
|
||||
zout.write('./tests/data/dirty.jpg',
|
||||
compress_type = zipfile.ZIP_BZIP2)
|
||||
zout.write('./tests/data/dirty.txt',
|
||||
compress_type = zipfile.ZIP_LZMA)
|
||||
p = archive.ZipParser('./tests/data/dirty.zip')
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
|
||||
@@ -578,6 +716,16 @@ class TestCleaningArchives(unittest.TestCase):
|
||||
self.assertEqual(p.get_meta(), {})
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
with zipfile.ZipFile('./tests/data/dirty.zip') as zipin:
|
||||
members = {
|
||||
'tests/data/dirty.flac' : zipfile.ZIP_STORED,
|
||||
'tests/data/dirty.docx': zipfile.ZIP_DEFLATED,
|
||||
'tests/data/dirty.jpg' : zipfile.ZIP_BZIP2,
|
||||
'tests/data/dirty.txt' : zipfile.ZIP_LZMA,
|
||||
}
|
||||
for k, v in members.items():
|
||||
self.assertEqual(zipin.getinfo(k).compress_type, v)
|
||||
|
||||
os.remove('./tests/data/dirty.zip')
|
||||
os.remove('./tests/data/dirty.cleaned.zip')
|
||||
os.remove('./tests/data/dirty.cleaned.cleaned.zip')
|
||||
@@ -761,3 +909,107 @@ class TestNoSandbox(unittest.TestCase):
|
||||
os.remove('./tests/data/clean.png')
|
||||
os.remove('./tests/data/clean.cleaned.png')
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.png')
|
||||
|
||||
class TestComplexOfficeFiles(unittest.TestCase):
|
||||
def test_complex_pptx(self):
|
||||
target = './tests/data/clean.pptx'
|
||||
shutil.copy('./tests/data/narrated_powerpoint_presentation.pptx', target)
|
||||
p = office.MSOfficeParser(target)
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
os.remove(target)
|
||||
os.remove(p.output_filename)
|
||||
|
||||
class TextDocx(unittest.TestCase):
|
||||
def test_comment_xml_is_removed(self):
|
||||
with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
|
||||
# Check if 'word/comments.xml' exists in the zip
|
||||
self.assertIn('word/comments.xml', zipin.namelist())
|
||||
|
||||
shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
|
||||
p = office.MSOfficeParser('./tests/data/comment_clean.docx')
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
|
||||
# Check if 'word/comments.xml' exists in the zip
|
||||
self.assertNotIn('word/comments.xml', zipin.namelist())
|
||||
|
||||
os.remove('./tests/data/comment_clean.docx')
|
||||
os.remove('./tests/data/comment_clean.cleaned.docx')
|
||||
|
||||
def test_xml_is_utf8(self):
|
||||
with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
|
||||
c = zipin.open('word/document.xml')
|
||||
content = c.read()
|
||||
|
||||
# ensure encoding is utf-8
|
||||
r = b'encoding=(\'|\")UTF-8(\'|\")'
|
||||
match = re.search(r, content, re.IGNORECASE)
|
||||
self.assertIsNotNone(match)
|
||||
|
||||
shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
|
||||
p = office.MSOfficeParser('./tests/data/comment_clean.docx')
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
|
||||
c = zipin.open('word/document.xml')
|
||||
content = c.read()
|
||||
|
||||
# ensure encoding is still utf-8
|
||||
r = b'encoding=(\'|\")UTF-8(\'|\")'
|
||||
match = re.search(r, content, re.IGNORECASE)
|
||||
self.assertIsNotNone(match)
|
||||
|
||||
os.remove('./tests/data/comment_clean.docx')
|
||||
os.remove('./tests/data/comment_clean.cleaned.docx')
|
||||
|
||||
def test_comment_references_are_removed(self):
|
||||
with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
|
||||
c = zipin.open('word/document.xml')
|
||||
content = c.read()
|
||||
|
||||
r = b'w:commentRangeStart'
|
||||
self.assertIn(r, content)
|
||||
r = b'w:commentRangeEnd'
|
||||
self.assertIn(r, content)
|
||||
r = b'w:commentReference'
|
||||
self.assertIn(r, content)
|
||||
|
||||
shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
|
||||
p = office.MSOfficeParser('./tests/data/comment_clean.docx')
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
|
||||
c = zipin.open('word/document.xml')
|
||||
content = c.read()
|
||||
|
||||
r = b'w:commentRangeStart'
|
||||
self.assertNotIn(r, content)
|
||||
r = b'w:commentRangeEnd'
|
||||
self.assertNotIn(r, content)
|
||||
r = b'w:commentReference'
|
||||
self.assertNotIn(r, content)
|
||||
|
||||
os.remove('./tests/data/comment_clean.docx')
|
||||
os.remove('./tests/data/comment_clean.cleaned.docx')
|
||||
|
||||
def test_clean_document_xml_rels(self):
|
||||
with zipfile.ZipFile('./tests/data/comment.docx') as zipin:
|
||||
c = zipin.open('word/_rels/document.xml.rels')
|
||||
content = c.read()
|
||||
r = b'Target="comments.xml"'
|
||||
self.assertIn(r, content)
|
||||
|
||||
shutil.copy('./tests/data/comment.docx', './tests/data/comment_clean.docx')
|
||||
p = office.MSOfficeParser('./tests/data/comment_clean.docx')
|
||||
self.assertTrue(p.remove_all())
|
||||
|
||||
with zipfile.ZipFile('./tests/data/comment_clean.cleaned.docx') as zipin:
|
||||
c = zipin.open('word/_rels/document.xml.rels')
|
||||
content = c.read()
|
||||
r = b'Target="comments.xml"'
|
||||
self.assertNotIn(r, content)
|
||||
|
||||
os.remove('./tests/data/comment_clean.docx')
|
||||
os.remove('./tests/data/comment_clean.cleaned.docx')
|
||||
|
||||
|
@@ -23,6 +23,11 @@ class TestLightWeightCleaning(unittest.TestCase):
|
||||
'parser': images.JPGParser,
|
||||
'meta': {'Comment': 'Created with GIMP'},
|
||||
'expected_meta': {},
|
||||
#}, {
|
||||
# 'name': 'webp',
|
||||
# 'parser': images.WEBPParser,
|
||||
# 'meta': {'Warning': '[minor] Improper EXIF header'},
|
||||
# 'expected_meta': {},
|
||||
}, {
|
||||
'name': 'torrent',
|
||||
'parser': torrent.TorrentParser,
|
||||
@@ -33,7 +38,6 @@ class TestLightWeightCleaning(unittest.TestCase):
|
||||
'parser': images.TiffParser,
|
||||
'meta': {'ImageDescription': 'OLYMPUS DIGITAL CAMERA '},
|
||||
'expected_meta': {
|
||||
'Orientation': 'Horizontal (normal)',
|
||||
'ResolutionUnit': 'inches',
|
||||
'XResolution': 72,
|
||||
'YResolution': 72
|
||||
|
@@ -1,3 +0,0 @@
|
||||
# Words to be ignored by codespell.
|
||||
# Put one word per line and sort alphabetically.
|
||||
process'
|
Reference in New Issue
Block a user