mirror of
https://0xacab.org/jvoisin/mat2
synced 2025-10-06 16:42:57 +02:00
Compare commits
35 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
235403bc11 | ||
|
102f08cd28 | ||
|
7a8ea224bc | ||
|
504efb2448 | ||
|
f07344444d | ||
|
473903b70e | ||
|
1438cf7bd4 | ||
|
e740a9559f | ||
|
2b58eece50 | ||
|
29f404bce3 | ||
|
6c966f2afa | ||
|
70d236a062 | ||
|
d61fb7f77a | ||
|
1aed4ff2a5 | ||
|
75c0a750c1 | ||
|
a47ac01eb6 | ||
|
156855ab7e | ||
|
09672a2dcc | ||
|
f2c898c92d | ||
|
f931a0ecee | ||
|
61f39c4bd0 | ||
|
1b9ce34e2c | ||
|
17e76ab6f0 | ||
|
94ef57c994 | ||
|
05d1ca5841 | ||
|
55b468ded7 | ||
|
0fcafa2edd | ||
|
7405955ab5 | ||
|
e6564509e1 | ||
|
bbd5b2817c | ||
|
73f2a87aa0 | ||
|
abcdf07ef4 | ||
|
a3081bce47 | ||
|
47d5529840 | ||
|
fa44794dfd |
45
.github/workflows/builds.yaml
vendored
Normal file
45
.github/workflows/builds.yaml
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
name: CI for Python versions
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
schedule:
|
||||
- cron: '0 16 * * 5'
|
||||
|
||||
jobs:
|
||||
linting:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/setup-python@v5
|
||||
- run: pip install ruff
|
||||
- run: |
|
||||
ruff check .
|
||||
build:
|
||||
needs: linting
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14.0-rc.2"]
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get install --no-install-recommends --no-install-suggests --yes \
|
||||
ffmpeg \
|
||||
gir1.2-gdkpixbuf-2.0 \
|
||||
gir1.2-poppler-0.18 \
|
||||
gir1.2-rsvg-2.0 \
|
||||
libimage-exiftool-perl \
|
||||
python3-gi-cairo \
|
||||
libcairo2-dev \
|
||||
libgirepository-2.0-dev \
|
||||
libgirepository1.0-dev \
|
||||
gobject-introspection \
|
||||
python3-mutagen
|
||||
pip install .
|
||||
- name: Build and run the testsuite
|
||||
run: python3 -m unittest discover -v
|
@@ -1,97 +0,0 @@
|
||||
variables:
|
||||
CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images
|
||||
GIT_DEPTH: "5"
|
||||
GIT_STRATEGY: clone
|
||||
|
||||
stages:
|
||||
- linting
|
||||
- test
|
||||
|
||||
.prepare_env: &prepare_env
|
||||
before_script: # This is needed to not run the testsuite as root
|
||||
- useradd --home-dir ${CI_PROJECT_DIR} mat2
|
||||
- chown -R mat2 .
|
||||
|
||||
linting:ruff:
|
||||
image: $CONTAINER_REGISTRY:linting
|
||||
stage: linting
|
||||
script:
|
||||
- apt update
|
||||
- apt install -qqy --no-install-recommends python3-venv
|
||||
- python3 -m venv venv
|
||||
- source venv/bin/activate
|
||||
- pip3 install ruff
|
||||
- ruff check .
|
||||
|
||||
linting:mypy:
|
||||
image: $CONTAINER_REGISTRY:linting
|
||||
stage: linting
|
||||
script:
|
||||
- mypy --ignore-missing-imports mat2 libmat2/*.py
|
||||
|
||||
tests:archlinux:
|
||||
image: $CONTAINER_REGISTRY:archlinux
|
||||
stage: test
|
||||
script:
|
||||
- python3 -m unittest discover -v
|
||||
|
||||
tests:debian:
|
||||
image: $CONTAINER_REGISTRY:debian
|
||||
stage: test
|
||||
<<: *prepare_env
|
||||
script:
|
||||
- apt-get -qqy purge bubblewrap
|
||||
- su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/"
|
||||
- su - mat2 -c "python3-coverage report --fail-under=95 -m --include 'libmat2/*'"
|
||||
|
||||
tests:debian_with_bubblewrap:
|
||||
image: $CONTAINER_REGISTRY:debian
|
||||
stage: test
|
||||
allow_failure: true
|
||||
<<: *prepare_env
|
||||
script:
|
||||
- apt-get -qqy install bubblewrap
|
||||
- python3 -m unittest discover -v
|
||||
|
||||
tests:fedora:
|
||||
image: $CONTAINER_REGISTRY:fedora
|
||||
stage: test
|
||||
script:
|
||||
- python3 -m unittest discover -v
|
||||
|
||||
tests:gentoo:
|
||||
image: $CONTAINER_REGISTRY:gentoo
|
||||
stage: test
|
||||
<<: *prepare_env
|
||||
script:
|
||||
- su - mat2 -c "python3 -m unittest discover -v"
|
||||
|
||||
tests:python3.7:
|
||||
image: $CONTAINER_REGISTRY:python3.7
|
||||
stage: test
|
||||
script:
|
||||
- python3 -m unittest discover -v
|
||||
|
||||
tests:python3.8:
|
||||
image: $CONTAINER_REGISTRY:python3.8
|
||||
stage: test
|
||||
script:
|
||||
- python3 -m unittest discover -v
|
||||
|
||||
tests:python3.9:
|
||||
image: $CONTAINER_REGISTRY:python3.9
|
||||
stage: test
|
||||
script:
|
||||
- python3 -m unittest discover -v
|
||||
|
||||
tests:python3.10:
|
||||
image: $CONTAINER_REGISTRY:python3.10
|
||||
stage: test
|
||||
script:
|
||||
- python3 -m unittest discover -v
|
||||
|
||||
tests:python3.11:
|
||||
image: $CONTAINER_REGISTRY:python3.11
|
||||
stage: test
|
||||
script:
|
||||
- python3 -m unittest discover -v
|
18
.pylintrc
18
.pylintrc
@@ -1,18 +0,0 @@
|
||||
[FORMAT]
|
||||
good-names=e,f,i,x,s
|
||||
max-locals=20
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
disable=
|
||||
fixme,
|
||||
invalid-name,
|
||||
duplicate-code,
|
||||
missing-docstring,
|
||||
protected-access,
|
||||
abstract-method,
|
||||
wrong-import-position,
|
||||
catching-non-exception,
|
||||
cell-var-from-loop,
|
||||
locally-disabled,
|
||||
raise-missing-from,
|
||||
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
|
@@ -1,3 +1,11 @@
|
||||
# 0.13.5 - 2025-01-09
|
||||
- Keep orientation metadata on jpeg and tiff files
|
||||
- Improve cairo-related error/exceptions handling
|
||||
- Improve the logging
|
||||
- Improve the sandboxing
|
||||
- Improve Python3.12 support
|
||||
- Improve MSOffice documents handling
|
||||
|
||||
# 0.13.4 - 2023-08-02
|
||||
|
||||
- Add documentation about mat2 on OSX
|
||||
|
@@ -1,9 +1,9 @@
|
||||
# Contributing to mat2
|
||||
|
||||
The main repository for mat2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
|
||||
The main repository for mat2 is on [github]( https://github.com/jvoisin/mat2 ),
|
||||
but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer.
|
||||
|
||||
Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
|
||||
Do feel free to pick up [an issue]( https://github.com/jvoisin/mat2/issues )
|
||||
and to send a pull-request.
|
||||
|
||||
Before sending the pull-request, please do check that everything is fine by
|
||||
@@ -27,11 +27,11 @@ Since mat2 is written in Python3, please conform as much as possible to the
|
||||
|
||||
# Doing a release
|
||||
|
||||
1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md)
|
||||
2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file
|
||||
3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file
|
||||
4. Update the version in the [pyproject.toml](https://0xacab.org/jvoisin/mat2/blob/master/pyproject.toml) file
|
||||
5. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1)
|
||||
1. Update the [changelog](https://github.com/jvoisin/mat2/blob/master/CHANGELOG.md)
|
||||
2. Update the version in the [mat2](https://github.com/jvoisin/mat2/blob/master/mat2) file
|
||||
3. Update the version in the [setup.py](https://github.com/jvoisin/mat2/blob/master/setup.py) file
|
||||
4. Update the version in the [pyproject.toml](https://github.com/jvoisin/mat2/blob/master/pyproject.toml) file
|
||||
5. Update the version and date in the [man page](https://github.com/jvoisin/mat2/blob/master/doc/mat2.1)
|
||||
6. Commit the modified files
|
||||
7. Create a tag with `git tag -s $VERSION`
|
||||
8. Push the commit with `git push origin master`
|
||||
@@ -39,7 +39,7 @@ Since mat2 is written in Python3, please conform as much as possible to the
|
||||
10. Download the gitlab archive of the release
|
||||
11. Diff it against the local copy
|
||||
12. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
|
||||
13. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
|
||||
13. Upload the signature on Gitlab's [tag page](https://github.com/jvoisin/mat2/tags) and add the changelog there
|
||||
14. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
|
||||
15. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
|
||||
16. Do the secret release dance
|
||||
|
@@ -19,7 +19,7 @@ installed, mat2 uses it to sandbox any external processes it invokes.
|
||||
## Arch Linux
|
||||
|
||||
Thanks to [kpcyrd](https://archlinux.org/packages/?maintainer=kpcyrd), there is a package available on
|
||||
[Arch linux's AUR](https://archlinux.org/packages/community/any/mat2/).
|
||||
[Arch linux's AUR](https://archlinux.org/packages/extra/any/mat2/).
|
||||
|
||||
## Debian
|
||||
|
||||
|
194
README.md
194
README.md
@@ -1,193 +1 @@
|
||||
```
|
||||
_____ _____ _____ ___
|
||||
| | _ |_ _|_ | Keep your data,
|
||||
| | | | |_| | | | | _| trash your meta!
|
||||
|_|_|_|_| |_| |_| |___|
|
||||
|
||||
```
|
||||
|
||||
# Metadata and privacy
|
||||
|
||||
Metadata consist of information that characterizes data.
|
||||
Metadata are used to provide documentation for data products.
|
||||
In essence, metadata answer who, what, when, where, why, and how about
|
||||
every facet of the data that are being documented.
|
||||
|
||||
Metadata within a file can tell a lot about you.
|
||||
Cameras record data about when a picture was taken and what
|
||||
camera was used. Office documents like PDF or Office automatically add
|
||||
author and company information to documents and spreadsheets.
|
||||
Maybe you don't want to disclose this information.
|
||||
|
||||
This is precisely the job of mat2: getting rid, as much as possible, of
|
||||
metadata.
|
||||
|
||||
mat2 provides:
|
||||
- a library called `libmat2`;
|
||||
- a command line tool called `mat2`,
|
||||
- a service menu for Dolphin, KDE's default file manager
|
||||
|
||||
If you prefer a regular graphical user interface, you might be interested in
|
||||
[Metadata Cleaner](https://metadatacleaner.romainvigier.fr/), which is using
|
||||
`mat2` under the hood.
|
||||
|
||||
# Requirements
|
||||
|
||||
- `python3-mutagen` for audio support
|
||||
- `python3-gi-cairo` and `gir1.2-poppler-0.18` for PDF support
|
||||
- `gir1.2-gdkpixbuf-2.0` for images support
|
||||
- `gir1.2-rsvg-2.0` for svg support
|
||||
- `FFmpeg`, optionally, for video support
|
||||
- `libimage-exiftool-perl` for everything else
|
||||
- `bubblewrap`, optionally, for sandboxing
|
||||
|
||||
Please note that mat2 requires at least Python3.5.
|
||||
|
||||
# Requirements setup on macOS (OS X) using [Homebrew](https://brew.sh/)
|
||||
|
||||
```bash
|
||||
brew install exiftool cairo pygobject3 poppler gdk-pixbuf librsvg ffmpeg
|
||||
```
|
||||
|
||||
# Running the test suite
|
||||
|
||||
```bash
|
||||
$ python3 -m unittest discover -v
|
||||
```
|
||||
|
||||
And if you want to see the coverage:
|
||||
|
||||
```bash
|
||||
$ python3-coverage run --branch -m unittest discover -s tests/
|
||||
$ python3-coverage report -m --include 'libmat2/*'
|
||||
```
|
||||
|
||||
# How to use mat2
|
||||
|
||||
```
|
||||
usage: mat2 [-h] [-V] [--unknown-members policy] [--inplace] [--no-sandbox]
|
||||
[-v] [-l] [--check-dependencies] [-L | -s]
|
||||
[files [files ...]]
|
||||
|
||||
Metadata anonymisation toolkit 2
|
||||
|
||||
positional arguments:
|
||||
files the files to process
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-V, --verbose show more verbose status information
|
||||
--unknown-members policy
|
||||
how to handle unknown members of archive-style files
|
||||
(policy should be one of: abort, omit, keep) [Default:
|
||||
abort]
|
||||
--inplace clean in place, without backup
|
||||
--no-sandbox Disable bubblewrap's sandboxing
|
||||
-v, --version show program's version number and exit
|
||||
-l, --list list all supported fileformats
|
||||
--check-dependencies check if mat2 has all the dependencies it needs
|
||||
-L, --lightweight remove SOME metadata
|
||||
-s, --show list harmful metadata detectable by mat2 without
|
||||
removing them
|
||||
```
|
||||
|
||||
Note that mat2 **will not** clean files in-place, but will produce, for
|
||||
example, with a file named "myfile.png" a cleaned version named
|
||||
"myfile.cleaned.png".
|
||||
|
||||
## Web interface
|
||||
|
||||
It's possible to run mat2 as a web service, via
|
||||
[mat2-web](https://0xacab.org/jvoisin/mat2-web).
|
||||
|
||||
If you're using WordPress, you might be interested in [wp-mat](https://git.autistici.org/noblogs/wp-mat)
|
||||
and [wp-mat-server](https://git.autistici.org/noblogs/wp-mat-server).
|
||||
|
||||
## Desktop GUI
|
||||
|
||||
For GNU/Linux desktops, it's possible to use the
|
||||
[Metadata Cleaner](https://gitlab.com/rmnvgr/metadata-cleaner) GTK application.
|
||||
|
||||
# Supported formats
|
||||
|
||||
The following formats are supported: avi, bmp, css, epub/ncx, flac, gif, jpeg,
|
||||
m4a/mp2/mp3/…, mp4, odc/odf/odg/odi/odp/ods/odt/…, ogg/opus/oga/spx/…, pdf,
|
||||
png, ppm, pptx/xlsx/docx/…, svg/svgz/…, tar/tar.gz/tar.bz2/tar.xz/…, tiff,
|
||||
torrent, wav, wmv, zip, …
|
||||
|
||||
# Notes about detecting metadata
|
||||
|
||||
While mat2 is doing its very best to display metadata when the `--show` flag is
|
||||
passed, it doesn't mean that a file is clean from any metadata if mat2 doesn't
|
||||
show any. There is no reliable way to detect every single possible metadata for
|
||||
complex file formats.
|
||||
|
||||
This is why you shouldn't rely on metadata's presence to decide if your file must
|
||||
be cleaned or not.
|
||||
|
||||
# Notes about the lightweight mode
|
||||
|
||||
By default, mat2 might alter a bit the data of your files, in order to remove
|
||||
as much metadata as possible. For example, texts in PDF might not be selectable anymore,
|
||||
compressed images might get compressed again, …
|
||||
Since some users might be willing to trade some metadata's presence in exchange
|
||||
of the guarantee that mat2 won't modify the data of their files, there is the
|
||||
`-L` flag that precisely does that.
|
||||
|
||||
# Related software
|
||||
|
||||
- The first iteration of [MAT](https://mat.boum.org)
|
||||
- [Exiftool](https://sno.phy.queensu.ca/~phil/exiftool/mat)
|
||||
- [pdf-redact-tools](https://github.com/firstlookmedia/pdf-redact-tools), that
|
||||
tries to deal with *printer dots* too.
|
||||
- [pdfparanoia](https://github.com/kanzure/pdfparanoia), that removes
|
||||
watermarks from PDF.
|
||||
- [Scrambled Exif](https://f-droid.org/packages/com.jarsilio.android.scrambledeggsif/),
|
||||
an open-source Android application to remove metadata from pictures.
|
||||
- [Dangerzone](https://dangerzone.rocks/), designed to sanitize harmful documents
|
||||
into harmless ones.
|
||||
|
||||
# Contact
|
||||
|
||||
If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues)
|
||||
or the [mailing list](https://www.autistici.org/mailman/listinfo/mat-dev)
|
||||
Should a more private contact be needed (eg. for reporting security issues),
|
||||
you can email Julien (jvoisin) Voisin at `julien.voisin+mat2@dustri.org`,
|
||||
using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`.
|
||||
|
||||
# Donations
|
||||
|
||||
If you want to donate some money, please give it to [Tails]( https://tails.boum.org/donate/?r=contribute ).
|
||||
|
||||
# License
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Copyright 2018 Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org>
|
||||
Copyright 2016 Marie-Rose for mat2's logo
|
||||
|
||||
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
|
||||
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
|
||||
|
||||
The `narrated_powerpoint_presentation.pptx` file is in the public domain.
|
||||
|
||||
# Thanks
|
||||
|
||||
mat2 wouldn't exist without:
|
||||
|
||||
- the [Google Summer of Code](https://summerofcode.withgoogle.com/);
|
||||
- the fine people from [Tails]( https://tails.boum.org);
|
||||
- friends
|
||||
|
||||
Many thanks to them!
|
||||
# This repository is deprecated, please use https://github.com/jvoisin/mat2 instead
|
@@ -19,14 +19,14 @@ details.
|
||||
# jpegoptim, optipng, …
|
||||
|
||||
While designed to reduce as much as possible the size of pictures,
|
||||
those software can be used to remove metadata. They usually have very good
|
||||
those software can be used to remove metadata. They usually have excellent
|
||||
support for a single picture format, and can be used in place of mat2 for them.
|
||||
|
||||
|
||||
# PDF Redact Tools
|
||||
|
||||
[PDF Redact Tools](https://github.com/firstlookmedia/pdf-redact-tools) is
|
||||
a software developed by the people from [First Look
|
||||
software developed by the people from [First Look
|
||||
Media](https://firstlook.media/), the entity behind, amongst other things,
|
||||
[The Intercept](https://theintercept.com/).
|
||||
|
||||
@@ -34,13 +34,13 @@ The tool uses roughly the same approach than mat2 to deal with PDF,
|
||||
which is unfortunately the only fileformat that it does support.
|
||||
It's interesting to note that it has counter-measures against
|
||||
[yellow dots](https://en.wikipedia.org/wiki/Machine_Identification_Code),
|
||||
a capacity that mat2 [doesn't possess yet](https://0xacab.org/jvoisin/mat2/issues/43).
|
||||
a capacity that mat2 doesn't have.
|
||||
|
||||
|
||||
# Exiv2
|
||||
|
||||
[Exiv2](https://www.exiv2.org/) was considered for mat2,
|
||||
but it currently [misses a lot of metadata](https://0xacab.org/jvoisin/mat2/issues/85)
|
||||
but it currently misses a lot of metadata.
|
||||
|
||||
|
||||
# Other non-open-source software/online services
|
||||
|
@@ -1,4 +1,4 @@
|
||||
.TH mat2 "1" "August 2023" "mat2 0.13.4" "User Commands"
|
||||
.TH mat2 "1" "January 2025" "mat2 0.13.5" "User Commands"
|
||||
|
||||
.SH NAME
|
||||
mat2 \- the metadata anonymisation toolkit 2
|
||||
@@ -84,7 +84,7 @@ but keep in mind by doing so, some metadata \fBwon't be cleaned\fR.
|
||||
|
||||
While mat2 does its very best to remove every single metadata,
|
||||
it's still in beta, and \fBsome\fR might remain. Should you encounter
|
||||
some issues, check the bugtracker: https://0xacab.org/jvoisin/mat2/issues
|
||||
some issues, check the bugtracker: https://github.com/jvoisin/mat2/issues
|
||||
.PP
|
||||
Please use accordingly and be careful.
|
||||
|
||||
|
@@ -34,7 +34,10 @@ class AbstractParser(abc.ABC):
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_meta(self) -> Dict[str, Union[str, Dict]]:
|
||||
"""Return all the metadata of the current file"""
|
||||
"""Return all the metadata of the current file
|
||||
|
||||
:raises RuntimeError: Raised if the cleaning process went wrong.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def remove_all(self) -> bool:
|
||||
|
@@ -152,7 +152,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
self.filename, member_name, full_path)
|
||||
break
|
||||
|
||||
zin.extract(member=item, path=temp_folder)
|
||||
try:
|
||||
zin.extract(member=item, path=temp_folder)
|
||||
except OSError as e:
|
||||
logging.error("Unable to extraxt %s from %s: %s", item, self.filename, e)
|
||||
|
||||
os.chmod(full_path, stat.S_IRUSR)
|
||||
|
||||
@@ -161,6 +164,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
|
||||
member_parser, _ = parser_factory.get_parser(full_path) # type: ignore
|
||||
if member_parser:
|
||||
member_parser.sandbox = self.sandbox
|
||||
local_meta = {**local_meta, **member_parser.get_meta()}
|
||||
|
||||
if local_meta:
|
||||
@@ -248,6 +252,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
|
||||
abort = True
|
||||
continue
|
||||
else:
|
||||
member_parser.sandbox = self.sandbox
|
||||
if member_parser.remove_all() is False:
|
||||
logging.warning("In file %s, something went wrong \
|
||||
with the cleaning of %s \
|
||||
|
@@ -82,6 +82,9 @@ class FLACParser(MutagenParser):
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(picture.data)
|
||||
p, _ = parser_factory.get_parser(fname) # type: ignore
|
||||
if p is None:
|
||||
raise ValueError
|
||||
p.sandbox = self.sandbox
|
||||
# Mypy chokes on ternaries :/
|
||||
meta[name] = p.get_meta() if p else 'harmful data' # type: ignore
|
||||
os.remove(fname)
|
||||
|
@@ -116,6 +116,7 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
|
||||
|
||||
_, extension = os.path.splitext(self.filename)
|
||||
pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
|
||||
pixbuf = GdkPixbuf.Pixbuf.apply_embedded_orientation(pixbuf)
|
||||
if extension.lower() == '.jpg':
|
||||
extension = '.jpeg' # gdk is picky
|
||||
elif extension.lower() == '.tif':
|
||||
@@ -138,7 +139,7 @@ class JPGParser(GdkPixbufAbstractParser):
|
||||
'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
|
||||
'ColorComponents', 'EncodingProcess', 'JFIFVersion',
|
||||
'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
|
||||
'YResolution', 'Megapixels', 'ImageHeight'}
|
||||
'YResolution', 'Megapixels', 'ImageHeight', 'Orientation'}
|
||||
|
||||
|
||||
class TiffParser(GdkPixbufAbstractParser):
|
||||
@@ -152,7 +153,7 @@ class TiffParser(GdkPixbufAbstractParser):
|
||||
'FileInodeChangeDate', 'FileModifyDate', 'FileName',
|
||||
'FilePermissions', 'FileSize', 'FileType',
|
||||
'FileTypeExtension', 'ImageHeight', 'ImageSize',
|
||||
'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'}
|
||||
'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile', 'Orientation'}
|
||||
|
||||
|
||||
class PPMParser(abstract.AbstractParser):
|
||||
@@ -195,3 +196,15 @@ class HEICParser(exiftool.ExiftoolParser):
|
||||
|
||||
def remove_all(self) -> bool:
|
||||
return self._lightweight_cleanup()
|
||||
|
||||
class WEBPParser(GdkPixbufAbstractParser):
|
||||
mimetypes = {'image/webp'}
|
||||
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
|
||||
'Directory', 'FileSize', 'FileModifyDate',
|
||||
'FileAccessDate', "FileInodeChangeDate",
|
||||
'FilePermissions', 'FileType', 'FileTypeExtension',
|
||||
'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
|
||||
'ColorComponents', 'EncodingProcess', 'JFIFVersion',
|
||||
'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
|
||||
'YResolution', 'Megapixels', 'ImageHeight', 'Orientation',
|
||||
'HorizontalScale', 'VerticalScale', 'VP8Version'}
|
||||
|
@@ -38,7 +38,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
|
||||
for c in tree.getroot():
|
||||
c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
|
||||
@@ -147,7 +147,7 @@ class MSOfficeParser(ZipParser):
|
||||
# Additional presentation-wide properties like printing properties,
|
||||
# presentation show properties etc.
|
||||
r'^(?:word|ppt|xl)/presProps\.xml$',
|
||||
r'^(?:word|ppt|xl)/comments[0-9]+\.xml$',
|
||||
r'^(?:word|ppt|xl)/comments[0-9]*\.xml$',
|
||||
r'^(?:word|ppt|xl)/threadedComments/threadedComment[0-9]*\.xml$',
|
||||
r'^(?:word|ppt|xl)/commentsExtended\.xml$',
|
||||
r'^(?:word|ppt|xl)/commentsExtensible\.xml$',
|
||||
@@ -220,7 +220,7 @@ class MSOfficeParser(ZipParser):
|
||||
for element in elements_to_remove:
|
||||
parent_map[element].remove(element)
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
@@ -250,7 +250,7 @@ class MSOfficeParser(ZipParser):
|
||||
for element in elements_to_remove:
|
||||
parent_map[element].remove(element)
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
@@ -283,11 +283,82 @@ class MSOfficeParser(ZipParser):
|
||||
for children in element.iterfind('./*'):
|
||||
elements_ins.append((element, position, children))
|
||||
break
|
||||
|
||||
for (element, position, children) in elements_ins:
|
||||
parent_map[element].insert(position, children)
|
||||
|
||||
# the list can sometimes contain duplicate elements, so don't remove
|
||||
# until all children have been processed
|
||||
for (element, position, children) in elements_ins:
|
||||
if element in parent_map[element]:
|
||||
parent_map[element].remove(element)
|
||||
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def __remove_document_comment_meta(full_path: str) -> bool:
|
||||
try:
|
||||
tree, namespace = _parse_xml(full_path)
|
||||
except ET.ParseError as e: # pragma: no cover
|
||||
logging.error("Unable to parse %s: %s", full_path, e)
|
||||
return False
|
||||
|
||||
# search the docs to see if we can bail early
|
||||
range_start = tree.find('.//w:commentRangeStart', namespace)
|
||||
range_end = tree.find('.//w:commentRangeEnd', namespace)
|
||||
references = tree.find('.//w:commentReference', namespace)
|
||||
if range_start is None and range_end is None and references is None:
|
||||
return True # No comment meta tags are present
|
||||
|
||||
parent_map = {c:p for p in tree.iter() for c in p}
|
||||
|
||||
# iterate over the elements and add them to list
|
||||
elements_del = list()
|
||||
for element in tree.iterfind('.//w:commentRangeStart', namespace):
|
||||
elements_del.append(element)
|
||||
for element in tree.iterfind('.//w:commentRangeEnd', namespace):
|
||||
elements_del.append(element)
|
||||
for element in tree.iterfind('.//w:commentReference', namespace):
|
||||
elements_del.append(element)
|
||||
|
||||
# remove the elements
|
||||
for element in elements_del:
|
||||
parent_map[element].remove(element)
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
def __remove_document_xml_rels_members(self, full_path: str) -> bool:
|
||||
""" Remove the dangling references from the word/_rels/document.xml.rels file, since MS office doesn't like them.
|
||||
"""
|
||||
try:
|
||||
tree, namespace = _parse_xml(full_path)
|
||||
except ET.ParseError as e: # pragma: no cover
|
||||
logging.error("Unable to parse %s: %s", full_path, e)
|
||||
return False
|
||||
|
||||
if len(namespace.items()) != 1: # pragma: no cover
|
||||
logging.debug("Got several namespaces for Types: %s", namespace.items())
|
||||
|
||||
removed_fnames = set()
|
||||
with zipfile.ZipFile(self.filename) as zin:
|
||||
for fname in [item.filename for item in zin.infolist()]:
|
||||
for file_to_omit in self.files_to_omit:
|
||||
if file_to_omit.search(fname):
|
||||
matches = map(lambda r: r.search(fname), self.files_to_keep)
|
||||
if any(matches): # the file is in the allowlist
|
||||
continue
|
||||
removed_fnames.add(fname)
|
||||
break
|
||||
|
||||
root = tree.getroot()
|
||||
for item in root.findall('{%s}Relationship' % namespace['']):
|
||||
name = 'word/' + item.attrib['Target'] # add the word/ prefix to the path, since all document rels are in the word/ directory
|
||||
if name in removed_fnames:
|
||||
root.remove(item)
|
||||
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
def __remove_content_type_members(self, full_path: str) -> bool:
|
||||
@@ -320,7 +391,7 @@ class MSOfficeParser(ZipParser):
|
||||
if name in removed_fnames:
|
||||
root.remove(item)
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
def _final_checks(self) -> bool:
|
||||
@@ -355,7 +426,7 @@ class MSOfficeParser(ZipParser):
|
||||
|
||||
for item in tree.iterfind('.//p14:creationId', namespace):
|
||||
item.set('val', '%s' % random.randint(0, 2**32))
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
@@ -371,7 +442,7 @@ class MSOfficeParser(ZipParser):
|
||||
|
||||
for item in tree.iterfind('.//p:sldMasterId', namespace):
|
||||
item.set('id', '%s' % random.randint(0, 2**32))
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
def _specific_cleanup(self, full_path: str) -> bool:
|
||||
@@ -379,7 +450,7 @@ class MSOfficeParser(ZipParser):
|
||||
if os.stat(full_path).st_size == 0: # Don't process empty files
|
||||
return True
|
||||
|
||||
if not full_path.endswith('.xml'):
|
||||
if not full_path.endswith(('.xml', '.xml.rels')):
|
||||
return True
|
||||
|
||||
if self.__randomize_creationId(full_path) is False:
|
||||
@@ -396,6 +467,13 @@ class MSOfficeParser(ZipParser):
|
||||
# this file contains the revisions
|
||||
if self.__remove_revisions(full_path) is False:
|
||||
return False # pragma: no cover
|
||||
# remove comment references and ranges
|
||||
if self.__remove_document_comment_meta(full_path) is False:
|
||||
return False # pragma: no cover
|
||||
elif full_path.endswith('/word/_rels/document.xml.rels'):
|
||||
# similar to the above, but for the document.xml.rels file
|
||||
if self.__remove_document_xml_rels_members(full_path) is False: # pragma: no cover
|
||||
return False
|
||||
elif full_path.endswith('/docProps/app.xml'):
|
||||
# This file must be present and valid,
|
||||
# so we're removing as much as we can.
|
||||
@@ -447,7 +525,7 @@ class MSOfficeParser(ZipParser):
|
||||
# see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute
|
||||
with open(full_path, 'rb') as f:
|
||||
text = f.read()
|
||||
out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, 1)
|
||||
out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, count=1)
|
||||
with open(full_path, 'wb') as f:
|
||||
f.write(out)
|
||||
|
||||
@@ -514,7 +592,7 @@ class LibreOfficeParser(ZipParser):
|
||||
for changes in text.iterfind('.//text:tracked-changes', namespace):
|
||||
text.remove(changes)
|
||||
|
||||
tree.write(full_path, xml_declaration=True)
|
||||
tree.write(full_path, xml_declaration=True, encoding='utf-8')
|
||||
return True
|
||||
|
||||
def _specific_cleanup(self, full_path: str) -> bool:
|
||||
|
@@ -36,7 +36,10 @@ class PDFParser(abstract.AbstractParser):
|
||||
|
||||
def remove_all(self) -> bool:
|
||||
if self.lightweight_cleaning is True:
|
||||
return self.__remove_all_lightweight()
|
||||
try:
|
||||
return self.__remove_all_lightweight()
|
||||
except (cairo.Error, MemoryError) as e:
|
||||
raise RuntimeError(e)
|
||||
return self.__remove_all_thorough()
|
||||
|
||||
def __remove_all_lightweight(self) -> bool:
|
||||
@@ -133,8 +136,8 @@ class PDFParser(abstract.AbstractParser):
|
||||
# It should(tm) be alright though, because cairo's output format
|
||||
# for metadata is fixed.
|
||||
with open(out_file, 'rb') as f:
|
||||
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(), 0,
|
||||
re.DOTALL | re.IGNORECASE)
|
||||
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(),
|
||||
count=0, flags=re.DOTALL | re.IGNORECASE)
|
||||
with open(out_file, 'wb') as f:
|
||||
f.write(out)
|
||||
|
||||
|
@@ -20,7 +20,7 @@ class CSSParser(abstract.AbstractParser):
|
||||
content = f.read()
|
||||
except UnicodeDecodeError: # pragma: no cover
|
||||
raise ValueError
|
||||
cleaned = re.sub(r'/\*.*?\*/', '', content, 0, self.flags)
|
||||
cleaned = re.sub(r'/\*.*?\*/', '', content, count=0, flags=self.flags)
|
||||
with open(self.output_filename, 'w', encoding='utf-8') as f:
|
||||
f.write(cleaned)
|
||||
return True
|
||||
|
8
mat2
8
mat2
@@ -17,7 +17,7 @@ except ValueError as ex:
|
||||
print(ex)
|
||||
sys.exit(1)
|
||||
|
||||
__version__ = '0.13.4'
|
||||
__version__ = '0.13.5'
|
||||
|
||||
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)
|
||||
|
||||
@@ -57,8 +57,8 @@ def create_arg_parser() -> argparse.ArgumentParser:
|
||||
', '.join(p.value for p in UnknownMemberPolicy))
|
||||
parser.add_argument('--inplace', action='store_true',
|
||||
help='clean in place, without backup')
|
||||
parser.add_argument('--no-sandbox', dest='sandbox', action='store_true',
|
||||
default=False, help='Disable bubblewrap\'s sandboxing')
|
||||
parser.add_argument('--no-sandbox', dest='sandbox', action='store_false',
|
||||
default=True, help='Disable bubblewrap\'s sandboxing')
|
||||
|
||||
excl_group = parser.add_mutually_exclusive_group()
|
||||
excl_group.add_argument('files', nargs='*', help='the files to process',
|
||||
@@ -186,7 +186,7 @@ def main() -> int:
|
||||
args = arg_parser.parse_args()
|
||||
|
||||
if args.verbose:
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
logging.getLogger(__name__).setLevel(logging.DEBUG)
|
||||
|
||||
if not args.files:
|
||||
if args.list:
|
||||
|
@@ -1,8 +1,19 @@
|
||||
[project]
|
||||
name = "mat"
|
||||
version = "0.13.4"
|
||||
name = "mat2"
|
||||
version = "0.13.5"
|
||||
description = "mat2 is a metadata removal tool, supporting a wide range of commonly used file formats, written in python3: at its core, it's a library, used by an eponymous command-line interface, as well as several file manager extensions."
|
||||
readme = "README.md"
|
||||
license = {file = "LICENSE"}
|
||||
requires-python = ">=3.9"
|
||||
dependencies = [
|
||||
'mutagen',
|
||||
'PyGObject',
|
||||
'pycairo',
|
||||
]
|
||||
[project.urls]
|
||||
Repository = "https://github.com/jvoisin/mat2"
|
||||
Issues = "https://github.com/jvoisin/mat2/issues"
|
||||
Changelog = "https://github.com/jvoisin/mat2/blob/master/CHANGELOG.md"
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py39"
|
||||
|
8
setup.py
8
setup.py
@@ -5,13 +5,13 @@ with open("README.md", encoding='utf-8') as fh:
|
||||
|
||||
setuptools.setup(
|
||||
name="mat2",
|
||||
version='0.13.4',
|
||||
version='0.13.5',
|
||||
author="Julien (jvoisin) Voisin",
|
||||
author_email="julien.voisin+mat2@dustri.org",
|
||||
description="A handy tool to trash your metadata",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://0xacab.org/jvoisin/mat2",
|
||||
url="https://github.com/jvoisin/mat2",
|
||||
python_requires = '>=3.5.0',
|
||||
scripts=['mat2'],
|
||||
install_requires=[
|
||||
@@ -20,7 +20,7 @@ setuptools.setup(
|
||||
'pycairo',
|
||||
],
|
||||
packages=setuptools.find_packages(exclude=('tests', )),
|
||||
data_files = [('man/man1', ['doc/mat2.1'])],
|
||||
data_files = [('share/man/man1', ['doc/mat2.1'])],
|
||||
classifiers=[
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Environment :: Console",
|
||||
@@ -31,6 +31,6 @@ setuptools.setup(
|
||||
"Intended Audience :: End Users/Desktop",
|
||||
],
|
||||
project_urls={
|
||||
'bugtacker': 'https://0xacab.org/jvoisin/mat2/issues',
|
||||
'bugtacker': 'https://github.com/jvoisin/mat2/issues',
|
||||
},
|
||||
)
|
||||
|
BIN
tests/data/comment.docx
Normal file
BIN
tests/data/comment.docx
Normal file
Binary file not shown.
BIN
tests/data/dirty.webp
Normal file
BIN
tests/data/dirty.webp
Normal file
Binary file not shown.
After Width: | Height: | Size: 38 KiB |
BIN
tests/dirty.epub
BIN
tests/dirty.epub
Binary file not shown.
@@ -236,6 +236,11 @@ class TestGetMeta(unittest.TestCase):
|
||||
self.assertIn(b'i am a : various comment', stdout)
|
||||
self.assertIn(b'artist: jvoisin', stdout)
|
||||
|
||||
#def test_webp(self):
|
||||
# proc = subprocess.Popen(mat2_binary + ['--show', './tests/data/dirty.webp'],
|
||||
# stdout=subprocess.PIPE)
|
||||
# stdout, _ = proc.communicate()
|
||||
# self.assertIn(b'Warning: [minor] Improper EXIF header', stdout)
|
||||
|
||||
class TestControlCharInjection(unittest.TestCase):
|
||||
def test_jpg(self):
|
||||
|
@@ -14,7 +14,7 @@ from libmat2 import harmless, video, web, archive
|
||||
|
||||
# No need for logging messages: should something go wrong,
|
||||
# the testsuite _will_ fail.
|
||||
logger = logging.getLogger()
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.FATAL)
|
||||
|
||||
|
||||
|
@@ -4,6 +4,7 @@ import unittest
|
||||
import shutil
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import tarfile
|
||||
import tempfile
|
||||
import zipfile
|
||||
@@ -113,6 +114,11 @@ class TestGetMeta(unittest.TestCase):
|
||||
meta = p.get_meta()
|
||||
self.assertEqual(meta['Comment'], 'Created with GIMP')
|
||||
|
||||
#def test_webp(self):
|
||||
# p = images.WEBPParser('./tests/data/dirty.webp')
|
||||
# meta = p.get_meta()
|
||||
# self.assertEqual(meta['Warning'], '[minor] Improper EXIF header')
|
||||
|
||||
def test_ppm(self):
|
||||
p = images.PPMParser('./tests/data/dirty.ppm')
|
||||
meta = p.get_meta()
|
||||
@@ -333,6 +339,11 @@ class TestCleaning(unittest.TestCase):
|
||||
'parser': images.JPGParser,
|
||||
'meta': {'Comment': 'Created with GIMP'},
|
||||
'expected_meta': {},
|
||||
#}, {
|
||||
# 'name': 'webp',
|
||||
# 'parser': images.WEBPParser,
|
||||
# 'meta': {'Warning': '[minor] Improper EXIF header'},
|
||||
# 'expected_meta': {},
|
||||
}, {
|
||||
'name': 'wav',
|
||||
'parser': audio.WAVParser,
|
||||
@@ -508,7 +519,11 @@ class TestCleaning(unittest.TestCase):
|
||||
'TrackID': 1,
|
||||
'TrackLayer': 0,
|
||||
'TransferCharacteristics': 'BT.709',
|
||||
'VideoFullRangeFlag': 'Limited',
|
||||
},
|
||||
'extra_expected_meta': {
|
||||
'VideoFullRangeFlag': 0,
|
||||
}
|
||||
},{
|
||||
'name': 'wmv',
|
||||
'ffmpeg': 1,
|
||||
@@ -521,7 +536,43 @@ class TestCleaning(unittest.TestCase):
|
||||
'name': 'heic',
|
||||
'parser': images.HEICParser,
|
||||
'meta': {},
|
||||
'expected_meta': {},
|
||||
'expected_meta': {
|
||||
'BlueMatrixColumn': '0.14305 0.06061 0.71393',
|
||||
'BlueTRC': '(Binary data 32 bytes, use -b option to extract)',
|
||||
'CMMFlags': 'Not Embedded, Independent',
|
||||
'ChromaticAdaptation': '1.04788 0.02292 -0.05022 0.02959 0.99048 -0.01707 -0.00925 0.01508 0.75168',
|
||||
'ChromaticityChannel1': '0.64 0.33002',
|
||||
'ChromaticityChannel2': '0.3 0.60001',
|
||||
'ChromaticityChannel3': '0.15001 0.06',
|
||||
'ChromaticityChannels': 3,
|
||||
'ChromaticityColorant': 'Unknown',
|
||||
'ColorSpaceData': 'RGB ',
|
||||
'ConnectionSpaceIlluminant': '0.9642 1 0.82491',
|
||||
'DeviceAttributes': 'Reflective, Glossy, Positive, Color',
|
||||
'DeviceManufacturer': '',
|
||||
'DeviceMfgDesc': 'GIMP',
|
||||
'DeviceModel': '',
|
||||
'DeviceModelDesc': 'sRGB',
|
||||
'ExifByteOrder': 'Big-endian (Motorola, MM)',
|
||||
'GreenMatrixColumn': '0.38512 0.7169 0.09706',
|
||||
'GreenTRC': '(Binary data 32 bytes, use -b option to extract)',
|
||||
'MediaWhitePoint': '0.9642 1 0.82491',
|
||||
'PrimaryPlatform': 'Apple Computer Inc.',
|
||||
'ProfileCMMType': 'Little CMS',
|
||||
'ProfileClass': 'Display Device Profile',
|
||||
'ProfileConnectionSpace': 'XYZ ',
|
||||
'ProfileCopyright': 'Public Domain',
|
||||
'ProfileCreator': 'Little CMS',
|
||||
'ProfileDateTime': '2022:05:15 16:29:22',
|
||||
'ProfileDescription': 'GIMP built-in sRGB',
|
||||
'ProfileFileSignature': 'acsp',
|
||||
'ProfileID': 0,
|
||||
'ProfileVersion': '4.3.0',
|
||||
'RedMatrixColumn': '0.43604 0.22249 0.01392',
|
||||
'RedTRC': '(Binary data 32 bytes, use -b option to extract)',
|
||||
'RenderingIntent': 'Perceptual',
|
||||
'Warning': 'Bad IFD0 directory',
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
@@ -556,8 +607,13 @@ class TestCleaning(unittest.TestCase):
|
||||
meta = p2.get_meta()
|
||||
if meta:
|
||||
for k, v in p2.get_meta().items():
|
||||
self.assertIn(k, case['expected_meta'], '"%s" is not in "%s" (%s)' % (k, case['expected_meta'], case['name']))
|
||||
self.assertIn(str(case['expected_meta'][k]), str(v))
|
||||
self.assertIn(k, case['expected_meta'], '"%s" is not in "%s" (%s), with all of them being %s' % (k, case['expected_meta'], case['name'], p2.get_meta().items()))
|
||||
if str(case['expected_meta'][k]) in str(v):
|
||||
continue
|
||||
if 'extra_expected_meta' in case and k in case['extra_expected_meta']:
|
||||
if str(case['extra_expected_meta'][k]) in str(v):
|
||||
continue
|
||||
self.assertTrue(False, "got a different value (%s) than excepted (%s) for %s, with all of them being %s" % (str(v), meta, k, p2.get_meta().items()))
|
||||
self.assertTrue(p2.remove_all())
|
||||
|
||||
os.remove(target)
|
||||
@@ -583,14 +639,20 @@ class TestCleaning(unittest.TestCase):
|
||||
os.remove('./tests/data/clean.cleaned.html')
|
||||
os.remove('./tests/data/clean.cleaned.cleaned.html')
|
||||
|
||||
with open('./tests/data/clean.html', 'w') as f:
|
||||
f.write('<title><title><pouet/><meta/></title></title><test/>')
|
||||
p = web.HTMLParser('./tests/data/clean.html')
|
||||
self.assertTrue(p.remove_all())
|
||||
with open('./tests/data/clean.cleaned.html', 'r') as f:
|
||||
self.assertEqual(f.read(), '<title></title><test/>')
|
||||
if sys.version_info >= (3, 13):
|
||||
with open('./tests/data/clean.html', 'w') as f:
|
||||
f.write('<title><title><pouet/><meta/></title></title><test/>')
|
||||
with self.assertRaises(ValueError):
|
||||
p = web.HTMLParser('./tests/data/clean.html')
|
||||
else:
|
||||
with open('./tests/data/clean.html', 'w') as f:
|
||||
f.write('<title><title><pouet/><meta/></title></title><test/>')
|
||||
p = web.HTMLParser('./tests/data/clean.html')
|
||||
self.assertTrue(p.remove_all())
|
||||
with open('./tests/data/clean.cleaned.html', 'r') as f:
|
||||
self.assertEqual(f.read(), '<title></title><test/>')
|
||||
os.remove('./tests/data/clean.cleaned.html')
|
||||
os.remove('./tests/data/clean.html')
|
||||
os.remove('./tests/data/clean.cleaned.html')
|
||||
|
||||
with open('./tests/data/clean.html', 'w') as f:
|
||||
f.write('<test><title>Some<b>metadata</b><br/></title></test>')
|
||||
@@ -857,3 +919,97 @@ class TestComplexOfficeFiles(unittest.TestCase):
|
||||
|
||||
os.remove(target)
|
||||
os.remove(p.output_filename)
|
||||
|
||||
class TextDocx(unittest.TestCase):
    """Tests for how MSOfficeParser cleans a .docx file that contains comments.

    Every test copies ``comment.docx`` to a scratch file, cleans the copy and
    inspects the resulting archive. The scratch files are registered with
    ``addCleanup`` so that they are deleted even when an assertion fails,
    instead of being left behind for the following tests.
    """

    _dirty = './tests/data/comment.docx'
    _target = './tests/data/comment_clean.docx'
    _cleaned = './tests/data/comment_clean.cleaned.docx'
    # Matches the encoding attribute of an XML declaration, with either
    # kind of quote; it is searched case-insensitively.
    _utf8_declaration = b'encoding=(\'|\")UTF-8(\'|\")'

    @staticmethod
    def _remove_if_present(path):
        """Delete `path`, tolerating the case where it was never created."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    @staticmethod
    def _read_member(archive, member):
        """Return the raw bytes of `member` stored in the zip file `archive`."""
        with zipfile.ZipFile(archive) as zipin:
            return zipin.read(member)

    def _clean_copy(self):
        """Clean a scratch copy of the dirty document.

        Returns the path of the cleaned archive. Both the scratch copy and
        the cleaned output are scheduled for removal at the end of the test.
        """
        # Register the cleanups first, so nothing is leaked should the copy
        # or the cleaning itself blow up.
        self.addCleanup(self._remove_if_present, self._target)
        self.addCleanup(self._remove_if_present, self._cleaned)

        shutil.copy(self._dirty, self._target)
        p = office.MSOfficeParser(self._target)
        self.assertTrue(p.remove_all())
        return self._cleaned

    def test_comment_xml_is_removed(self):
        """`word/comments.xml` must be dropped from the cleaned archive."""
        with zipfile.ZipFile(self._dirty) as zipin:
            # The dirty file must contain 'word/comments.xml' to begin with
            self.assertIn('word/comments.xml', zipin.namelist())

        cleaned = self._clean_copy()

        with zipfile.ZipFile(cleaned) as zipin:
            # 'word/comments.xml' must no longer be in the zip
            self.assertNotIn('word/comments.xml', zipin.namelist())

    def test_xml_is_utf8(self):
        """`word/document.xml` must still declare UTF-8 after cleaning."""
        content = self._read_member(self._dirty, 'word/document.xml')
        # ensure encoding is utf-8
        match = re.search(self._utf8_declaration, content, re.IGNORECASE)
        self.assertIsNotNone(match)

        cleaned = self._clean_copy()

        content = self._read_member(cleaned, 'word/document.xml')
        # ensure encoding is still utf-8
        match = re.search(self._utf8_declaration, content, re.IGNORECASE)
        self.assertIsNotNone(match)

    def test_comment_references_are_removed(self):
        """Comment ranges and references must vanish from `word/document.xml`."""
        markers = (
            b'w:commentRangeStart',
            b'w:commentRangeEnd',
            b'w:commentReference',
        )

        content = self._read_member(self._dirty, 'word/document.xml')
        for r in markers:
            self.assertIn(r, content)

        cleaned = self._clean_copy()

        content = self._read_member(cleaned, 'word/document.xml')
        for r in markers:
            self.assertNotIn(r, content)

    def test_clean_document_xml_rels(self):
        """The relationship targeting `comments.xml` must be removed."""
        r = b'Target="comments.xml"'

        content = self._read_member(self._dirty, 'word/_rels/document.xml.rels')
        self.assertIn(r, content)

        cleaned = self._clean_copy()

        content = self._read_member(cleaned, 'word/_rels/document.xml.rels')
        self.assertNotIn(r, content)
|
||||
|
||||
|
@@ -23,6 +23,11 @@ class TestLightWeightCleaning(unittest.TestCase):
|
||||
'parser': images.JPGParser,
|
||||
'meta': {'Comment': 'Created with GIMP'},
|
||||
'expected_meta': {},
|
||||
#}, {
|
||||
# 'name': 'webp',
|
||||
# 'parser': images.WEBPParser,
|
||||
# 'meta': {'Warning': '[minor] Improper EXIF header'},
|
||||
# 'expected_meta': {},
|
||||
}, {
|
||||
'name': 'torrent',
|
||||
'parser': torrent.TorrentParser,
|
||||
@@ -33,7 +38,6 @@ class TestLightWeightCleaning(unittest.TestCase):
|
||||
'parser': images.TiffParser,
|
||||
'meta': {'ImageDescription': 'OLYMPUS DIGITAL CAMERA '},
|
||||
'expected_meta': {
|
||||
'Orientation': 'Horizontal (normal)',
|
||||
'ResolutionUnit': 'inches',
|
||||
'XResolution': 72,
|
||||
'YResolution': 72
|
||||
|
Reference in New Issue
Block a user