Extracting .MHT Contents with Python
Posted: 2024-06-19
Tags: Python
This is a dirty script to get the content out of .mht archives created by Internet Explorer.
It preserves directory structures but does not update any URLs inside of the dumped files.
It would probably be smarter to just use the email module to parse the file, but I wanted to understand how the .mht files were structured.
Only minimal error checking.
#!/usr/bin/env python3
from pathlib import Path
import re, sys, quopri, base64
INPUT_FILE = Path(sys.argv[1])
OUTPUT_DIR = INPUT_FILE.parent / INPUT_FILE.stem
OUTPUT_DIR.mkdir(exist_ok=True)
bound = ''
try:
with open(INPUT_FILE) as f:
for line in f:
if line.startswith("\tboundary="):
bound = '--' + line.split("boundary=")[1][1:-2]
break
name, encoding, lines = '', '', []
for line in f:
if name == '':
if line.startswith('Content-Transfer-Encoding:'):
encoding = line.split('Content-Transfer-Encoding: ')[1][:-1]
if line.startswith("Content-Location:"):
name = line.split("Content-Location: ")[1][:-1]
name = re.sub(r"[^A-Za-z0-9./-]", "_", name.split('//')[-1])
print(name, encoding)
else:
if line.startswith(bound):
fullname = OUTPUT_DIR / name
if fullname.exists():
print(f"{name} exists! Skipping...")
else:
print("Writing", name)
if encoding == 'quoted-printable':
qp = quopri.decodestring(''.join(lines))
qp = qp.decode('utf-8', errors='ignore')
fullname.parent.mkdir(parents=True, exist_ok=True)
fullname.write_text(qp, encoding='utf-8')
elif encoding == 'base64':
b64 = base64.b64decode(''.join(lines))
fullname.parent.mkdir(parents=True, exist_ok=True)
fullname.write_bytes(b64)
name, encoding, lines = '', '', []
else:
lines.append(line)
print('Done.')
except Exception as err:
sys.exit(f'ERROR - Failure reading {sys.argv[1]}: {err}')