Extracting .MHT Contents with Python
Posted: 2024-06-19
Tags: Python
This is a quick-and-dirty script to extract the contents of .mht files.
It does not preserve any folder structures that might have existed. File contents are dumped to the current working directory.
It also does not update any links inside the dumped files to account for this flattening.
import base64
import quopri
import re
import sys
# Extract every part of an .mht (MIME HTML) archive into the current working
# directory.  Folder structure is not preserved and links inside the dumped
# files are not rewritten.

# Matches the MIME ``boundary`` parameter, quoted or bare, whether it sits on
# its own folded line or on the same line as ``Content-Type``.
_BOUNDARY_RE = re.compile(
    r'(?:^|[\s;])boundary\s*=\s*(?:"([^"]+)"|([^\s;"]+))', re.IGNORECASE)

# Content-Transfer-Encoding values whose body is stored verbatim (RFC 2045);
# a missing header means "7bit".
_IDENTITY_ENCODINGS = ('', '7bit', '8bit', 'binary')

# Short labels kept in each entry's 'type' field.
_TYPE_NAMES = {'quoted-printable': 'qp', 'base64': 'b64'}


def _decode_body(encoding, raw):
    """Return the decoded bytes of a part body.

    Returns None when *encoding* is not a supported transfer encoding.
    Raises ValueError (binascii.Error) on malformed base64 data.
    """
    if encoding == 'quoted-printable':
        # quopri handles soft line breaks and =XX escapes in a single pass,
        # so "=3D41" decodes to "=41" and multi-byte sequences stay intact.
        return quopri.decodestring(raw)
    if encoding == 'base64':
        return base64.b64decode(raw)
    if encoding in _IDENTITY_ENCODINGS:
        return raw
    return None


def _store_part(header_lines, body_lines, files):
    """Decode one MIME part and append it to *files* if it can be named."""
    headers = {}
    for header in header_lines:
        key, sep, value = header.partition(':')
        if sep:
            # Header names are case-insensitive; the first occurrence wins.
            headers.setdefault(key.strip().lower(), value.strip())

    location = headers.get('content-location', '')
    if not location:
        # Without a Content-Location there is nothing to name the file after.
        return
    encoding = headers.get('content-transfer-encoding', '').lower()

    raw = ''.join(body_lines)
    # The line break right before a boundary belongs to the boundary,
    # not to the part's content.
    if raw.endswith('\r\n'):
        raw = raw[:-2]
    elif raw.endswith('\n'):
        raw = raw[:-1]

    try:
        # latin-1 maps code points 0-255 back to the same byte values.
        content = _decode_body(encoding, raw.encode('latin-1'))
    except ValueError as err:
        print(f'ERROR! - {location}: {err}')
        return
    if content is None:
        print(f'ERROR: Unknown type {encoding} for {location}')
        return

    files.append({'name': location,
                  'content': content,
                  'type': _TYPE_NAMES.get(encoding, 'raw')})


def parse_mht(lines):
    """Split the lines of an .mht archive into its decoded parts.

    *lines* is an iterable of text lines including their line endings
    ('\\n' or '\\r\\n').  Returns a list of dicts with the keys 'name' (the
    part's Content-Location), 'content' (decoded bytes) and 'type'
    ('qp', 'b64' or 'raw').  Parts without a Content-Location are skipped.
    """
    files = []
    delimiter = ''
    headers = None   # None while outside any part (preamble / epilogue)
    body = []
    in_body = False

    for line in lines:
        if not delimiter:
            found = _BOUNDARY_RE.search(line)
            if found:
                delimiter = '--' + (found.group(1) or found.group(2))
                print(f'Found boundary: {delimiter}')
            continue

        marker = line.rstrip()
        if marker in (delimiter, delimiter + '--'):
            if headers is not None:
                _store_part(headers, body, files)
            if marker != delimiter:
                # Closing delimiter: everything after it is epilogue.
                headers = None
                break
            headers, body, in_body = [], [], False
            continue

        if headers is None:
            continue   # preamble before the first part
        if in_body:
            body.append(line)
        elif not line.strip():
            in_body = True   # blank line separates headers from body
        elif line[0] in ' \t' and headers:
            headers[-1] += line.strip()   # unfold a continued header
        else:
            headers.append(line.strip())

    if headers is not None:
        # Truncated archive without a closing delimiter: keep what we have.
        _store_part(headers, body, files)
    return files


def output_name(location):
    """Return the bare file name to use for a part's Content-Location."""
    # Keep only the last path segment so a part cannot be written outside
    # the current directory; backslashes are separators on Windows.
    name = location.replace('\\', '/').rsplit('/', 1)[-1]
    if name in ('', '.', '..'):
        # e.g. "https://example.com/" has no usable last segment.
        return 'index.html'
    return name


def write_files(files):
    """Write each parsed part to the current directory; return the count written.

    Existing files are never overwritten (exclusive-create mode); failures are
    reported and the remaining parts are still written.
    """
    written = 0
    for data in files:
        name = output_name(data['name'])
        try:
            with open(name, 'xb') as out:
                out.write(data['content'])
        except OSError as err:
            print(f'ERROR! - {data["name"]}: {err}')
            continue
        written += 1
    return written


def main(argv):
    """Command-line entry point: ``python script.py filename.mht``."""
    if len(argv) != 2:
        sys.exit(f'USAGE: python {argv[0]} filename.mht')
    path = argv[1]
    if not path.lower().endswith('.mht'):
        sys.exit('ERROR - Expected a filename ending with .mht')

    try:
        # latin-1 never fails to decode and newline='' keeps the original
        # line endings, so every part body round-trips byte-for-byte.
        with open(path, 'r', encoding='latin-1', newline='') as source:
            content = source.readlines()
    except OSError as err:
        sys.exit(f'ERROR - Failure reading {path}: {err}')
    print(f'Read {len(content)} lines.')

    files = parse_mht(content)
    print(f'Found {len(files)} files.')

    print('Writing output...')
    write_files(files)
    print('Done.')


if __name__ == '__main__':
    main(sys.argv)