Notes to Future-Me


Extracting .MHT Contents with Python

Posted: 2024-06-19
Tags: Python

This is a quick-and-dirty script to get the contents out of .mht files.

It does not preserve any folder structures that might have existed. File contents are dumped to the current working directory.

It also does not update any links inside the dumped files to account for this flattening.

import sys, re, base64

# Extract the parts of an .mht / .mhtml (MIME HTML) archive into the current
# working directory.  Folder structure is not preserved and links inside the
# dumped files are not rewritten.  Nested multipart sections are not handled.

# Matches a `boundary=` parameter either at the start of a (possibly folded)
# header line or after a ';' on the same line as Content-Type.
_BOUNDARY_RE = re.compile(
    r'(?:^|;)\s*boundary\s*=\s*(?:"([^"]+)"|([^\s;"]+))', re.IGNORECASE)

# One quoted-printable escape: '=' followed by two hex digits.
_QP_ESCAPE_RE = re.compile(r'=([0-9A-Fa-f]{2})')

# Content-Transfer-Encoding values this script can decode -> short type tag.
_ENCODINGS = {'quoted-printable': 'qp', 'base64': 'b64'}

# Characters stripped from line ends (deliberately ASCII-only, so that bytes
# read via latin-1 such as 0x85 or 0xA0 are never treated as whitespace).
_LINE_END = ' \t\r\n'


def find_boundary(line):
    """Return the delimiter ('--' + boundary) declared on *line*, or None."""
    match = _BOUNDARY_RE.search(line)
    if match is None:
        return None
    return '--' + (match.group(1) or match.group(2))


def split_parts(lines):
    """Split the archive's lines into a list of (header_lines, body_lines).

    Everything before the first delimiter line and everything after the
    closing delimiter ('--boundary--') is ignored.  If the closing delimiter
    is missing, the part that was being read at end of input is still
    returned.  Prints the delimiter once it has been found.
    """
    delimiter = None
    parts = []
    current = None      # (header_lines, body_lines) of the part being read
    in_body = False     # False while still reading the current part's headers

    for line in lines:
        if delimiter is None:
            delimiter = find_boundary(line)
            if delimiter is not None:
                print(f'Found boundary: {delimiter}')
            continue

        stripped = line.rstrip(_LINE_END)
        if stripped == delimiter + '--':
            break                       # closing delimiter: archive is done
        if stripped == delimiter:
            current = ([], [])
            parts.append(current)
            in_body = False
            continue

        if current is None:
            continue                    # still before the first delimiter
        if in_body:
            current[1].append(line)
        elif stripped.strip(_LINE_END) == '':
            in_body = True              # blank line ends the part's headers
        else:
            current[0].append(line)

    return parts


def parse_headers(header_lines):
    """Return {lower-cased header name: value} for one part.

    Continuation lines (starting with a space or tab) are appended to the
    previous header's value with the surrounding whitespace removed, so a
    folded Content-Location is rejoined into one unbroken string.  When a
    header name repeats, the last occurrence wins.
    """
    headers = {}
    name = None
    for line in header_lines:
        text = line.rstrip('\r\n')
        if name is not None and text[:1] in (' ', '\t'):
            headers[name] += text.strip(_LINE_END)
            continue
        key, sep, value = text.partition(':')
        if not sep:
            name = None                 # not a header line; ignore it
            continue
        name = key.strip(_LINE_END).lower()
        headers[name] = value.strip(_LINE_END)
    return headers


def decode_qp_line(line):
    """Decode one quoted-printable line to text (one character per byte).

    A trailing '=' is a soft line break: it and the line ending are removed.
    Any other line keeps a single '\\n'.  Escapes are replaced in one pass so
    that a decoded '=' (from '=3D') is never re-read as the start of another
    escape.  Trailing spaces and tabs are dropped.
    """
    text = line.rstrip(_LINE_END)
    # The soft-break test must look at the raw line, before '=3D' is decoded.
    if text.endswith('='):
        text = text[:-1]
        ending = ''
    else:
        ending = '\n'
    decoded = _QP_ESCAPE_RE.sub(lambda m: chr(int(m.group(1), 16)), text)
    return decoded + ending


def parse_mht(lines):
    """Parse the lines of an MHT archive into a list of file records.

    Each record is a dict with the keys:
      'name'    - the part's Content-Location value,
      'content' - for 'qp' parts the decoded text (one character per byte);
                  otherwise the body lines joined without line endings,
      'type'    - 'qp', 'b64', or '' for any other transfer encoding.

    Parts without a Content-Location header are skipped.  Header order and
    header-name case do not matter.
    """
    files = []
    for header_lines, body_lines in split_parts(lines):
        headers = parse_headers(header_lines)
        location = headers.get('content-location', '')
        if not location:
            continue

        encoding = headers.get('content-transfer-encoding', '').lower()
        kind = _ENCODINGS.get(encoding, '')

        if kind == 'qp':
            content = ''.join(decode_qp_line(line) for line in body_lines)
            # RFC 2046: the line break right before a delimiter line belongs
            # to the delimiter, not to the part's content.
            if content.endswith('\n'):
                content = content[:-1]
        else:
            content = ''.join(line.rstrip('\r\n') for line in body_lines)

        files.append({'name': location, 'content': content, 'type': kind})
    return files


def decode_content(content, kind):
    """Return the bytes to write for a record's 'content' of type *kind*.

    Raises ValueError for an unknown *kind*, for invalid base64, or for 'qp'
    text containing a character above U+00FF (which cannot be a single byte).
    """
    if kind == 'qp':
        # Each character stands for exactly one byte; latin-1 maps them back.
        return content.encode('latin-1')
    if kind == 'b64':
        return base64.b64decode(content)
    raise ValueError(f'Unknown type {kind}')


def output_name(location):
    """Return the local file name used for a part's Content-Location.

    Only the text after the last '/' or '\\' is kept, so a location can never
    point outside the current directory.  A location ending in a separator
    (e.g. 'https://example.com/') falls back to 'index.html'.
    """
    return re.split(r'[\\/]', location)[-1] or 'index.html'


def write_part(data):
    """Decode one file record and write it to the current directory.

    Existing files are never overwritten (mode 'x').  Problems are reported
    on stdout and the part is skipped.  Returns True if a file was written.
    """
    if data['type'] not in ('qp', 'b64'):
        print(f'ERROR: Unknown type {data["type"]} for {data["name"]}')
        return False

    # Decode before creating the file so a bad payload leaves nothing behind.
    try:
        payload = decode_content(data['content'], data['type'])
    except ValueError as err:
        print(f'ERROR! - {data["name"]}: {err}')
        return False

    try:
        with open(output_name(data['name']), 'xb') as target:
            target.write(payload)
    except (OSError, ValueError) as err:
        print(f'ERROR! - {data["name"]}: {err}')
        return False
    return True


def main(argv=None):
    """Command-line entry point: `python script.py filename.mht`.

    *argv* defaults to sys.argv.  Exits with a message on a usage error or
    when the input file cannot be read.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) != 2:
        sys.exit(f'USAGE: python {argv[0]} filename.mht')

    path = argv[1]
    if not path.lower().endswith(('.mht', '.mhtml')):
        sys.exit('ERROR - Expected a filename ending with .mht or .mhtml')

    # latin-1 maps every byte to one code point, so nothing can fail to
    # decode and quoted-printable payloads round-trip byte-for-byte.
    try:
        with open(path, 'r', encoding='latin-1') as source:
            content = source.readlines()
    except OSError as err:
        sys.exit(f'ERROR - Failure reading {path}: {err}')

    print(f'Read {len(content)} lines.')

    files = parse_mht(content)

    print(f'Found {len(files)} files.')
    print('Writing output...')

    for data in files:
        write_part(data)

    print('Done.')


if __name__ == '__main__':
    main()