Newer
Older
#!/usr/bin/env python3
from argparse import ArgumentParser, FileType
from base64 import b64decode
import json
import mimetypes
import os
from pathlib import Path
from urllib.parse import urlparse
from woob.tools.request import to_curl
def write_request(entry, fd):
    """Write the request part of a HAR entry to a binary file object.

    The output is the request line followed by a blank line, then one
    ``Name: value`` line per header and, when the request carries a textual
    body, a blank line followed by that body.  When the environment variable
    ``WOOB_CURLIFY_REQUEST`` is set to ``1``, the command returned by
    ``to_curl`` is appended as well.

    :param entry: HAR entry (a dict with a ``request`` key)
    :param fd: file object opened for writing in binary mode
    """
    entry = entry['request']
    # we should put the path, but since requests does not output the Host header
    # we would not know what was the host
    fd.write(f"{entry['method']} {entry['url']} {entry['httpVersion']}\n\n".encode())
    for header in entry['headers']:
        fd.write(f"{header['name']}: {header['value']}\n".encode())

    # postData may be absent, or may only carry ``params`` without ``text``;
    # in both cases there is no body to write (and no KeyError to raise).
    post_data = entry.get('postData') or {}
    text = post_data.get('text')
    if text is not None:
        # bodies flagged ``x-binary`` are encoded as latin-1, others as UTF-8
        charset = 'latin-1' if post_data.get('x-binary') else 'utf-8'
        fd.write(b'\n' + text.encode(charset) + b'\n')

    if os.environ.get('WOOB_CURLIFY_REQUEST') == '1':
        # Convert HAR to PreparedRequest format, working on a shallow copy so
        # that the caller's HAR entry is left untouched.
        prepared = dict(entry)
        prepared['headers'] = {header['name']: header['value'] for header in entry['headers']}
        if text:
            prepared['body'] = text
        curl = to_curl(prepared)
        fd.write(b'\n' + curl.encode('utf-8') + b'\n')
def write_response(entry, fd):
    """Write the status line and headers of a HAR entry's response.

    :param entry: HAR entry (a dict with a ``response`` key)
    :param fd: file object opened for writing in text mode
    """
    entry = entry['response']

    # status line first, e.g. "HTTP/1.1 200 OK"
    status_line = f"{entry['httpVersion']} {entry['status']} {entry['statusText']}\n"
    fd.write(status_line)

    # then one "Name: value" line per response header, in HAR order
    for header in entry['headers']:
        header_line = f"{header['name']}: {header['value']}\n"
        fd.write(header_line)
def write_body(entry, fd):
    """Write the response body of a HAR entry to a binary file object.

    Content marked with ``encoding == 'base64'`` is decoded; any other
    content is taken as text (empty when missing) and encoded as UTF-8.

    :param entry: HAR entry (a dict with a ``response`` key)
    :param fd: file object opened for writing in binary mode
    """
    content = entry['response']['content']
    if content.get('encoding') == 'base64':
        payload = b64decode(content['text'])
    else:
        payload = content.get('text', '').encode('utf-8')
    fd.write(payload)
def guess_extension(entry):
    """Return a file extension matching the response's Content-Type header.

    Only the media type is considered: optional parameters such as
    ``; charset=utf-8`` are dropped and the comparison is case-insensitive.

    :param entry: HAR entry (a dict with a ``response`` key)
    :return: the extension with its leading dot, or ``''`` when the header
             is missing or the type is unknown
    """
    headers = entry['response']['headers']
    ctype = next(
        (header['value'] for header in headers if header['name'].lower() == 'content-type'),
        '',
    )
    # get the media type only, remove the optional parameters (charset, ...)
    # since mimetypes only knows bare types
    mimetype = ctype.split(';', 1)[0].strip().lower()
    # due to http://bugs.python.org/issue1043134
    if mimetype == 'text/plain':
        return '.txt'
    # try to get an extension (and avoid adding 'None')
    return mimetypes.guess_extension(mimetype, False) or ''
Florent Viard
committed
# Maximum number of characters of the URL-derived name kept in output file names.
NAME_MAX_LENGTH = 80
def extract(n, destdir):
    """Extract entry number ``n`` of the loaded HAR document into ``destdir``.

    Three files sharing a common prefix
    ``<destdir>/<n + 1 on three digits>-<status>[-<name>]<ext>`` are written:

    * ``<prefix>-request.txt``: request line, headers and body
    * ``<prefix>-response.txt``: status line and response headers
    * ``<prefix>``: the response body

    ``<name>`` is the stem of the last component of the request URL path,
    truncated to ``NAME_MAX_LENGTH`` characters.  The HAR document is read
    from the module-level ``data`` variable.

    :param n: 0-based index of the entry in ``data['log']['entries']``
    :param destdir: destination directory, created if it does not exist
    """
    os.makedirs(destdir, exist_ok=True)
    entry = data['log']['entries'][n]

    ext = guess_extension(entry)
    name = Path(urlparse(entry['request']['url']).path).stem
    if name:
        name = name[:NAME_MAX_LENGTH]

    prefix = f'{destdir}/{n + 1:03d}-{entry["response"]["status"]}{name and f"-{name}"}{ext}'

    with open(f'{prefix}-request.txt', 'wb') as fd:
        write_request(entry, fd)

    # Explicit UTF-8: the request and body files are written as UTF-8 bytes,
    # so the response headers must not depend on the locale's default encoding.
    with open(f'{prefix}-response.txt', 'w', encoding='utf-8') as fd:
        write_response(entry, fd)

    with open(prefix, 'wb') as fd:
        write_body(entry, fd)
# Command-line entry point: parse the arguments, load the HAR document and
# extract every entry it contains.
parser = ArgumentParser()
# HAR files are UTF-8 encoded JSON and may start with a BOM: 'utf-8-sig'
# reads both flavours regardless of the locale's default encoding.
parser.add_argument('file', type=FileType('r', encoding='utf-8-sig'), help='HAR file to extract')
parser.add_argument('destdir', nargs='?', default=None, help='Destination directory for extracted files')
args = parser.parse_args()

if args.destdir is None:
    # Automatically generate destdir if not provided
    if args.file.name.endswith('.har'):
        args.destdir = args.file.name[:-4]
    else:
        args.destdir = f'{args.file.name}_content'

# `data` is read as a module-level global by extract(); the input file is
# closed as soon as the document is loaded.
with args.file:
    data = json.load(args.file)

for n in range(len(data['log']['entries'])):
    print('extracting request', n)
    extract(n, args.destdir)