Downloading and extracting archived files

Datasets are an important part of Machine Learning, and quite often they are provided as a zip file. In this article we'll cover how to download an archived dataset from any URL and extract it using a Python script.

# IMPORTING MODULES
import os
import zipfile
import tarfile
import gzip
import shutil
import requests
# ARCHIVE EXTENSIONS
ZIP_EXTENSION = ".zip"
TAR_EXTENSION = ".tar"
TAR_GZ_EXTENSION = ".tar.gz"
TGZ_EXTENSION = ".tgz"
GZ_EXTENSION = ".gz"
EMPTY_URL_ERROR = "ERROR: URL should not be empty."
FILENAME_ERROR = "ERROR: Filename should not be empty."
UNKNOWN_FORMAT = "ERROR: Unknown file format. Can't extract."
def download_dataset(url, target_path="data/", keep_download=True, overwrite_download=False):
	"""Downloads dataset from a url.
	url: string, a dataset path
	target_path: string, path where data will be downloaded
	keep_download: boolean, keeps the original file after extraction
	overwrite_download: boolean, if True, downloads the file again even if it already exists
	"""
	if url == "" or url is None:
		raise Exception(EMPTY_URL_ERROR)

	filename = get_filename(url)
	file_location = get_file_location(target_path, filename)

	os.makedirs(target_path, exist_ok=True)

	if os.path.exists(file_location) and not overwrite_download:
		print(f"File already exists at {file_location}. Use 'overwrite_download=True' to download it again.")
		extract_file(target_path, filename)
		return

	print(f"Downloading file from {url} to {file_location}.")
	# Download in streaming mode so the whole file is never held in memory at once
	with requests.get(url, allow_redirects=True, stream=True) as resp:
		resp.raise_for_status()  # fail early on HTTP errors instead of writing an empty file
		with open(file_location, 'wb') as f:
			for chunk in resp.iter_content(chunk_size=512):  # chunk_size in bytes
				if chunk:
					f.write(chunk)

	print("Finished downloading.")
	print("Extracting the file now ...")
	extract_file(target_path, filename)

	if not keep_download:
		os.remove(file_location)

def extract_file(target_path, filename):
	"""Extract file based on file extension
	target_path: string, location where data will be extracted
	filename: string, name of the file along with extension
	"""
	if filename == "" or filename is None:
		raise Exception(FILENAME_ERROR)

	file_location = get_file_location(target_path, filename)

	if filename.endswith(ZIP_EXTENSION):
		print("Extracting zip file...")
		with zipfile.ZipFile(file_location, 'r') as zipf:
			zipf.extractall(target_path)
	elif filename.endswith(TAR_EXTENSION) or \
		 filename.endswith(TAR_GZ_EXTENSION) or \
		 filename.endswith(TGZ_EXTENSION):
		print("Extracting tar file...")
		# mode 'r' lets tarfile handle .tar, .tar.gz and .tgz transparently
		with tarfile.open(file_location, 'r') as tarf:
			tarf.extractall(target_path)
	elif filename.endswith(GZ_EXTENSION):
		print("Extracting gz file...")
		out_file = file_location[:-len(GZ_EXTENSION)]  # drop the .gz suffix
		# gzip.open decompresses the stream; shutil copies it into the output file
		with gzip.open(file_location, "rb") as f_in:
			with open(out_file, "wb") as f_out:
				shutil.copyfileobj(f_in, f_out)
	else:
		print(UNKNOWN_FORMAT)

def get_filename(url):
	"""Extract filename from file url"""
	filename = os.path.basename(url)
	return filename

def get_file_location(target_path, filename):
	"""Join the download directory and the filename"""
	return os.path.join(target_path, filename)
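
With these helpers in place, downloading and extracting a dataset is a single call. A minimal usage sketch (the URL below is only a placeholder, not a real dataset):

# EXAMPLE USAGE (placeholder URL)
download_dataset("https://example.com/datasets/sample.tar.gz", target_path="data/", keep_download=False)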

About the requests module

The requests module is useful when downloading large files; I prefer it when the file size is more than 500 MB. In the code above, the important thing to discuss is the stream parameter of requests.get.

If stream is set to False, the entire file is downloaded into memory at once, which can be a problem for memory consumption when the file is large.
If stream is set to True, the download does not start immediately. Instead, only the headers are downloaded and the connection stays open, so we can either proceed with the download or cancel it (depending on the header information). The body is downloaded only when we access the content property or iterate over it with iter_content.
The content property loads the entire body into memory, which we should avoid for bigger files, whereas iter_content loads the file into memory in chunks.
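
As an illustration of the second point, here is a minimal sketch of how stream=True lets us look at the headers before committing to the full download. The URL and the 2 GB cutoff are placeholders and not part of the script above:

# STREAMING SKETCH (placeholder URL and size limit)
import requests

with requests.get("https://example.com/datasets/big.zip", stream=True) as resp:
	# Only the headers have been fetched at this point; the body is not in memory yet.
	size = int(resp.headers.get("Content-Length", 0))
	if size > 2 * 1024 ** 3:
		print("File larger than 2 GB, cancelling download.")
	else:
		with open("big.zip", "wb") as f:
			for chunk in resp.iter_content(chunk_size=512):  # chunk_size in bytes
				if chunk:
					f.write(chunk)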