# -*- coding: utf-8 -*-
"""
The :mod:`neurotic.datasets.gdrive` module implements a class for downloading
files from Google Drive using paths, rather than file IDs or shareable links.
.. autoclass:: GoogleDriveDownloader
:members:
"""
import logging
import os
import shutil
import urllib
import urllib.parse
from functools import reduce

from pydrive2.auth import GoogleAuth, LoadAuth
from pydrive2.drive import GoogleDrive
from tqdm.auto import tqdm
logger = logging.getLogger(__name__)
[docs]class GoogleDriveDownloader(GoogleDrive):
"""
A class for downloading files from Google Drive using paths.
Files can be specified for download using URL-like paths of the form
gdrive://<drive name>/<folder 1>/<...>/<folder N>/<file name>
The "<drive name>" may be "My Drive" for files located in a personal
Google Drive, or it may be the name of a Shared Drive that the user has
permission to access.
Note that these URL-like paths are not equivalent to ordinary URLs
associated with Google Drive files, such as shareable links, which are
composed of pseudorandom file IDs and do not reveal anything about the name
of the file or the folders containing it.
This class can only download files that are uniquely identifiable by their
paths. Google Drive does not require file or folder names to be unique, so
two or more files or folders with identical names may coexist in a folder.
Such files and folders cannot be distinguished by their paths, so they
cannot be downloaded using this class. A download will fail while
traversing the file tree if at any step there is more than one folder or
file that matches the path.
This class manages access authorization, optionally saving authorization
tokens to a file so that the authorization flow does not need to be
repeated in the future.
The ``client_secret_file`` should be the path to a client secret file in
JSON format, obtained from the `Google API Console
<https://console.developers.google.com/>`_. The Drive API must be enabled
for the corresponding client.
If ``save_tokens=False``, the authorization flow (a request via web browser
for permission to access Google Drive) will always run the first time a new
instance of this class is used, and authorization will not persist after
the instance is destroyed. If ``save_tokens=True`` and a file path is
provided with ``tokens_file``, access/refresh tokens resulting from a
successful authorization are stored in the file, and tokens are loaded from
the file in the future, so that the authorization flow does not need to be
repeated.
"""
def __init__(self, client_secret_file, tokens_file=None, save_tokens=False):
    """
    Store the authorization settings and initialize the ``GoogleDrive``
    base class with a new auth object built from them.

    :param client_secret_file: path to the client secret file; stored under
        the ``client_config_file`` setting
    :param tokens_file: path used for saved tokens, stored under the
        ``save_credentials_file`` setting (default ``None``)
    :param save_tokens: whether tokens should be saved, stored under the
        ``save_credentials`` setting (default ``False``)
    """
    settings = {}
    settings['client_config_file'] = client_secret_file
    # only read access is requested
    settings['oauth_scope'] = ['https://www.googleapis.com/auth/drive.readonly']
    settings['save_credentials'] = save_tokens
    settings['save_credentials_backend'] = 'file'
    settings['save_credentials_file'] = tokens_file
    self.settings = settings

    # _create_auth reads self.settings, so it must run after the assignment
    new_auth = self._create_auth()
    GoogleDrive.__init__(self, auth=new_auth)
def _create_auth(self):
    """
    Build a new ``GoogleAuth`` object carrying this instance's settings.

    :returns: a ``GoogleAuth`` whose ``settings`` have been updated with the
        contents of ``self.settings``
    """
    configured_auth = GoogleAuth()
    configured_auth.settings.update(self.settings)
    return configured_auth
@LoadAuth
def authorize(self):
    """
    Obtain tokens for reading the contents of a Google Drive account.

    If ``save_tokens=True``, tokens are loaded from the ``tokens_file``
    when possible. If they cannot be restored that way, or if the loaded
    tokens have expired, an authorization flow is started that prompts the
    user through a web browser to grant read-only privileges to the client
    associated with the ``client_secret_file``. Once the flow completes,
    and if ``save_tokens=True``, the new tokens are stored in the
    ``tokens_file`` for future use.

    Authorization happens automatically whenever it is needed, but this
    method may be called directly to retrieve (and possibly store) tokens
    without starting a download.

    :returns: ``None``
    """
    # Nothing to do in the body: the LoadAuth decorator does all the work.
    return None
def deauthorize(self):
    """
    Forget tokens and delete the ``tokens_file``, if there is one. The
    authorization flow will be required for the next download.

    The stored ``save_credentials_file`` setting is ``None`` when no
    ``tokens_file`` was given to the constructor; in that case there is no
    file to delete and only the in-memory auth object is replaced.
    """
    tokens_file = self.settings['save_credentials_file']

    # os.path.exists(None) raises TypeError, so guard against the default
    # tokens_file=None before touching the file system
    if tokens_file is not None and os.path.exists(tokens_file):
        os.remove(tokens_file)

    # replace the current auth object with a fresh one built from settings
    del self.auth
    self.auth = self._create_auth()
def is_authorized(self):
    """
    Report whether this instance currently holds a usable authorization.

    :returns: ``True`` only if ``self.auth`` exists and both its
        ``credentials`` and ``service`` attributes are not ``None``;
        ``False`` otherwise
    """
    current_auth = self.auth
    if current_auth is None:
        return False
    if current_auth.credentials is None:
        return False
    return current_auth.service is not None
@LoadAuth
def GetUserEmail(self):
    """
    Look up the email address of the authorized Google Drive account.

    :returns: the ``emailAddress`` entry of the ``user`` record returned by
        ``self.GetAbout()``
    """
    about_info = self.GetAbout()
    user_record = about_info['user']
    return user_record['emailAddress']
@LoadAuth
def GetSharedDrivesList(self):
    """
    Return information about available Shared Drives.

    The Drive API returns Shared Drives one page at a time, so this method
    follows the pagination until it is exhausted and merges the drives from
    every page.

    :returns: the first page's response dictionary, with its ``items``
        entry replaced by the combined list of drives from all pages and
        any ``nextPageToken`` entry removed
    """
    # no PyDrive2 interface for this, so implement it here
    drives_resource = self.auth.service.drives()
    request = drives_resource.list()

    combined = None
    all_items = []
    while request is not None:
        page = request.execute(http=self.http)
        if combined is None:
            # keep the first response as the container for the merged result
            combined = page
        all_items.extend(page.get('items', []))
        # list_next returns None once there are no further pages
        request = drives_resource.list_next(request, page)

    combined['items'] = all_items
    combined.pop('nextPageToken', None)
    return combined
def download(self, gdrive_url, local_file, overwrite_existing=False, show_progress=True, bytes_per_chunk=1024*1024*5):
    """
    Download a file from Google Drive using a URL-like path beginning with
    "gdrive://".

    If ``local_file`` already exists and ``overwrite_existing`` is false,
    the download is skipped and a message is logged. Any exception raised
    during the download is logged as an error and then re-raised.

    :param gdrive_url: URL-like path of the file to download
    :param local_file: destination path on the local file system
    :param overwrite_existing: download even if ``local_file`` exists
    :param show_progress: forwarded to ``_download_with_progress_bar``
    :param bytes_per_chunk: forwarded to ``_download_with_progress_bar``
    """
    already_present = not overwrite_existing and os.path.exists(local_file)
    if already_present:
        logger.info(f'Skipping {os.path.basename(local_file)} (already exists)')
        return

    logger.info(f'Downloading {os.path.basename(local_file)}')
    try:
        self._download_with_progress_bar(
            gdrive_url,
            local_file,
            show_progress=show_progress,
            bytes_per_chunk=bytes_per_chunk,
        )
    except Exception as e:
        # report the failure here, then let the caller decide what to do
        logger.error(f'Skipping {os.path.basename(local_file)} ({e})')
        raise
def _download_with_progress_bar(self, gdrive_url, local_file, show_progress=True, bytes_per_chunk=1024*1024*5):
    """
    Download a file to ``local_file``, optionally showing a progress bar.

    The content is first written to ``local_file + '.part'`` and moved to
    ``local_file`` only after the download finishes. If anything interrupts
    the download, the partial file is deleted and the exception is
    re-raised.

    :param gdrive_url: URL-like path of the file to download
    :param local_file: destination path on the local file system
    :param show_progress: when false, the progress bar is disabled
    :param bytes_per_chunk: chunk size passed to ``GetContentFile``
    :raises ValueError: if no file ID could be determined for
        ``gdrive_url``
    """

    # TODO: bytes_per_chunk=1024*1024*100 (100 MiB) would match
    # googleapiclient.http.MediaIoBaseDownload's default chunk size and
    # seems to be significantly faster than smaller values, suggesting
    # chunk fetching incurs a large overhead. Unfortunately, such a large
    # chunk size would prevent the progress bar from updating frequently.
    # As a compromise, the chunk size used by this method is just 5 MiB,
    # which is a little larger than is ideal for progress reporting and yet
    # still noticeably slows downloads. Is there a better solution?

    # determine where to temporarily save the file during download
    temp_file = local_file + '.part'
    logger.debug(f'Temporarily downloading to {temp_file}')

    # create the containing directory if necessary; dirname is '' for a
    # bare file name, and os.makedirs('') would raise
    target_dir = os.path.dirname(local_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # locate the Google Drive file
    file_id = self._get_file_id(gdrive_url)
    if file_id is None:
        raise ValueError(f'error locating file on server for account "{self.GetUserEmail()}"')
    drive_file = self.CreateFile({'id': file_id})

    try:
        with tqdm(total=int(drive_file['fileSize']), unit='B', unit_scale=True,
                  disable=not show_progress) as pbar:
            def update_pbar(total_transferred, file_size):
                # the callback reports a running total, while tqdm.update
                # expects an increment
                pbar.update(total_transferred - pbar.n)
            drive_file.GetContentFile(temp_file, callback=update_pbar, chunksize=bytes_per_chunk)

    except BaseException:
        # deliberately broad (includes KeyboardInterrupt): the download is
        # likely incomplete, so delete the temporary file
        if os.path.exists(temp_file):
            os.remove(temp_file)

        # raise the exception so that it can be handled elsewhere
        raise

    else:
        # download completed, so move the temp file to the final location
        shutil.move(temp_file, local_file)
def _get_file_id(self, gdrive_url):
    """
    Retrieve the Google Drive ID for the file specified by ``gdrive_url``.

    The URL must have the form
    ``gdrive://<drive name>/<folder 1>/<...>/<file name>``. The drive name
    "My Drive" maps to the ID ``'root'``; any other name is looked up among
    the Shared Drives returned by ``GetSharedDrivesList``. The path is then
    resolved one component at a time with ``_get_child_id``.

    :param gdrive_url: URL-like path of the file
    :returns: the ID of the file
    :raises ValueError: if the scheme is not ``gdrive``, the drive name or
        the file path is missing, or the drive name matches zero or
        several Shared Drives
    """
    # parse once and verify the url is of the right type
    parsed = urllib.parse.urlparse(gdrive_url)
    if parsed.scheme != 'gdrive':
        raise ValueError(f'gdrive_url must begin with "gdrive://": {gdrive_url}')

    # extract drive name ("My Drive" or some Shared Drive)
    drive_name = parsed.netloc
    if not drive_name:
        raise ValueError('problem parsing drive name')

    # urlparse splits anything after "?" or "#" off into query/fragment,
    # but in a gdrive path those characters are part of a file or folder
    # name, so put them back
    raw_path = parsed.path
    if parsed.query:
        raw_path += '?' + parsed.query
    if parsed.fragment:
        raw_path += '#' + parsed.fragment

    # an empty path normalizes to '.' or '', which would otherwise be
    # searched for as if it were a file name
    normalized = os.path.normpath(raw_path).strip(os.sep)
    if normalized in ('', '.'):
        raise ValueError(f'gdrive_url does not contain a file path: {gdrive_url}')
    path = normalized.split(os.sep)

    # find the drive id from its name
    if drive_name == 'My Drive':
        drive_id = 'root'
    else:
        # search for all Shared Drives with a matching name
        drives = self.GetSharedDrivesList().get('items', [])
        matches = [drive for drive in drives if drive['name'] == drive_name]

        # make sure the drive is unique
        if not matches:
            raise ValueError(f'drive "{drive_name}" not found on server for account "{self.GetUserEmail()}"')
        if len(matches) > 1:
            raise ValueError(f'ambiguous path, multiple drives with name "{drive_name}" exist on server for account "{self.GetUserEmail()}"')
        drive_id = matches[0]['id']

    # find the file id from its path by starting at the drive root and
    # repeatedly looking up the id of the next component in the path
    return reduce(self._get_child_id, path, drive_id)
def _get_child_id(self, parent_id, child_name):
    """
    Retrieve the Google Drive ID for the file or folder named
    ``child_name`` located in a folder or drive with ID ``parent_id``.

    :param parent_id: ID of the containing folder or drive
    :param child_name: exact title of the file or folder to find
    :returns: the ID of the single matching item
    :raises ValueError: if no item, or more than one item, matches
    """
    # escape backslashes and single quotes so that a name containing them
    # cannot terminate the quoted literal and alter the query
    escaped_name = child_name.replace('\\', '\\\\').replace("'", "\\'")

    # search for all non-trashed files with a matching name and parent id
    query = f"title='{escaped_name}' and '{parent_id}' in parents and trashed=false"
    items = self.ListFile({'q': query}).GetList()

    # make sure the file is unique
    if not items:
        raise ValueError(f'file or folder "{child_name}" not found on server for account "{self.GetUserEmail()}"')
    if len(items) > 1:
        raise ValueError(f'ambiguous path, multiple files or folders with the name "{child_name}" exist under their parent folder on server for account "{self.GetUserEmail()}"')
    return items[0]['id']