Source code for flotilla.datapackage

"""
Functions to deal with creation and loading of datapackages
"""

import gzip
import json
import os
import string
import sys
import urllib2

import matplotlib as mpl


FLOTILLA_DOWNLOAD_DIR = os.path.expanduser('~/flotilla_projects')


def datapackage_url_to_dict(datapackage_url):
    filename = check_if_already_downloaded(datapackage_url)

    with open(filename) as f:
        datapackage = json.load(f)
    return datapackage
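As a usage sketch (the URL below is a placeholder, not a real hosted project), this fetches a remote datapackage.json once, caches it locally, and returns it as a plain dict:

    # Hypothetical usage: the URL stands in for any hosted datapackage.json
    url = 'https://example.com/my_project/datapackage.json'
    datapackage = datapackage_url_to_dict(url)
    print(datapackage['name'])
    print([resource['name'] for resource in datapackage['resources']])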
def check_if_already_downloaded(url, datapackage_name=None,
                                download_dir=FLOTILLA_DOWNLOAD_DIR):
    """If a url filename has already been downloaded, don't download it again.

    Parameters
    ----------
    url : str
        HTTP url of a file you want to download

    Returns
    -------
    filename : str
        Location of the file on your system
    """
    try:
        os.mkdir(download_dir)
        sys.stdout.write('Creating a directory for saving your flotilla '
                         'projects: {}\n'.format(download_dir))
    except OSError:
        pass

    if datapackage_name is None:
        req = urllib2.Request(url)
        opener = urllib2.build_opener()
        opened_url = opener.open(req)
        datapackage = json.loads(opened_url.read())
        datapackage_name = datapackage['name']

    package_dir = '{}/{}'.format(download_dir, datapackage_name)
    try:
        os.mkdir(package_dir)
        sys.stdout.write('Creating a directory for saving the data for this '
                         'project: {}\n'.format(package_dir))
    except OSError:
        pass

    basename = url.rsplit('/', 1)[-1]
    filename = os.path.expanduser(os.path.join(package_dir, basename))

    if not os.path.isfile(filename):
        sys.stdout.write('{} has not been downloaded before.\n\tDownloading '
                         'now to {}\n'.format(url, filename))
        req = urllib2.Request(url)
        opener = urllib2.build_opener()
        opened_url = opener.open(req)
        with open(filename, 'w') as f:
            f.write(opened_url.read())

    return filename
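A minimal sketch of the caching behavior, assuming a placeholder URL and project name: the first call downloads the file into a per-project folder under ~/flotilla_projects, and a second call just returns the cached path without re-downloading.

    # Hypothetical usage; the URL and 'neural_diff_project' are placeholders
    url = 'https://example.com/neural_diff_project/expression.csv.gz'
    filename = check_if_already_downloaded(
        url, datapackage_name='neural_diff_project')
    # A repeated call skips the download and returns the same local path
    assert filename == check_if_already_downloaded(
        url, datapackage_name='neural_diff_project')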
def make_study_datapackage(study_name, metadata,
                           expression_data=None,
                           splicing_data=None,
                           spikein_data=None,
                           mapping_stats_data=None,
                           title='',
                           sources='', license=None, species=None,
                           flotilla_dir=FLOTILLA_DOWNLOAD_DIR,
                           metadata_kws=None,
                           expression_kws=None,
                           splicing_kws=None,
                           spikein_kws=None,
                           mapping_stats_kws=None,
                           version=None,
                           expression_feature_kws=None,
                           expression_feature_data=None,
                           splicing_feature_data=None,
                           splicing_feature_kws=None,
                           gene_ontology=None,
                           supplemental_kws=None,
                           host="https://s3-us-west-2.amazonaws.com/",
                           host_destination='flotilla-projects/'):
    """Example code for making a datapackage for a Study"""
    if ' ' in study_name:
        raise ValueError("Datapackage name cannot have any spaces")
    if set(string.uppercase) & set(study_name):
        raise ValueError("Datapackage can only contain lowercase letters")

    datapackage_dir = '{}/{}'.format(flotilla_dir, study_name)
    try:
        os.makedirs(datapackage_dir)
    except OSError:
        pass

    supplemental_kws = {} if supplemental_kws is None else supplemental_kws

    datapackage = {'name': study_name, 'title': title, 'sources': sources,
                   'licenses': license, 'datapackage_version': version}
    if species is not None:
        datapackage['species'] = species

    resources = {'metadata': (metadata, metadata_kws),
                 'expression': (expression_data, expression_kws),
                 'splicing': (splicing_data, splicing_kws),
                 'spikein': (spikein_data, spikein_kws),
                 'mapping_stats': (mapping_stats_data, mapping_stats_kws),
                 'expression_feature': (expression_feature_data,
                                        expression_feature_kws),
                 'splicing_feature': (splicing_feature_data,
                                      splicing_feature_kws),
                 'gene_ontology': (gene_ontology, {})}

    datapackage['resources'] = []
    for resource_name, (data, kws) in resources.items():
        if data is None:
            continue
        datapackage['resources'].append({'name': resource_name})
        resource = datapackage['resources'][-1]

        basename = '{}.csv.gz'.format(resource_name)
        data_filename = '{}/{}'.format(datapackage_dir, basename)
        with gzip.open(data_filename, 'wb') as f:
            data.to_csv(f)

        # if isinstance(data.columns, pd.MultiIndex):
        #     resource['header'] = range(len(data.columns.levels))
        # if isinstance(data.index, pd.MultiIndex):
        #     resource['index_col'] = range(len(data.index.levels))

        # try:
        #     # TODO: only transmit data if it has been updated
        #     subprocess.call(
        #         "scp {} {}:{}{}.".format(data_filename, host,
        #                                  host_destination, name),
        #         shell=True)
        # except Exception as e:
        #     sys.stderr.write("error sending data to host: {}".format(e))

        resource['path'] = basename
        resource['compression'] = 'gzip'
        resource['format'] = 'csv'
        if kws is not None:
            for key, value in kws.iteritems():
                if key == 'phenotype_to_color':
                    value = dict((k, mpl.colors.rgb2hex(v))
                                 if isinstance(v, tuple) else (k, v)
                                 for k, v in value.iteritems())
                resource[key] = value

    datapackage['resources'].append({'name': 'supplemental'})
    supplemental = datapackage['resources'][-1]
    supplemental['resources'] = []
    for supplemental_name, data in supplemental_kws.items():
        resource = {}
        basename = '{}.csv.gz'.format(supplemental_name)
        data_filename = '{}/{}'.format(datapackage_dir, basename)
        with gzip.open(data_filename, 'wb') as f:
            data.to_csv(f)
        resource['name'] = supplemental_name
        resource['path'] = basename
        resource['compression'] = 'gzip'
        resource['format'] = 'csv'
        supplemental['resources'].append(resource)

    filename = '{}/datapackage.json'.format(datapackage_dir)
    with open(filename, 'w') as f:
        json.dump(datapackage, f, indent=2)
    sys.stdout.write('Wrote datapackage to {}\n'.format(filename))
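As a hedged sketch of how this might be called (the study name, sample IDs, and gene names below are made up, and the tiny DataFrames are only illustrative), each non-None data argument becomes a gzipped CSV resource and the keyword dicts are copied into the corresponding resource entry of datapackage.json:

    import pandas as pd

    # Placeholder inputs: rows are samples, columns are features
    metadata = pd.DataFrame({'phenotype': ['iPSC', 'NPC']},
                            index=['sample1', 'sample2'])
    expression = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
                              index=['sample1', 'sample2'],
                              columns=['GENE1', 'GENE2'])

    make_study_datapackage('my_rnaseq_study', metadata,
                           expression_data=expression,
                           title='Example study', species='hg19',
                           version='0.1.0',
                           metadata_kws={'phenotype_to_color':
                                         {'iPSC': '#e41a1c',
                                          'NPC': '#377eb8'}})
    # Writes ~/flotilla_projects/my_rnaseq_study/datapackage.json plus a
    # gzipped CSV for each provided data resource

Note that the study name must be lowercase with no spaces, and that colors given as RGB tuples in phenotype_to_color are converted to hex strings before being written.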
def name_to_resource(datapackage, name):
    """Get resource with specified name in the datapackage"""
    for resource in datapackage['resources']:
        if resource['name'] == name:
            return resource
    raise ValueError('No resource named {} in this datapackage'.format(name))
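A short sketch combining the helpers above (again with a placeholder datapackage dict loaded via datapackage_url_to_dict):

    # Assumes 'datapackage' was loaded with datapackage_url_to_dict
    resource = name_to_resource(datapackage, 'metadata')
    print(resource['path'])  # e.g. 'metadata.csv.gz'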