-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Infer metadata from the data *.csv file #173
Changes from all commits
2065240
eb9a8b9
5a89536
c6c5dc8
e420923
6dfb247
a7ee863
0c4279b
37c5ad5
b8b4e64
1f81196
ef6ed5f
c7ab4bf
6343749
bb3fef5
bbe05fa
ed91085
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,3 +14,4 @@ Authors | |
* Marie-Claire Gering | ||
* Julian Endres | ||
* Felix Maurer | ||
* Pierre-Francois Duc |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
__version__ = "0.0.5" | ||
__version__ = "0.0.6dev" | ||
__project__ = "oemof.tabular" | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
import sys | ||
import tarfile | ||
import urllib.request | ||
import warnings | ||
import zipfile | ||
from ftplib import FTP | ||
from urllib.parse import urlparse | ||
|
@@ -59,6 +60,208 @@ def update_package_descriptor(): | |
p.save("datapackage.json") | ||
|
||
|
||
def map_sequence_profiles_to_resource_name( | ||
p, excluded_profiles=("timeindex",) | ||
): | ||
"""Look in every sequence resources and map each of its fields to itself | ||
|
||
Within this process the unicity of the field names will be checked, | ||
with the exception of the field "timeindex" | ||
|
||
""" | ||
|
||
def check_sequences_labels_unicity(labels, new_labels): | ||
intersect = set(labels).intersection(new_labels) | ||
if len(intersect) == 1: | ||
intersect = intersect.pop() | ||
if not intersect == "timeindex": | ||
answer = [intersect] | ||
else: | ||
answer = [] | ||
else: | ||
answer = list(intersect) | ||
|
||
if answer: | ||
warnings.warn( | ||
f"The labels of the profiles are not unique across all" | ||
f"files within 'sequences' folder: '{','.join(intersect)}' " | ||
f"used more than once" | ||
) | ||
return answer | ||
|
||
sequences = {} | ||
sequence_labels = [] | ||
duplicated_labels = [] | ||
for r in p.resources: | ||
if "/sequences/" in r.descriptor["path"]: | ||
field_labels = [ | ||
f.name | ||
for f in r.schema.fields | ||
if f.name not in excluded_profiles | ||
] | ||
sequences[r.descriptor["name"]] = field_labels | ||
duplicated_labels += check_sequences_labels_unicity( | ||
sequence_labels, field_labels | ||
) | ||
sequence_labels += field_labels | ||
|
||
if duplicated_labels: | ||
# write an error message here | ||
raise ValueError( | ||
f"The following sequences labels are not unique" | ||
f" across all sequences files: " | ||
f"{', '.join(duplicated_labels)}" | ||
) | ||
# map each profile to its resource name | ||
sequences_mapping = { | ||
value: key for (key, values) in sequences.items() for value in values | ||
} | ||
return sequences_mapping | ||
|
||
|
||
def infer_resource_foreign_keys(resource, sequences_profiles_to_resource): | ||
"""Find out the foreign keys within a resource fields | ||
|
||
Look through all field of a resource which are of type 'string' | ||
if any of their values are matching a profile header in any of | ||
the sequences resources | ||
|
||
|
||
Parameters | ||
---------- | ||
resource: a :datapackage.Resource: instance | ||
sequences_profiles_to_resource: the mapping of sequence profile | ||
headers to their resource name | ||
|
||
Returns | ||
------- | ||
The :datapackage.Resource: instance with updated "foreignKeys" field | ||
|
||
""" | ||
r = resource | ||
data = pd.DataFrame.from_records(r.read(keyed=True)) | ||
# TODO not sure this should be set here | ||
r.descriptor["schema"]["primaryKey"] = "name" | ||
if "foreignKeys" not in r.descriptor["schema"]: | ||
r.descriptor["schema"]["foreignKeys"] = [] | ||
|
||
for field in r.schema.fields: | ||
if field.type == "string": | ||
for potential_fk in data[field.name].dropna().unique(): | ||
|
||
if potential_fk in sequences_profiles_to_resource: | ||
# this is actually a wrong format and should be | ||
# with a "fields" field under the "reference" fields | ||
|
||
fk = { | ||
"fields": field.name, | ||
"reference": { | ||
"resource": sequences_profiles_to_resource[ | ||
potential_fk | ||
], | ||
}, | ||
} | ||
|
||
if fk not in r.descriptor["schema"]["foreignKeys"]: | ||
r.descriptor["schema"]["foreignKeys"].append(fk) | ||
r.commit() | ||
return r | ||
|
||
|
||
def infer_package_foreign_keys(package): | ||
"""Infer the foreign_keys from elements and sequences and update meta data | ||
|
||
Parameters | ||
---------- | ||
package | ||
|
||
Returns | ||
------- | ||
|
||
""" | ||
p = package | ||
sequences_profiles_to_resource = map_sequence_profiles_to_resource_name(p) | ||
|
||
for r in p.resources: | ||
if os.sep + "elements" + os.sep in r.descriptor["path"]: | ||
r = infer_resource_foreign_keys(r, sequences_profiles_to_resource) | ||
# sort foreign_key entries by alphabetically by fields | ||
r.descriptor["schema"]["foreignKeys"].sort( | ||
key=lambda x: x["fields"] | ||
) | ||
p.remove_resource(r.name) | ||
p.add_resource(r.descriptor) | ||
|
||
|
||
def infer_metadata_from_data( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Idea was to make this feature optional, therefore I did not want to modify |
||
path, | ||
package_name="default-name", | ||
metadata_filename="datapackage.json", | ||
): | ||
"""Creates a metadata .json file at the root-folder of datapackage | ||
|
||
The foreign keys are inferred from the csv files within | ||
"data/elements" and "data/sequences" resources. | ||
|
||
Parameters | ||
---------- | ||
path: string | ||
Absolute path to root-folder of the datapackage | ||
package_name: string | ||
Name of the data package | ||
metadata_filename: basestring | ||
Name of the inferred metadata string. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Small docstring would be good There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
Returns | ||
------- | ||
Save a json metadata file at the root-folder of datapackage | ||
under the provided path. | ||
""" | ||
|
||
# Infer the fields from the package data | ||
path = os.path.abspath(path) | ||
p0 = Package(base_path=path) | ||
p0.infer(os.path.join(path, "**" + os.sep + "*.csv")) | ||
p0.commit() | ||
p0.save(os.path.join(path, metadata_filename)) | ||
|
||
foreign_keys = {} | ||
|
||
def infer_resource_basic_foreign_keys(resource): | ||
"""Prepare foreign_keys dict for building.infer_metadata | ||
|
||
Compare the fields of a resource to a list of field names known | ||
to be foreign keys. If the field name is within the list, it is | ||
used to populate the dict 'foreign_keys' | ||
""" | ||
for field in resource.schema.fields: | ||
if field.name in config.SPECIAL_FIELD_NAMES: | ||
fk_descriptor = config.SPECIAL_FIELD_NAMES[field.name] | ||
if fk_descriptor in foreign_keys: | ||
if resource.name not in foreign_keys[fk_descriptor]: | ||
foreign_keys[fk_descriptor].append(resource.name) | ||
else: | ||
foreign_keys[fk_descriptor] = [resource.name] | ||
|
||
for r in p0.resources: | ||
if os.sep + "elements" + os.sep in r.descriptor["path"]: | ||
infer_resource_basic_foreign_keys(r) | ||
# this function saves the metadata of the package in json format | ||
infer_metadata( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because this function does already part of the job if provided a dict |
||
package_name=package_name, | ||
path=path, | ||
foreign_keys=foreign_keys, | ||
metadata_filename=metadata_filename, | ||
) | ||
|
||
# reload the package from the saved json file | ||
p = Package(os.path.join(path, metadata_filename)) | ||
infer_package_foreign_keys(p) | ||
p.descriptor["resources"].sort(key=lambda x: (x["path"], x["name"])) | ||
p.commit() | ||
p.save(os.path.join(path, metadata_filename)) | ||
|
||
|
||
def infer_metadata( | ||
package_name="default-name", | ||
keep_resources=False, | ||
|
@@ -231,6 +434,7 @@ def infer_metadata( | |
) | ||
p.add_resource(r.descriptor) | ||
|
||
p.descriptor["resources"].sort(key=lambda x: (x["path"], x["name"])) | ||
p.commit() | ||
p.save(metadata_filename) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here one has to use `os.sep` instead of `"/"`.