From 2fa0dc84981e0aadbed5245b86c5e9bb7ed20efc Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 29 Feb 2024 18:24:16 +0100 Subject: [PATCH 01/32] add build/ to gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9298a2c..618a9b6 100644 --- a/.gitignore +++ b/.gitignore @@ -29,4 +29,5 @@ venv.bak/ *.egg-info/ -dist/ \ No newline at end of file +dist/ +build/ From 4c74bc0d97a72c418848bfdf241c6deda1c7cff6 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 29 Feb 2024 18:24:23 +0100 Subject: [PATCH 02/32] add mkdocs to requirements --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9c78563..2ec6e03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ classifiers = [ Repository = "https://github.com/UUDigitalHumanitieslab/I-analyzer" [project.optional-dependencies] -dev = ['pytest'] +dev = ['pytest', 'mkdocs'] [tool.setuptools] packages = [ From 3bc77e52dafcf6e60cc53721e675a8512ca82b01 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 29 Feb 2024 18:29:30 +0100 Subject: [PATCH 03/32] generate empty documentation site --- CONTRIBUTING.md | 26 +++++++++++++++++++++++++- docs/index.md | 3 +++ mkdocs.yml | 1 + 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 docs/index.md create mode 100644 mkdocs.yml diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9916112..e27eb55 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,4 +14,28 @@ Run unit tests with ```sh pytest -``` \ No newline at end of file +``` + +## Writing documentation + +Documentation is based on [mkdocs](https://www.mkdocs.org). 
+ +### Commands + +Start the live-reloading docs server: + +```sh +mkdocs serve +``` + +Build the documentation site: + +```sh +mkdocs build +``` + +Print help message and exit: + +```sh +mkdocs -h +``` diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..5cf6899 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,3 @@ +# I-analyzer Readers documentation + +Welcome! This documentation is a work in progress. diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..ae98ccf --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1 @@ +site_name: I-analyzer Readers From 888c6c82e2c826159dd7e780b6c3f136107df9c3 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 29 Feb 2024 18:31:19 +0100 Subject: [PATCH 04/32] add mkdocstrings-python to requirements --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2ec6e03..a78c4cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ classifiers = [ Repository = "https://github.com/UUDigitalHumanitieslab/I-analyzer" [project.optional-dependencies] -dev = ['pytest', 'mkdocs'] +dev = ['pytest', 'mkdocs', 'mkdocstrings-python'] [tool.setuptools] packages = [ From 5ab11a755ceef43dc751f2ef89c44761ee81bbf0 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 29 Feb 2024 18:32:51 +0100 Subject: [PATCH 05/32] update mkdocs config --- mkdocs.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mkdocs.yml b/mkdocs.yml index ae98ccf..8f09a84 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1 +1,9 @@ site_name: I-analyzer Readers +repo_url: https://github.com/UUDigitalHumanitieslab/ianalyzer-readers +plugins: + - mkdocstrings +nav: + - 'index.md' +watch: + - docs + - ianalyzer_readers \ No newline at end of file From cf496704670678d54161760fe6f5ed3412bfc9f7 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 29 Feb 2024 18:35:19 +0100 Subject: [PATCH 06/32] add intro from readme --- docs/index.md | 8 +++++++- 1 file 
changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 5cf6899..1c85dad 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,3 +1,9 @@ # I-analyzer Readers documentation -Welcome! This documentation is a work in progress. +**This documentation is a work in progress.** + +`ianalyzer-readers` is a python module to extract data from XML, HTML, CSV or XLSX files. + +This module was originally created for [I-analyzer](https://github.com/UUDigitalHumanitieslab/I-analyzer), a web application that extracts data from a variety of datasets, indexes them and presents a search interface. To do this, we wanted a way to extract data from source files without having to write a new script "from scratch" for each dataset, and an API that would work the same regardless of the source file type. + +The basic usage is that you will use the utilities in this package to create a `Reader` class tailored to a dataset. You specify what your data looks like, and then call the `documents()` method of the reader to get an iterator of documents - where each document is a flat dictionary of key/value pairs. 
From 26f55adb2bd7628b1b581a88975d1b53a38ebf84 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 29 Feb 2024 18:40:17 +0100 Subject: [PATCH 07/32] scaffold file structure, add installation instructions --- docs/api.md | 1 + docs/examples.md | 1 + docs/index.md | 12 +++++++++++- mkdocs.yml | 2 ++ 4 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 docs/api.md create mode 100644 docs/examples.md diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 0000000..56147eb --- /dev/null +++ b/docs/api.md @@ -0,0 +1 @@ +# API documentation \ No newline at end of file diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 0000000..1b976a9 --- /dev/null +++ b/docs/examples.md @@ -0,0 +1 @@ +# Examples \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 1c85dad..af2b109 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,4 +1,4 @@ -# I-analyzer Readers documentation +# Getting started **This documentation is a work in progress.** @@ -7,3 +7,13 @@ This module was originally created for [I-analyzer](https://github.com/UUDigitalHumanitieslab/I-analyzer), a web application that extracts data from a variety of datasets, indexes them and presents a search interface. To do this, we wanted a way to extract data from source files without having to write a new script "from scratch" for each dataset, and an API that would work the same regardless of the source file type. The basic usage is that you will use the utilities in this package to create a `Reader` class tailored to a dataset. You specify what your data looks like, and then call the `documents()` method of the reader to get an iterator of documents - where each document is a flat dictionary of key/value pairs. + +## Installation + +Requires [Python](https://python.org) 3.8 or later. 
This package can be installed via pip: + +```sh +pip install ianalyzer_readers +``` + +Consult the [PyPI documentation](https://packaging.python.org/en/latest/tutorials/installing-packages/) if you are unsure how to install packages in Python. \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 8f09a84..2341d89 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,6 +4,8 @@ plugins: - mkdocstrings nav: - 'index.md' + - 'api.md' + - 'examples.md' watch: - docs - ianalyzer_readers \ No newline at end of file From 2edcd973ef7d5af3de9d483804e055a8225ad40d Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 29 Feb 2024 18:59:47 +0100 Subject: [PATCH 08/32] generate documentation from docstrings --- docs/api.md | 26 +++++++++++++++++++++++++- ianalyzer_readers/readers/__init__.py | 0 mkdocs.yml | 8 +++++++- 3 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 ianalyzer_readers/readers/__init__.py diff --git a/docs/api.md b/docs/api.md index 56147eb..3225af9 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1 +1,25 @@ -# API documentation \ No newline at end of file +# API documentation + +## Core classes + +::: ianalyzer_readers.readers.core + +## CSV reader + +::: ianalyzer_readers.readers.csv + +## XLSX reader + +::: ianalyzer_readers.readers.xlsx + +## XML reader + +::: ianalyzer_readers.readers.xml + +## HTML reader + +::: ianalyzer_readers.readers.html + +## Extractors + +::: ianalyzer_readers.extract diff --git a/ianalyzer_readers/readers/__init__.py b/ianalyzer_readers/readers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mkdocs.yml b/mkdocs.yml index 2341d89..8e560cf 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -8,4 +8,10 @@ nav: - 'examples.md' watch: - docs - - ianalyzer_readers \ No newline at end of file + - ianalyzer_readers +plugins: +- mkdocstrings: + handlers: + python: + options: + heading_level: 3 \ No newline at end of file From 45f2154c0e8251202c4e8f95e137f0fb2f8e0066 Mon Sep 17 00:00:00 2001 
From: Luka van der Plas Date: Thu, 29 Feb 2024 19:47:23 +0100 Subject: [PATCH 09/32] docstrings for core module --- ianalyzer_readers/readers/core.py | 168 +++++++++++++++--------------- 1 file changed, 82 insertions(+), 86 deletions(-) diff --git a/ianalyzer_readers/readers/core.py b/ianalyzer_readers/readers/core.py index 29cd8ee..9e4922e 100644 --- a/ianalyzer_readers/readers/core.py +++ b/ianalyzer_readers/readers/core.py @@ -1,10 +1,15 @@ ''' -More with core classes +This module defines the base classes on which all Readers are built. + +It implements very little functionality of its own, but defines the interface +that Readers implement. + +The module defines two classes, `Field` and `Reader`. ''' from .. import extract from datetime import datetime - +from typing import List, Iterable, Dict, Any import logging logger = logging.getLogger() @@ -12,59 +17,99 @@ class Reader(object): ''' - Subclasses of this class define corpora and their documents by specifying: + A base class for readers. Readers are objects that can generate documents + from a source dataset. + + Subclasses of `Reader` can be created to read particular data formats or even + particular datasets. + + The `Reader` class is not intended to be used directly. Some methods need to + be implemented in child components, and will raise `NotImplementedError` if + you try to use `Reader` directly. + + A fully implemented `Reader` subclass will define how to read a dataset by + describing: - How to obtain its source files. - - What attributes its documents have. - - How to extract said attributes from the source files. - - What each attribute looks like in terms of the search form. + - What fields each document contains. + - How to extract said fields from the source files. ''' @property - def data_directory(self): + def data_directory(self) -> str: ''' Path to source data directory. + + Raises: + NotImplementedError: This method needs to be implementd on child + classes. 
It will raise an error by default. ''' - raise NotImplementedError('CorpusDefinition missing data_directory') + raise NotImplementedError('Reader missing data_directory') @property - def fields(self): + def fields(self) -> List: ''' - Each corpus should implement a list of fields, that is, instances of - the `Field` class, containing information about each attribute. - MUST include a field with `name='id'`. + The list of fields that are extracted from documents. + + These should be instances of the `Field` class (or implement the same API). + + Raises: + NotImplementedError: This method needs to be implementd on child + classes. It will raise an error by default. ''' - raise NotImplementedError('CorpusDefinition missing fields') + raise NotImplementedError('Reader missing fields') @property - def fieldnames(self): + def fieldnames(self) -> List[str]: + ''' + A list containing the name of each field of this Reader + ''' return [field.name for field in self.fields] - def sources(self, **kwargs): + def sources(self, **kwargs) -> Iterable: ''' - Obtain source files for the corpus, relevant to the given timespan. + Obtain source files for the Reader. - Specifically, returns an iterator of tuples that each contain a string - filename and a dictionary of associated metadata. The latter is usually - empty or contains only a timestamp; but any data that is to be - extracted without reading the file itself can be specified there. + Returns: + an iterable of tuples that each contain a string path, and a dictionary + with associated metadata. The metadata can contain any data that was + extracted before reading the file itself, such as data based on the + file path, or on a metadata file. + + Raises: + NotImplementedError: This method needs to be implementd on child + classes. It will raise an error by default. 
''' raise NotImplementedError('CorpusDefinition missing sources') - def source2dicts(self, sources): + def source2dicts(self, source) -> Iterable[Dict[str, Any]]: ''' - Generate an iterator of document dictionaries from a given source file. + Given a source file, returns an iterable of extracted documents. + + Returns: + an iterable of document dictionaries. Each of these is a dictionary, + where the keys are names of this Reader's `fields`, and the values + are based on the extractor of each field. - The dictionaries are created from this corpus' `Field`s. + Raises: + NotImplementedError: This method needs to be implementd on child + classes. It will raise an error by default. ''' raise NotImplementedError('CorpusDefinition missing source2dicts') - def documents(self, sources=None): + def documents(self, sources:Iterable[str]=None) -> Iterable[Dict[str, Any]]: ''' - Generate an iterator of document dictionaries directly from the source - files. The source files are generated by self.sources(); however, if - `sources` is specified, those source/metadata tuples are used instead. + Returns an iterable of extracted documents from source files. + + Parameters: + sources: an iterable of paths to source files. If omitted, the reader + class will use the value of `self.sources()` instead. + + Returns: + an iterable of document dictionaries. Each of these is a dictionary, + where the keys are names of this Reader's `fields`, and the values + are based on the extractor of each field. ''' sources = sources or self.sources() return (document @@ -88,11 +133,16 @@ def _reject_extractors(self, *inapplicable_extractors): class Field(object): ''' - Fields hold the following data: - - a short hand name (name), which will be used as its key in the document - - how to extract data from the source documents (extractor) - - whether this field is required - - whether this field should be skipped + Fields are the elements of information that you wish to extract from each document. 
+ + Parameters: + name: a short hand name (name), which will be used as its key in the document + extractor: an Extractor object that defines how this field's data can be + extracted from source documents. + required: whether this field is required. The `Reader` class should skip the + document is the value for this Field is `None`, though this is not supported + for all readers. + skip: if `True`, this field will not be included in the results. ''' def __init__(self, @@ -107,57 +157,3 @@ def __init__(self, self.extractor = extractor self.required = required self.skip = skip - -# Helper functions ############################################################ - - -def string_contains(target): - ''' - Return a predicate that performs a case-insensitive search for the target - string and returns whether it was found. - ''' - def f(string): - return bool(target.lower() in string.lower() if string else False) - return f - - -def until(year): - ''' - Returns a predicate to determine from metadata whether its 'date' field - represents a date before or on the given year. - ''' - def f(metadata): - date = metadata.get('date') - return date and date.year <= year - return f - - -def after(year): - ''' - Returns a predicate to determine from metadata whether its 'date' field - represents a date after the given year. 
- ''' - def f(metadata): - date = metadata.get('date') - return date and date.year > year - return f - - -def consolidate_start_end_years(start, end, min_date, max_date): - ''' given a start and end date provided by the user, make sure - - that start is not before end - - that start is not before min_date (corpus variable) - - that end is not after max_date (corpus variable) - ''' - if isinstance(start, int): - start = datetime(year=start, month=1, day=1) - if isinstance(end, int): - end = datetime(year=end, month=12, day=31) - if start > end: - tmp = start - start = end - end = tmp - if start < min_date: - start = min_date - if end > max_date: - end = max_date From 6e4fb1c8a125cab84555f7a91f00cbb46b333dc0 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 1 Mar 2024 11:17:37 +0100 Subject: [PATCH 10/32] typing improvements, docstrings for document csv reader --- ianalyzer_readers/readers/core.py | 43 ++++++++++++++++++++----- ianalyzer_readers/readers/csv.py | 52 ++++++++++++++++++++++--------- 2 files changed, 73 insertions(+), 22 deletions(-) diff --git a/ianalyzer_readers/readers/core.py b/ianalyzer_readers/readers/core.py index 9e4922e..df48bd8 100644 --- a/ianalyzer_readers/readers/core.py +++ b/ianalyzer_readers/readers/core.py @@ -9,11 +9,29 @@ from .. import extract from datetime import datetime -from typing import List, Iterable, Dict, Any +from typing import List, Iterable, Dict, Any, Union, Tuple import logging logger = logging.getLogger() +Source = Union[str, Tuple[str, Dict], bytes] +''' +Type definition for the source input to some Reader methods. + +Sources are either: + +- a string with the path to a filename +- a tuple containing a path to a filename, and a dictionary with metadata +- binary data with the file contents. This is not supported on all Reader subclasses. +''' + +Document = Dict[str, Any] +''' +Type definition for documents, defined for convenience. 
+ +Each document extracted by a Reader is a dictionary, where the keys are names of +the Reader's `fields`, and the values are based on the extractor of each field. +''' class Reader(object): ''' @@ -67,7 +85,7 @@ def fieldnames(self) -> List[str]: ''' return [field.name for field in self.fields] - def sources(self, **kwargs) -> Iterable: + def sources(self, **kwargs) -> Iterable[Source]: ''' Obtain source files for the Reader. @@ -83,22 +101,27 @@ def sources(self, **kwargs) -> Iterable: ''' raise NotImplementedError('CorpusDefinition missing sources') - def source2dicts(self, source) -> Iterable[Dict[str, Any]]: + def source2dicts(self, source: Source) -> Iterable[Document]: ''' Given a source file, returns an iterable of extracted documents. + Parameters: + source: the source file to extract. This can be a string with the path to + the file, or a tuple with a path and a dictionary containing metadata. + Some reader subclasses may also support bytes as input. + Returns: an iterable of document dictionaries. Each of these is a dictionary, where the keys are names of this Reader's `fields`, and the values are based on the extractor of each field. Raises: - NotImplementedError: This method needs to be implementd on child + NotImplementedError: This method needs to be implemented on child classes. It will raise an error by default. ''' raise NotImplementedError('CorpusDefinition missing source2dicts') - def documents(self, sources:Iterable[str]=None) -> Iterable[Dict[str, Any]]: + def documents(self, sources:Iterable[Source]=None) -> Iterable[Document]: ''' Returns an iterable of extracted documents from source files. @@ -121,8 +144,14 @@ class will use the value of `self.sources()` instead. def _reject_extractors(self, *inapplicable_extractors): ''' - Raise errors if any fields use extractors that are not applicable - for the corpus. + Raise errors if any fields use any of the given extractors. 
+ + This can be used to check that fields use extractors that match + the Reader subclass. + + Raises: + RuntimeError: raised when a field uses an extractor that is provided + in the input. ''' for field in self.fields: if isinstance(field.extractor, inapplicable_extractors): diff --git a/ianalyzer_readers/readers/csv.py b/ianalyzer_readers/readers/csv.py index 6e1a455..1d5825c 100644 --- a/ianalyzer_readers/readers/csv.py +++ b/ianalyzer_readers/readers/csv.py @@ -1,11 +1,12 @@ ''' -Module for the CSV reader +This module defines the CSV reader. Extraction is based on python's `csv` library. ''' from .. import extract -from .core import Reader +from typing import Generator, List, Dict +from .core import Reader, Document, Source import csv import sys @@ -16,38 +17,51 @@ class CSVReader(Reader): ''' - An CSVReader extracts data from comma separated value files. + A base class for Readers of .csv (comma separated value) files. - By default, the reader will extract one document per row, but you - can also set `field_entry` to group grows. + The CSVReader is designed for .csv or .tsv files that have a header row, and where + each file may list multiple documents. + + By default, the reader will extract one document for each row in a csv file, but + you can also set the `field_entry` property to group rows. ''' field_entry = None ''' - If applicable, the field that identifies entries. Subsequent rows with the same - value for this field are treated as a single document. If left blank, each row + If applicable, the column that identifies entries. Subsequent rows with the same + value for this column are treated as a single document. If left blank, each row is treated as a document. ''' required_field = None ''' - Specifies a required field, for example the main content. Rows with - an empty value for `required_field` will be skipped. + Specifies a required column in the CSV data, for example the main content. 
Rows + with an empty value for `required_field` will be skipped. ''' delimiter = ',' ''' - The delimiter for the CSV reader. + The column delimiter used in the CSV data ''' skip_lines = 0 ''' - Number of lines to skip before reading the header + Number of lines in the file to skip before reading the header. Can be used when files + use a fixed "preamble", e.g. to provide metadata or provenance. ''' - def source2dicts(self, source): + def source2dicts(self, source: Source) -> Generator[Document]: ''' - Generate document dicts from a CSV file + Given a CSV source file, returns an iterable of extracted documents. + + Parameters: + source: the source file to extract. This can be a string with the path to + the file, or a tuple with a path and a dictionary containing metadata. + + Returns: + an iterable of document dictionaries. Each of these is a dictionary, + where the keys are names of this Reader's `fields`, and the values + are based on the extractor of each field. ''' # make sure the field size is as big as the system permits @@ -96,16 +110,24 @@ def source2dicts(self, source): yield self.document_from_rows(rows, metadata, index) - def document_from_rows(self, rows, metadata, row_index): + def document_from_rows(self, rows: List[Dict], metadata: Dict, doc_index: int) -> Document: ''' Extract a single document from a list of rows + + Parameters: + rows: a list of row data. Since the CSVReader uses `csv.DictReader`, each row + is expected to be a dictionary. + metadata: a dictionary with file metadata. + doc_index: the index of this document in the source file. The first document + extracted from a file should have index 0, the second should have index 1, + and so forth. 
''' doc = { field.name: field.extractor.apply( # The extractor is put to work by simply throwing at it # any and all information it might need - rows=rows, metadata = metadata, index=row_index + rows=rows, metadata = metadata, index=doc_index ) for field in self.fields if not field.skip } From 1aee3fe7aa5252a1f9a0102ad3bcebd22b42241f Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 1 Mar 2024 11:37:43 +0100 Subject: [PATCH 11/32] docstrings for xlsx reader --- ianalyzer_readers/readers/csv.py | 23 +++++----- ianalyzer_readers/readers/xlsx.py | 72 ++++++++++++++++++++++++------- 2 files changed, 70 insertions(+), 25 deletions(-) diff --git a/ianalyzer_readers/readers/csv.py b/ianalyzer_readers/readers/csv.py index 1d5825c..b340350 100644 --- a/ianalyzer_readers/readers/csv.py +++ b/ianalyzer_readers/readers/csv.py @@ -22,21 +22,24 @@ class CSVReader(Reader): The CSVReader is designed for .csv or .tsv files that have a header row, and where each file may list multiple documents. - By default, the reader will extract one document for each row in a csv file, but - you can also set the `field_entry` property to group rows. + The data should be structured in one of the following ways: + + - one document per row (this is the default) + - each document spans a number of consecutive rows. In this case, there should be a + column that indicates the identity of the document. ''' field_entry = None ''' - If applicable, the column that identifies entries. Subsequent rows with the same - value for this column are treated as a single document. If left blank, each row + If applicable, the name of the column that identifies entries. Subsequent rows with the + same value for this column are treated as a single document. If left blank, each row is treated as a document. ''' required_field = None ''' - Specifies a required column in the CSV data, for example the main content. Rows - with an empty value for `required_field` will be skipped. 
+ Specifies the name of a required column in the CSV data, for example the main content. + Rows with an empty value for `required_field` will be skipped. ''' delimiter = ',' @@ -47,7 +50,7 @@ class CSVReader(Reader): skip_lines = 0 ''' Number of lines in the file to skip before reading the header. Can be used when files - use a fixed "preamble", e.g. to provide metadata or provenance. + use a fixed "preamble", e.g. to describe metadata or provenance. ''' def source2dicts(self, source: Source) -> Generator[Document]: @@ -102,15 +105,15 @@ def source2dicts(self, source: Source) -> Generator[Document]: document_id = identifier if is_new_document and rows: - yield self.document_from_rows(rows, metadata, index) + yield self._document_from_rows(rows, metadata, index) rows = [row] index += 1 else: rows.append(row) - yield self.document_from_rows(rows, metadata, index) + yield self._document_from_rows(rows, metadata, index) - def document_from_rows(self, rows: List[Dict], metadata: Dict, doc_index: int) -> Document: + def _document_from_rows(self, rows: List[Dict], metadata: Dict, doc_index: int) -> Document: ''' Extract a single document from a list of rows diff --git a/ianalyzer_readers/readers/xlsx.py b/ianalyzer_readers/readers/xlsx.py index d7235f9..eb6abcc 100644 --- a/ianalyzer_readers/readers/xlsx.py +++ b/ianalyzer_readers/readers/xlsx.py @@ -1,8 +1,9 @@ import logging import openpyxl from openpyxl.worksheet.worksheet import Worksheet +from typing import Generator -from .core import Reader +from .core import Reader, Document, Source from .. import extract logger = logging.getLogger() @@ -10,29 +11,55 @@ class XLSXReader(Reader): ''' - Parent class for corpora that extract data from excel spreadsheets + A base class for Readers that extract data from .xlsx spreadsheets + + The XLSXReader is quite rudimentary, and is designed to extract data from + spreadsheets that are formatted like a CSV table, with a clear column layout. The + sheet should have a header row. 
+ + The data should be structured in one of the following ways: + + - one document per row (this is the default) + - each document spans a number of consecutive rows. In this case, there should be a + column that indicates the identity of the document. + + The XLSXReader will only look at the _first_ sheet in each file. ''' + field_entry = None ''' - If applicable, the field that identifies entries. Subsequent rows with the same - value for this field are treated as a single document. If left blank, each row + If applicable, the name of column that identifies entries. Subsequent rows with the + same value for this column are treated as a single document. If left blank, each row is treated as a document. ''' - field_entry = None + required_field = None ''' - Specifies a required field, for example the main content. Rows with + Specifies the name of a required column, for example the main content. Rows with an empty value for `required_field` will be skipped. ''' - required_field = None + skip_lines = 0 ''' - Number of lines to skip before reading the header + Number of lines in the sheet to skip before reading the header. Can be used when files + use a fixed "preamble", e.g. to describe metadata or provenance. ''' - skip_lines = 0 - def source2dicts(self, source): - # make sure the field size is as big as the system permits + + def source2dicts(self, source: Source) -> Generator[Document]: + ''' + Given an XLSX source file, returns an iterable of extracted documents. + + Parameters: + source: the source file to extract. This can be a string with the path to + the file, or a tuple with a path and a dictionary containing metadata. + + Returns: + an iterable of document dictionaries. Each of these is a dictionary, + where the keys are names of this Reader's `fields`, and the values + are based on the extractor of each field. 
+ ''' + self._reject_extractors(extract.XML, extract.FilterAttribute) if isinstance(source, str): @@ -51,6 +78,10 @@ def source2dicts(self, source): return self._sheet2dicts(sheet, metadata) def _sheet2dicts(self, sheet: Worksheet, metadata): + ''' + Extract documents from a single worksheet + ''' + data = (row for row in sheet.values) for _ in range(self.skip_lines): @@ -77,19 +108,30 @@ def _sheet2dicts(self, sheet: Worksheet, metadata): document_id = identifier if is_new_document and rows: - yield self.document_from_rows(rows, metadata, index) + yield self._document_from_rows(rows, metadata, index) rows = [values] index += 1 else: rows.append(values) if rows: - yield self.document_from_rows(rows, metadata, index) + yield self._document_from_rows(rows, metadata, index) + + def _document_from_rows(self, rows, metadata, doc_index): + ''' + Extract a single document from a list of row data + + Parameters: + rows: a list of row data. Each row is expected to be a dictionary. + metadata: a dictionary with file metadata. + doc_index: the index of this document in the source file. The first document + extracted from a file should have index 0, the second should have index 1, + and so forth. 
+ ''' - def document_from_rows(self, rows, metadata, row_index): doc = { field.name: field.extractor.apply( - rows=rows, metadata=metadata, index=row_index + rows=rows, metadata=metadata, index=doc_index ) for field in self.fields if not field.skip } From 1ff859fde68fd8c128b95a89ed10e3366323857b Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 1 Mar 2024 11:57:23 +0100 Subject: [PATCH 12/32] docstrings and typing for xml reader --- ianalyzer_readers/readers/csv.py | 4 +- ianalyzer_readers/readers/xlsx.py | 4 +- ianalyzer_readers/readers/xml.py | 88 ++++++++++++++++++++++--------- 3 files changed, 66 insertions(+), 30 deletions(-) diff --git a/ianalyzer_readers/readers/csv.py b/ianalyzer_readers/readers/csv.py index b340350..2d9db31 100644 --- a/ianalyzer_readers/readers/csv.py +++ b/ianalyzer_readers/readers/csv.py @@ -5,7 +5,7 @@ ''' from .. import extract -from typing import Generator, List, Dict +from typing import List, Dict, Iterable from .core import Reader, Document, Source import csv import sys @@ -53,7 +53,7 @@ class CSVReader(Reader): use a fixed "preamble", e.g. to describe metadata or provenance. ''' - def source2dicts(self, source: Source) -> Generator[Document]: + def source2dicts(self, source: Source) -> Iterable[Document]: ''' Given a CSV source file, returns an iterable of extracted documents. diff --git a/ianalyzer_readers/readers/xlsx.py b/ianalyzer_readers/readers/xlsx.py index eb6abcc..95de0c2 100644 --- a/ianalyzer_readers/readers/xlsx.py +++ b/ianalyzer_readers/readers/xlsx.py @@ -1,7 +1,7 @@ import logging import openpyxl from openpyxl.worksheet.worksheet import Worksheet -from typing import Generator +from typing import Iterable from .core import Reader, Document, Source from .. 
import extract @@ -46,7 +46,7 @@ class XLSXReader(Reader): ''' - def source2dicts(self, source: Source) -> Generator[Document]: + def source2dicts(self, source: Source) -> Iterable[Document]: ''' Given an XLSX source file, returns an iterable of extracted documents. diff --git a/ianalyzer_readers/readers/xml.py b/ianalyzer_readers/readers/xml.py index 835d1bf..2bd064d 100644 --- a/ianalyzer_readers/readers/xml.py +++ b/ianalyzer_readers/readers/xml.py @@ -1,48 +1,84 @@ ''' -Module for the XML reader +This module defines the XML Reader. -Extraction is based on beautiful soup. +Extraction is based on BeautifulSoup. ''' from .. import extract -from .core import Reader +from .core import Reader, Source, Document import itertools import bs4 import logging +from typing import Union, Dict, Callable, Any, Iterable logger = logging.getLogger() +TagSpecification = Union[ + None, + str, + Dict[str, Any], + Callable[[Any, Dict], Union[None, str, Dict[str, Any]]] +] +''' +A specification for an XML tag used in the `XMLReader`. + +These can be: + +- None +- a string with the name of the tag +- a dictionary with the named arguments that should be passed to the `find()` / `find_all()` + method of a BeautifulSoup node. +- A callable that takes an `XMLReader` instance and a dictionary with file metadata, and + returns any of the above. +''' class XMLReader(Reader): ''' - An XMLReader extracts data from XML sources + A base class for Readers that extract data from XML files. + + The built-in functionality of the XML reader is quite versatile, and can be further expanded by + adding custom functions to XML extractors that interact directly with BeautifulSoup nodes. + + The Reader is suitable for datasets where each file should be extracted as a single document, or + ones where each file contains multiple documents. ''' - tag_toplevel = None + tag_toplevel: TagSpecification = None ''' The top-level tag in the source documents. 
Can be: + - None - A string with the name of the tag - A dictionary that gives the named arguments to soup.find_all() - A bound method that takes the metadata of the document as input and outputs one of the above. ''' - tag_entry = None + tag_entry: TagSpecification = None ''' The tag that corresponds to a single document entry. Can be: + - None - A string with the name of the tag - A dictionary that gives the named arguments to soup.find_all() - A bound method that takes the metadata of the document as input and outputs one of the above. ''' - def source2dicts(self, source): + def source2dicts(self, source: Source) -> Iterable[Document]: ''' - Generate document dictionaries from a given XML file. + Given an XML source file, returns an iterable of extracted documents. + + Parameters: + source: the source file to extract. This can be a string with the path to + the file, or a tuple with a path and a dictionary containing metadata. + + Returns: + an iterable of document dictionaries. Each of these is a dictionary, + where the keys are names of this Reader's `fields`, and the values + are based on the extractor of each field. 
''' # Make sure that extractors are sensible self._reject_extractors(extract.CSV) @@ -52,22 +88,22 @@ def source2dicts(self, source): if isinstance(source, str): # no metadata filename = source - soup = self.soup_from_xml(filename) + soup = self._soup_from_xml(filename) elif isinstance(source, bytes): - soup = self.soup_from_data(source) + soup = self._soup_from_data(source) filename = soup.find('RecordID') else: filename = source[0] - soup = self.soup_from_xml(filename) + soup = self._soup_from_xml(filename) metadata = source[1] or None - soup = self.soup_from_xml(filename) + soup = self._soup_from_xml(filename) if metadata and 'external_file' in metadata: external_fields = [field for field in self.fields if isinstance(field.extractor, extract.XML) and field.extractor.external_file] regular_fields = [field for field in self.fields if field not in external_fields] - external_soup = self.soup_from_xml(metadata['external_file']) + external_soup = self._soup_from_xml(metadata['external_file']) else: regular_fields = self.fields external_dict = {} @@ -75,8 +111,8 @@ def source2dicts(self, source): required_fields = [ field.name for field in self.fields if field.required] # Extract fields from the soup - tag = self.get_tag_requirements(self.tag_entry, metadata) - bowl = self.bowl_from_soup(soup, metadata=metadata) + tag = self._get_tag_requirements(self.tag_entry, metadata) + bowl = self._bowl_from_soup(soup, metadata=metadata) if bowl: spoonfuls = bowl.find_all(**tag) if tag else [bowl] for i, spoon in enumerate(spoonfuls): @@ -91,7 +127,7 @@ def source2dicts(self, source): external_dict = {} if external_fields: metadata.update(regular_field_dict) - external_dict = self.external_source2dict( + external_dict = self._external_source2dict( external_soup, external_fields, metadata) # yield the union of external fields and document fields @@ -106,7 +142,7 @@ def source2dicts(self, source): logger.warning( 'Top-level tag not found in `{}`'.format(filename)) - def 
get_tag_requirements(self, specification, metadata): + def _get_tag_requirements(self, specification, metadata): ''' Get the requirements for a tag given the specification and metadata. @@ -133,7 +169,7 @@ def get_tag_requirements(self, specification, metadata): else: raise TypeError('Tag must be a string or dict') - def external_source2dict(self, soup, external_fields, metadata): + def _external_source2dict(self, soup, external_fields, metadata): ''' given an external xml file with metadata, return a dictionary with tags which were found in that metadata @@ -141,7 +177,7 @@ def external_source2dict(self, soup, external_fields, metadata): ''' external_dict = {} for field in external_fields: - bowl = self.bowl_from_soup( + bowl = self._bowl_from_soup( soup, field.extractor.external_file['xml_tag_toplevel']) spoon = None if field.extractor.secondary_tag: @@ -167,7 +203,7 @@ def external_source2dict(self, soup, external_fields, metadata): 'Top-level tag not found in `{}`'.format(bowl)) return external_dict - def soup_from_xml(self, filename): + def _soup_from_xml(self, filename): ''' Returns beatifulsoup soup object for a given xml file ''' @@ -176,25 +212,25 @@ def soup_from_xml(self, filename): with open(filename, 'rb') as f: data = f.read() logger.info('Loaded {} into memory...'.format(filename)) - return self.soup_from_data(data) + return self._soup_from_data(data) - def soup_from_data(self, data): + def _soup_from_data(self, data): ''' Parses content of a xml file ''' return bs4.BeautifulSoup(data, 'lxml-xml') - def bowl_from_soup(self, soup, toplevel_tag=None, entry_tag=None, metadata = {}): + def _bowl_from_soup(self, soup, toplevel_tag=None, entry_tag=None, metadata = {}): ''' Returns bowl (subset of soup) of soup object. Bowl contains everything within the toplevel tag. If no such tag is present, it contains the entire soup. 
''' if toplevel_tag == None: - toplevel_tag = self.get_tag_requirements(self.tag_toplevel, metadata) + toplevel_tag = self._get_tag_requirements(self.tag_toplevel, metadata) return soup.find(**toplevel_tag) if toplevel_tag else soup - def metadata_from_xml(self, filename, tags): + def _metadata_from_xml(self, filename, tags): ''' Given a filename of an xml with metadata, and a range of tags to extract, return a dictionary of all the contents of the requested tags. @@ -206,7 +242,7 @@ def metadata_from_xml(self, filename, tags): } ''' out_dict = {} - soup = self.soup_from_xml(filename) + soup = self._soup_from_xml(filename) for tag in tags: if isinstance(tag, str): tag_info = soup.find(tag) From 2e024842bd6922443db587e75a15e91d33d01af5 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 1 Mar 2024 12:16:24 +0100 Subject: [PATCH 13/32] typing and docstrings for html reader --- ianalyzer_readers/readers/html.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/ianalyzer_readers/readers/html.py b/ianalyzer_readers/readers/html.py index a2e5268..ba0f1c3 100644 --- a/ianalyzer_readers/readers/html.py +++ b/ianalyzer_readers/readers/html.py @@ -1,25 +1,40 @@ ''' -Mode for the HTML reader. +This module defines the HTML Reader. -The HTML reader is implemented as a subclas of the XML reader. +The HTML reader is implemented as a subclass of the XML reader, and uses +BeautifulSoup to parse files. ''' from .. import extract +from .core import Source, Document from .xml import XMLReader import bs4 import logging +from typing import Iterable logger = logging.getLogger() class HTMLReader(XMLReader): ''' - An HTML reader extracts data from HTML sources. It is based on the XML reader. + An HTML reader extracts data from HTML sources. + + It is based on the XMLReader and supports the same options (`tag_toplevel` and + `tag_entry`).
''' - def source2dicts(self, source): + def source2dicts(self, source: Source) -> Iterable[Document]: ''' - Generate document dictionaries from a given HTML file. + Given an HTML source file, returns an iterable of extracted documents. + + Parameters: + source: the source file to extract. This can be a string with the path to + the file, or a tuple with a path and a dictionary containing metadata. + + Returns: + an iterable of document dictionaries. Each of these is a dictionary, + where the keys are names of this Reader's `fields`, and the values + are based on the extractor of each field. ''' (filename, metadata) = source From 31b3e6980e73a2f6adb52c6e9f05dc780a5fcebb Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 1 Mar 2024 13:34:24 +0100 Subject: [PATCH 14/32] extractor docstrings --- ianalyzer_readers/extract.py | 197 +++++++++++++++++++++++++++-------- 1 file changed, 153 insertions(+), 44 deletions(-) diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index fad35ac..b0470fb 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -1,6 +1,9 @@ ''' This module contains extractor classes that can be used to obtain values for each field in a Reader. + +Some extractors are intended to work with specific `Reader` classes, while others +are generic. ''' import bs4 @@ -8,12 +11,21 @@ import re import logging import traceback +from typing import Tuple logger = logging.getLogger() class Extractor(object): ''' + Base class for extractors. + An extractor contains a method that can be used to gather data for a field. + + Parameters: + applicable: optional predicate that takes metadata and decides whether this + extractor is applicable. If left as `None`, the extractor is always + applicable. + transform: optional function to postprocess the extracted value.
''' def __init__(self, @@ -48,6 +60,10 @@ def _apply(self, *nargs, **kwargs): ''' Actual extractor method to be implemented in subclasses (assume that testing for applicability and post-processing is taken care of). + + Raises: + NotImplementedError: This method needs to be implemented on child + classes. It will raise an error by default. ''' raise NotImplementedError() @@ -55,10 +71,29 @@ def _apply(self, *nargs, **kwargs): class Choice(Extractor): ''' Use the first applicable extractor from a list of extractors. + + This is a generic extractor that can be used in any `Reader`. + + The Choice extractor will use the `applicable` property of its provided extractors + to check which applies. + + Example usage: + + Choice(Constant('foo', applicable=some_condition), Constant('bar')) + + This would extract `'foo'` if `some_condition` is met; otherwise, + the extracted value will be `'bar'`. + + Note the difference with `Backup`: `Choice` is based on _metadata_, rather than the + extracted value. + + Parameters: + *extractors: extractors to choose from. These should be listed in descending + order of preference. ''' - def __init__(self, *nargs, **kwargs): - self.extractors = list(nargs) + def __init__(self, *extractors: Extractor, **kwargs): + self.extractors = list(extractors) super().__init__(**kwargs) def _apply(self, metadata, *nargs, **kwargs): @@ -71,10 +106,21 @@ def _apply(self, metadata, *nargs, **kwargs): class Combined(Extractor): ''' Apply all given extractors and return the results as a tuple. + + This is a generic extractor that can be used in any `Reader`. + + Example usage: + + Combined(Constant('foo'), Constant('bar')) + + This would extract `('foo', 'bar')` for each document. + + Parameters: + *extractors: extractors to combine. 
''' - def __init__(self, *nargs, **kwargs): - self.extractors = list(nargs) + def __init__(self, *extractors: Extractor, **kwargs): + self.extractors = list(extractors) super().__init__(**kwargs) def _apply(self, *nargs, **kwargs): @@ -86,9 +132,26 @@ def _apply(self, *nargs, **kwargs): class Backup(Extractor): ''' Try all given extractors in order and return the first result that evaluates as true + + This is a generic extractor that can be used in any `Reader`. + + Example usage: + + Backup(Constant(None), Constant('foo')) + + Since the first extractor returns `None`, the second extractor will be used, and the + extracted value would be `'foo'`. + + Note the difference with `Choice`: `Backup` is based on the _extracted value_, rather + than metadata. + + Parameters: + *extractors: extractors to use. These should be listed in descending order of + preference. + ''' - def __init__(self, *nargs, **kwargs): - self.extractors = list(nargs) + def __init__(self, *extractors: Extractor, **kwargs): + self.extractors = list(extractors) super().__init__(**kwargs) def _apply(self, *nargs, **kwargs): @@ -102,6 +165,13 @@ def _apply(self, *nargs, **kwargs): class Constant(Extractor): ''' This extractor 'extracts' the same value every time, regardless of input. + + This is a generic extractor that can be used in any `Reader`. + + It is especially useful in combination with `Backup` or `Choice`. + + Parameters: + value: the value that should be "extracted". ''' def __init__(self, value, *nargs, **kwargs): @@ -115,6 +185,12 @@ def _apply(self, *nargs, **kwargs): class Metadata(Extractor): ''' This extractor extracts a value from provided metadata. + + This is a generic extractor that can be used in any `Reader`. + + Parameters: + key: the key in the metadata dictionary that should be + extracted. ''' def __init__(self, key, *nargs, **kwargs): @@ -128,10 +204,19 @@ class Pass(Extractor): ''' An extractor that just passes the value of another extractor. 
- Useful if you want to stack multiple `transform` arguments + This is a generic extractor that can be used in any `Reader`. + + This is useful if you want to stack multiple `transform` arguments. For example: + + Pass(Constant('foo ', transform=str.upper), transform=str.strip) + + This will extract `str.strip(str.upper('foo '))`, i.e. `'FOO'`. + + Parameters: + extractor: the extractor of which the value should be passed ''' - def __init__(self, extractor, *nargs, **kwargs): + def __init__(self, extractor: Extractor, *nargs, **kwargs): self.extractor = extractor super().__init__(**kwargs) @@ -142,6 +227,11 @@ class Order(Extractor): ''' An extractor that returns the index of the document in its source file. + + The index of the document needs to be passed on by the `Reader`, which needs to + implement some kind of counter in its `source2dicts` method. The `Reader` subclasses + in this package all implement this, and so `Order` can safely be used in any of them. + However, custom `Reader` subclasses may not support this extractor. ''' def _apply(self, index=None, *nargs, **kwargs): @@ -150,38 +240,51 @@ class XML(Extractor): ''' Extractor for XML data. Searches through a BeautifulSoup document. + + This extractor should be used in a `Reader` based on `XMLReader`. (Note that this + includes the `HTMLReader`.) + + Parameters: + tag: Tag to select. When this is a list, read as a path (e.g. to select + successive children; makes sense when `recursive=False`). Pass `None` if the + information is in an attribute of the current head of the tree. + parent_level: Whether to ascend the tree to find the indicated tag. Useful when + a part of the tree has been selected with `secondary_tag` + attribute: Which attribute, if any, to select + flatten: Flatten the text content of a non-text children?
+ toplevel: Tag to select for search: top-level or entry tag + recursive: Whether to search all descendants + multiple: Whether to abandon the search after the first element + secondary_tag: Whether the tag's content should match a given metadata field + ('match') or string ('exact') + external_file: Whether to search other xml files for this field, and the file tag + these files should have + transform_soup_func: A function to transform the soup directly after `_select` + was called, i.e. before further processing (attributes, flattening, etc). + Keep in mind that the soup passed could be `None`. + extract_soup_func: A function to extract a value directly from the soup object, + instead of using the content string or giving an attribute. Keep in mind + that the soup passed could be `None`. ''' def __init__(self, - # Tag to select. When this is a list, read as a path - # e.g. to select successive children; makes sense when recursive=False) - # Pass None if the information is in the attribute of the - # current head of the tree tag=[], - # whether to ascend the tree to find the indicated tag - # useful when a part of the tree has been selected with secondary_tag parent_level=None, - attribute=None, # Which attribute, if any, to select - flatten=False, # Flatten the text content of a non-text children? - toplevel=False, # Tag to select for search: top-level or entry tag - recursive=False, # Whether to search all descendants - multiple=False, # Whether to abandon the search after the first element + attribute=None, + flatten=False, + toplevel=False, + recursive=False, + multiple=False, secondary_tag={ 'tag': None, 'match': None, 'exact': None, - }, # Whether the tag's content should match a given metadata field ('match') or string ('exact') - external_file={ # Whether to search other xml files for this field, and the file tag these files should have + }, + external_file={ 'xml_tag_toplevel': None, 'xml_tag_entry': None }, - # a function [e.g. 
`my_func(soup)`]` to transform the soup directly - # after _select was called, i.e. before further processing (attributes, flattening, etc). - # Keep in mind that the soup passed could be None. transform_soup_func=None, - # a function to extract a value directly from the soup object, instead of using the content string - # or giving an attribute - # Keep in mind that the soup passed could be None. extract_soup_func=None, *nargs, **kwargs @@ -327,11 +430,16 @@ def _attr(self, soup): class FilterAttribute(XML): ''' This extractor extracts attributes or contents from a BeautifulSoup node. - It is an extension of the XML extractor + + It is an extension of the `XML` extractor and adds a single parameter, + `attribute_filter`. + + Parameters: + attribute_filter: Specify an attribute / value pair by which to select content ''' def __init__(self, - attribute_filter={ # Specify an attribute / value pair by which to select content + attribute_filter={ 'attribute': None, 'value': None}, *nargs, @@ -373,11 +481,13 @@ class CSV(Extractor): ''' This extractor extracts values from a list of CSV or spreadsheet rows. + It should be used in readers based on `CSVReader` or `XLSXReader`. + Parameters: - - multiple: Boolean. If a document spans multiple rows, the extracted value for a field with - `multiple = True` is a list of the value in each row. If `multiple = False` (default), only the value - from the first row is extracted. - - convert_to_none: optional, default is `['']`. Listed values are converted to `None`. If `None`/`False`, nothing is converted. + multiple: Boolean. If a document spans multiple rows, the extracted value for a field with + `multiple = True` is a list of the value in each row. If `multiple = False` (default), only the value + from the first row is extracted. + convert_to_none: optional, default is `['']`. Listed values are converted to `None`. If `None`/`False`, nothing is converted. 
''' def __init__(self, field, @@ -402,19 +512,18 @@ def format(self, value): return value class ExternalFile(Extractor): + ''' + Free for all external file extractor that provides a stream to `stream_handler` + to do whatever is needed to extract data from an external file. Relies on `associated_file` + being present in the metadata. Note that the XMLExtractor has a built in trick to extract + data from external files (i.e. setting `external_file`), so you probably need that if your + external file is XML. + + Parameters: + stream_handler: function that will handle the opened file. + ''' def __init__(self, stream_handler, *nargs, **kwargs): - ''' - Free for all external file extractor that provides a stream to `stream_handler` - to do whatever is needed to extract data from an external file. Relies on `associated_file` - being present in the metadata. Note that the XMLExtractor has a built in trick to extract - data from external files (i.e. setting `external_file`), so you probably need that if your - external file is XML. - - Parameters: - folder -- folder where the file is located. - stream_handler -- function that will handle the opened file. 
- ''' super().__init__(*nargs, **kwargs) self.stream_handler = stream_handler From 36f3c22f57ea0479ab57d5d9e1c60037c5bc975f Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 1 Mar 2024 14:49:47 +0100 Subject: [PATCH 15/32] more documentation for xml extractor --- ianalyzer_readers/extract.py | 103 ++++++++++++++++++++++++----------- 1 file changed, 72 insertions(+), 31 deletions(-) diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index b0470fb..4f19e8e 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -11,7 +11,7 @@ import re import logging import traceback -from typing import Tuple +from typing import Any, Dict, Callable, Union, List, Pattern logger = logging.getLogger() @@ -174,7 +174,7 @@ class Constant(Extractor): value: the value that should be "extracted". ''' - def __init__(self, value, *nargs, **kwargs): + def __init__(self, value: Any, *nargs, **kwargs): self.value = value super().__init__(*nargs, **kwargs) @@ -193,7 +193,7 @@ class Metadata(Extractor): extracted. ''' - def __init__(self, key, *nargs, **kwargs): + def __init__(self, key: str, *nargs, **kwargs): self.key = key super().__init__(*nargs, **kwargs) @@ -244,48 +244,89 @@ class XML(Extractor): This extractor should be used in a `Reader` based on `XMLReader`. (Note that this includes the `HTMLReader`.) + The XML extractor has a lot of options. When deciding how to extract a value, it + usually makes sense to determine them in this order: + + - Choose whether to use the source file (default), or use an external XML file by + setting `external_file`. + - Choose where to start searching. The default searching point is the entry tag + for the document, but you can also start from the top of the document by setting + `toplevel`. For either of these tags, you can set `parent_level` to select + an ancestor to search from. For instance, `parent_level=1` will search from the + parent of the selected tag. 
+ - Choose the query to describe the tag(s) you need. Set `tag`, `recursive`, + `secondary_tag`. + - If you need to return _all_ matching tags, rather than the first match, set + `multiple=True`. + - If needed, set `transform_soup_func` to further modify the matched tag. For + instance, you could use built-in parameters to select a tag, and then add a + `transform_soup_func` to select a child from it with a more complex condition. + - Choose how to extract a value: set `attribute`, `flatten`, or `extract_soup_func` + if needed. + - The extracted value is a string, or the output of `extract_soup_func`. To further + transform it, add a function for `transform`. + Parameters: - tag: Tag to select. When this is a list, read as a path (e.g. to select - successive children; makes sense when `recursive=False`). Pass `None` if the - information is in an attribute of the current head of the tree. - parent_level: Whether to ascend the tree to find the indicated tag. Useful when - a part of the tree has been selected with `secondary_tag` - attribute: Which attribute, if any, to select - flatten: Flatten the text content of a non-text children? - toplevel: Tag to select for search: top-level or entry tag - recursive: Whether to search all descendants - multiple: Whether to abandon the search after the first element + tag: Tag to select. Can be: + + - a string + - a compiled regular expression (the output of `re.compile`). + - a list of strings or regular expression pattterns. In that case, it is read + as a path to select successive children. + - `None`, if the information is in an attribute of the current head of the + tree. + parent_level: If set, the extractor will ascend the tree before looking for the + indicated `tag`. Useful when you need to select information from a tag's + sibling or parent. + attribute: By default, the extractor will extract the text content of the tag. + Set this property to extract the value of an _attribute_ instead. 
+ flatten: When extracting the text content of a tag, `flatten` determines whether + the contents of non-text children are flattened. If `False`, only the direct + text content of the tag is extracted. This parameter does nothing if + `attribute=True` is set. + toplevel: If `True`, the extractor will search from the toplevel tag of the XML + document, rather than the entry tag for the document. + recursive: If `True`, the extractor will search for `tag` recursively. If `False`, + it only looks for direct children. + multiple: If `False`, the extractor will extract the first matching element. If + `True`, it will extract a list of all matching elements. secondary_tag: Whether the tag's content should match a given metadata field ('match') or string ('exact') + external_file: This property can be set to look through a secondary XML file + (usually one containing metadata). It requires that the passed metadata have an + `'external_file'` key that specifies the path to the file. This parameter + specifies the toplevel tag and entry level tag for that file; if set, the + extractor will extract this field from the external file instead of the current + source file. + transform_soup_func: A function to transform the soup directly after the tag is + selected, before further processing (attributes, flattening, etc) to extract + the value from it. Keep in mind that the soup passed could be `None` if no + matching tag is found.
+ extract_soup_func: A function to extract a value directly from the soup element, + instead of using the content string or an attribute. Keep in mind that the soup passed could be `None`. + `attribute` and `flatten` will do nothing if this property is set. ''' def __init__(self, - tag=[], - parent_level=None, - attribute=None, - flatten=False, - toplevel=False, - recursive=False, - multiple=False, - secondary_tag={ + tag: Union[str, Pattern, List[Union[str, Pattern]], None] =[], + parent_level: Union[int, None] = None, + attribute: Union[str, None] = None, + flatten: bool = False, + toplevel: bool = False, + recursive: bool = False, + multiple: bool = False, + secondary_tag: Dict = { 'tag': None, 'match': None, 'exact': None, }, - external_file={ + external_file: Dict = { 'xml_tag_toplevel': None, 'xml_tag_entry': None }, - transform_soup_func=None, - extract_soup_func=None, + transform_soup_func: Union[Callable, None] = None, + extract_soup_func: Union[Callable, None] = None, *nargs, **kwargs ): From c7f2d94c1f9e5a5869a3187d025f1d5544807774 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 1 Mar 2024 14:54:35 +0100 Subject: [PATCH 16/32] document kwargs in extractor subclasses --- ianalyzer_readers/extract.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index 4f19e8e..9282865 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -90,6 +90,7 @@ class Choice(Extractor): Parameters: *extractors: extractors to choose from. These should be listed in descending order of preference. + **kwargs: additional options to pass on to `Extractor`. ''' def __init__(self, *extractors: Extractor, **kwargs): @@ -117,6 +118,7 @@ class Combined(Extractor): Parameters: *extractors: extractors to combine. + **kwargs: additional options to pass on to `Extractor`. 
''' def __init__(self, *extractors: Extractor, **kwargs): @@ -148,7 +150,7 @@ class Backup(Extractor): Parameters: *extractors: extractors to use. These should be listed in descending order of preference. - + **kwargs: additional options to pass on to `Extractor`. ''' def __init__(self, *extractors: Extractor, **kwargs): self.extractors = list(extractors) @@ -172,6 +174,7 @@ class Constant(Extractor): Parameters: value: the value that should be "extracted". + **kwargs: additional options to pass on to `Extractor`. ''' def __init__(self, value: Any, *nargs, **kwargs): @@ -191,6 +194,7 @@ class Metadata(Extractor): Parameters: key: the key in the metadata dictionary that should be extracted. + **kwargs: additional options to pass on to `Extractor`. ''' def __init__(self, key: str, *nargs, **kwargs): @@ -214,6 +218,7 @@ class Pass(Extractor): Parameters: extractor: the extractor of which the value should be passed + **kwargs: additional options to pass on to `Extractor`. ''' def __init__(self, extractor: Extractor, *nargs, **kwargs): @@ -232,6 +237,9 @@ class Order(Extractor): implement some kind of counter in its `source2dicts` method. The `Reader` subclasses in this package all implement this, and so `Order` can safely be used in any of them. However, custom `Reader` subclasses may not support this extractor. + + Parameters: + **kwargs: additional options to pass on to `Extractor`. ''' def _apply(self, index=None, *nargs, **kwargs): @@ -306,6 +314,7 @@ class XML(Extractor): instead of using the content string or an attribute. Keep in mind that the soup passed could be `None`. `attribute` and `flatten` will do nothing if this property is set. + **kwargs: additional options to pass on to `Extractor`. ''' def __init__(self, @@ -477,6 +486,7 @@ class FilterAttribute(XML): Parameters: attribute_filter: Specify an attribute / value pair by which to select content + **kwargs: additional options to pass on to `XML`. 
''' def __init__(self, @@ -525,10 +535,12 @@ class CSV(Extractor): It should be used in readers based on `CSVReader` or `XLSXReader`. Parameters: - multiple: Boolean. If a document spans multiple rows, the extracted value for a field with - `multiple = True` is a list of the value in each row. If `multiple = False` (default), only the value - from the first row is extracted. - convert_to_none: optional, default is `['']`. Listed values are converted to `None`. If `None`/`False`, nothing is converted. + multiple: Boolean. If a document spans multiple rows, the extracted value for a + field with `multiple = True` is a list of the value in each row. If + `multiple = False` (default), only the value from the first row is extracted. + convert_to_none: optional, default is `['']`. Listed values are converted to + `None`. If `None`/`False`, nothing is converted. + **kwargs: additional options to pass on to `Extractor`. ''' def __init__(self, field, @@ -562,6 +574,7 @@ class ExternalFile(Extractor): Parameters: stream_handler: function that will handle the opened file. + **kwargs: additional options to pass on to `Extractor`. ''' def __init__(self, stream_handler, *nargs, **kwargs): From 45d8876f8f52b4c7bd11314d9a9e80896c9b5f9b Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 1 Mar 2024 14:59:27 +0100 Subject: [PATCH 17/32] document supported extractors on reader classes --- ianalyzer_readers/readers/csv.py | 2 ++ ianalyzer_readers/readers/html.py | 3 +++ ianalyzer_readers/readers/xlsx.py | 2 ++ ianalyzer_readers/readers/xml.py | 3 +++ 4 files changed, 10 insertions(+) diff --git a/ianalyzer_readers/readers/csv.py b/ianalyzer_readers/readers/csv.py index 2d9db31..c1090d0 100644 --- a/ianalyzer_readers/readers/csv.py +++ b/ianalyzer_readers/readers/csv.py @@ -27,6 +27,8 @@ class CSVReader(Reader): - one document per row (this is the default) - each document spans a number of consecutive rows. 
In this case, there should be a column that indicates the identity of the document. + + In addition to generic extractor classes, this reader supports the `CSV` extractor. ''' field_entry = None diff --git a/ianalyzer_readers/readers/html.py b/ianalyzer_readers/readers/html.py index ba0f1c3..089ba92 100644 --- a/ianalyzer_readers/readers/html.py +++ b/ianalyzer_readers/readers/html.py @@ -21,6 +21,9 @@ class HTMLReader(XMLReader): It is based on the XMLReader and supports the same options (`tag_toplevel` and `tag_entry`). + + In addition to generic extractor classes, this reader supports the `XML` and + `FilterAttribute` extractors. ''' def source2dicts(self, source: Source) -> Iterable[Document]: diff --git a/ianalyzer_readers/readers/xlsx.py b/ianalyzer_readers/readers/xlsx.py index 95de0c2..99f6150 100644 --- a/ianalyzer_readers/readers/xlsx.py +++ b/ianalyzer_readers/readers/xlsx.py @@ -24,6 +24,8 @@ class XLSXReader(Reader): column that indicates the identity of the document. The XLSXReader will only look at the _first_ sheet in each file. + + In addition to generic extractor classes, this reader supports the `CSV` extractor. ''' field_entry = None diff --git a/ianalyzer_readers/readers/xml.py b/ianalyzer_readers/readers/xml.py index 2bd064d..1260b70 100644 --- a/ianalyzer_readers/readers/xml.py +++ b/ianalyzer_readers/readers/xml.py @@ -41,6 +41,9 @@ class XMLReader(Reader): The Reader is suitable for datasets where each file should be extracted as a single document, or ones where each file contains multiple documents. + + In addition to generic extractor classes, this reader supports the `XML` and + `FilterAttribute` extractors. 
''' tag_toplevel: TagSpecification = None From 9e68defb59a71c061a2237465de47816d2f29920 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 1 Mar 2024 15:16:12 +0100 Subject: [PATCH 18/32] update NotImplementedError messages --- ianalyzer_readers/readers/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ianalyzer_readers/readers/core.py b/ianalyzer_readers/readers/core.py index df48bd8..8d6e703 100644 --- a/ianalyzer_readers/readers/core.py +++ b/ianalyzer_readers/readers/core.py @@ -76,7 +76,7 @@ def fields(self) -> List: NotImplementedError: This method needs to be implementd on child classes. It will raise an error by default. ''' - raise NotImplementedError('Reader missing fields') + raise NotImplementedError('Reader missing fields implementation') @property def fieldnames(self) -> List[str]: @@ -99,7 +99,7 @@ def sources(self, **kwargs) -> Iterable[Source]: NotImplementedError: This method needs to be implementd on child classes. It will raise an error by default. ''' - raise NotImplementedError('CorpusDefinition missing sources') + raise NotImplementedError('Reader missing sources implementation') def source2dicts(self, source: Source) -> Iterable[Document]: ''' @@ -119,7 +119,7 @@ def source2dicts(self, source: Source) -> Iterable[Document]: NotImplementedError: This method needs to be implemented on child classes. It will raise an error by default. 
''' - raise NotImplementedError('CorpusDefinition missing source2dicts') + raise NotImplementedError('Reader missing source2dicts implementation') def documents(self, sources:Iterable[Source]=None) -> Iterable[Document]: ''' From 73a88e623684c6f37f9d3abe280fa60dbd18d9cc Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Fri, 1 Mar 2024 16:42:09 +0100 Subject: [PATCH 19/32] add CSV example --- docs/examples.md | 412 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 411 insertions(+), 1 deletion(-) diff --git a/docs/examples.md b/docs/examples.md index 1b976a9..9f52584 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1 +1,411 @@ -# Examples \ No newline at end of file +# Examples + +## CSV reader + +This example will define a basic CSV reader. + +Our dataset is contained in a directory `~/data`, which contains a single file, `Hamlet.csv`, that contains the script for *Hamlet* by William Shakespeare. A shortened version of the file looks as follows: + +```csv +act,scene,character,line +# ... +"I","V","","SCENE V. A more remote part of the Castle." +"I","V","HAMLET","Whither wilt thou lead me? Speak, I'll go no further." +"I","V","GHOST","Mark me." +"I","V","HAMLET","I will." +"I","V","GHOST","My hour is almost come," +"I","V","GHOST","When I to sulph'rous and tormenting flames" +"I","V","GHOST","Must render up myself." +"I","V","HAMLET","Alas, poor ghost!" +"I","V","GHOST","Pity me not, but lend thy serious hearing" +"I","V","GHOST","To what I shall unfold." +"I","V","HAMLET","Speak, I am bound to hear." +# ... +``` + +Since this data is encoded as CSV, we can use the `CSVReader` as a base class for our reader: + +```python +from ianalyzer_readers.readers.csv import CSVReader + +class HamletReader(CSVReader): + pass +``` + +### File discovery + +Before we can use the `HamletReader`, some additional attributes must be implemented. First, we need to implement `data_directory` and `sources`. 
+ +```python +from ianalyzer_readers.readers.csv import CSVReader +import os + +class HamletReader(CSVReader): + data_directory = '~/data' + + def sources(self, **kwargs): + for filename in os.listdir(self.data_directory): + path = os.path.join(self.data_directory, filename) + yield path, {} +``` + +This states that our data is located in `~/data`. The method `sources()` specifies how we can discover files in the directory. Here, we just return the path for each file. + +Note that `sources()` includes some assumptions about the contents of the directory, which is why you need to define it for your dataset. For instance, this implementation assumes that all files in the data directory are actually CSV files that can be parsed by the reader, and returns all files. You could add a check for the file extension if that is not appropriate. + +### Defining fields + +Next, we need to define the fields that should be extracted for each document. The original CSV provides each line in the play, and lists the act, scene, character and the text of the line. We want to extract all those values. For good measure, we will also include the name of the play as a constant value. 
+ +```python +from ianalyzer_readers.readers.csv import CSVReader +from ianalyer_readers.readers.core import Field +from ianalyzer_readers.extract import CSV, Constant +import os + +class HamletReader(CSVReader): + data_directory = '~/data' + + play = Field( + name='play', + extractor=Constant('Hamlet') + ) + act = Field( + name='act', + extractor=CSV('act') + ) + scene = Field( + name='scene', + extractor=CSV('scene') + ) + character = Field( + name='character', + extractor=CSV('character') + ) + line = Field( + name='line', + extractor=CSV('line') + ) + fields = [play, act, scene, character, line] + + def sources(self, **kwargs): + for filename in os.listdir(self.data_directory): + path = os.path.join(self.data_directory, filename) + yield path, {} +``` + +At this point, we should be able to use our reader. We look at the output of `documents()` to see the extracted value: + +```python +reader = HamletReader() +docs = list(reader.documents()) +print(docs) +``` + +The example section would look like this in the output: + +```python +[ + # ... + { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': None, + 'line': 'SCENE V. A more remote part of the Castle.', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'HAMLET', + 'line': "Whither wilt thou lead me? 
Speak, I'll go no further.", + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'GHOST', + 'line': 'Mark me.', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'HAMLET', + 'line': 'I will.', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'GHOST', + 'line': 'My hour is almost come,', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'GHOST', + 'line': "When I to sulph'rous and tormenting flames", + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'GHOST', + 'line': 'Must render up myself.', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'HAMLET', + 'line': 'Alas, poor ghost!', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'GHOST', + 'line': 'Pity me not, but lend thy serious hearing', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'GHOST', + 'line': 'To what I shall unfold.', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'HAMLET', + 'line': 'Speak, I am bound to hear.', + } + # ... +] +``` + +### Tweaking extraction + +We can adjust the CSV extraction. + +#### Transforming values + +The `character` field returns the character's names in uppercase, e.g. `'HAMLET'` (how they appeared in the data). Say that we would prefer `'Hamlet'`; we can add a `transform` argument to the extractor for `character`. + +```python +def format_name(name): + if name: + return name.title() + +character = Field( + name='character', + extractor=CSV('character', transform=format_name) +) +``` + +The check `if name` is needed because the character can also be `None`. If a name is provided, it will be converted to title case. + +Now the character names in the output will be `'Hamlet'` and `'Ghost'`. + +#### Grouping rows + +Instead of returning documents of a single line, we would like the reader to group multiple lines spoken by the same character. 
+ +This can be done by setting the attribute `field_entry = 'character'` on the `HamletReader` class. This will instruct the reader to group consecutive rows with the same value in the `character` column. (Note that this refers to the name of the _column_ in the CSV, not the field `character` that we defined!) + +For grouped rows, the default behaviour for the `CSV` extractor is to extract its value from the first row. This makes sense for `character`, `act`, and `scene`, but not for `line`. + +We need to adjust the extractor for `line` so it extracts all lines instead. For clarity, let's rename it to `lines`. + +```python +lines = Field( + name='lines', + extractor=CSV('line', multiple=True, transform='\n'.join) +) +``` + +We add `multiple=True` to select all lines, and add a `transform` argument to the lines are formatted into a single string with linebreaks. + +At this point, the reader class should look like this: + +```python +from ianalyzer_readers.readers.csv import CSVReader +from ianalyer_readers.readers.core import Field +from ianalyzer_readers.extract import CSV, Constant +import os + +def format_name(name): + if name: + return name.title() + +class HamletReader(CSVReader): + data_directory = '~/data' + + field_entry = 'character' + + play = Field( + name='play', + extractor=Constant('Hamlet') + ) + act = Field( + name='act', + extractor=CSV('act') + ) + scene = Field( + name='scene', + extractor=CSV('scene') + ) + character = Field( + name='character', + extractor=CSV('character', transform=format_name) + ) + lines = Field( + name='lines', + extractor=CSV('line', multiple=True, transform='\n'.join) + ) + fields = [play, act, scene, character, lines] + + def sources(self, **kwargs): + for filename in os.listdir(self.data_directory): + path = os.path.join(self.data_directory, filename) + yield path, {} +``` + +Its output should look like this: + +```python +[ + # ... 
+ { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': None, + 'lines': 'SCENE V. A more remote part of the Castle.', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'Hamlet', + 'lines': "Whither wilt thou lead me? Speak, I'll go no further.", + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'Ghost', + 'lines': 'Mark me.', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'Hamlet', + 'lines': 'I will.', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'Ghost', + 'lines': "My hour is almost come,\nWhen I to sulph'rous and tormenting flames\nMust render up myself.", + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'Hamlet', + 'lines': 'Alas, poor ghost!', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'Ghost', + 'lines': 'Pity me not, but lend thy serious hearing\nTo what I shall unfold.', + }, { + 'play': 'Hamlet', + 'act': 'I', + 'scene': 'V', + 'character': 'Hamlet', + 'lines': 'Speak, I am bound to hear.', + } + # ... +] +``` + +### Adding metadata + +Our `HamletReader` used a constant to return the name of the play. If we add more plays to our dataset, this won't work anymore. + +Let's say we add some more files in `~/data`, named `Othello.csv`, `The Tragedy of King Lear.csv`, etc. + +Let's turn our `HamletReader` into a `ShakespeareReader` that will assign the correct title of the play. + +Our `sources()` function was already written to yield every file in the data directory, which is what we want. However, the way our CSV files are structured, we will need to find the name of the play at this stage, because it won't be available as a column in the CSV. + +```python +from ianalyzer_readers.readers.csv import CSVReader +# ... +import os + +# ... + +class ShakespeareReader(CSVReader): + + # ...
+ + def sources(self, **kwargs): + for filename in os.listdir(self.data_directory): + path = os.path.join(self.data_directory, filename) + name, ext = os.path.splitext(filename) + yield path, {'title': name} +``` + +Now we can change the extractor for `play` so it gets the title from the metadata: + +```python +from ianalyzer_readers.extract import Metadata + +play = Field( + name='play', + extractor=Metadata('title') +) +``` + +So we end up with the following reader: + +```python +from ianalyzer_readers.readers.csv import CSVReader +from ianalyzer_readers.readers.core import Field +from ianalyzer_readers.extract import CSV, Metadata +import os + +def format_name(name): + if name: + return name.title() + +class ShakespeareReader(CSVReader): + data_directory = '~/data' + + field_entry = 'character' + + play = Field( + name='play', + extractor=Metadata('title') + ) + act = Field( + name='act', + extractor=CSV('act') + ) + scene = Field( + name='scene', + extractor=CSV('scene') + ) + character = Field( + name='character', + extractor=CSV('character', transform=format_name) + ) + lines = Field( + name='lines', + extractor=CSV('line', multiple=True, transform='\n'.join) + ) + fields = [play, act, scene, character, lines] + + def sources(self, **kwargs): + for filename in os.listdir(self.data_directory): + path = os.path.join(self.data_directory, filename) + name, ext = os.path.splitext(filename) + yield path, {'title': name} +``` + +For `Hamlet.csv`, the `ShakespeareReader` will extract the same output as `HamletReader`, but it will also return the correct title for other plays.
From 4b944fceb205f261866f5e1e712f7d43fb5230e3 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 4 Mar 2024 15:57:52 +0100 Subject: [PATCH 20/32] add usage document --- docs/usage.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 3 +- 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 docs/usage.md diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 0000000..9237834 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,77 @@ +# Usage + +The intended use of this package is to extract information from a dataset. + +## Core concepts + +### Datasets + +This package is designed for datasets that consist of one or more __source files__, which contain _structured_ information. For example, a dataset could be: + +- a folder with XML files that all have the same structure +- an API where you can request JSON documents +- a single XLSX file with a data table in it + +What is important is that you know the structure of these files, so you can describe (in Python) where to find a particular piece of information in each file. + +### Documents + +We assume that the intended output of your data extraction is a collection of __documents__. Each document is a simple dictionary that contains the information you extracted. + +For example, you might extract the following two documents from a bibliographical dataset: + +```python +[ + { + 'title': 'Pride and Prejudice', + 'author': 'Jane Austen', + 'year': 1813, + }, { + 'title': 'Emma', + 'author': 'Jane Austen', + 'year': 1816, + } +] +``` + +Depending on the file type of your source files, you may extract one document per source file, or multiple. + +### Readers + +A __reader__ is the Python object that can take a dataset, and process it into documents. + +Readers are usually custom-made for a specific dataset.
They describe exactly where to find the dataset's files, how those files are structured into documents, what fields of information each document should contain, and how to get that information from the source file. + +`ianalyzer_readers` provides base classes that will provide a lot of basic functionality, so you don't have to start from scratch for each dataset. + +### Fields + +Documents are dictionaries that give a value for each __field__ you want to extract. In the example documents above, `title`, `author`, and `year` are the fields. + +A reader class has fields attached to it. When the reader processes a document, it will extract the value for each field. + +### Extractors + +Each _field_ has an __extractor__: this is an object that finds the value of that field for a document. + +When a reader extracts a document, it passes a lot of information on to each extractor, which uses that to find the information it needs. Not all readers pass exactly the same information. For example, the `XMLReader` passes a parsed XML that the extractor can query. + +This means that _some extractors are only supported by specific readers_. The `XML` extractor can be used to query an XML tree - so an `XMLReader` can give it the information it needs, but a `CSVReader` cannot. + +Other extractors are _generic_: they are supported by all readers. + +Extractors may contain _other extractors_. Generic extractors often do: they are used to apply logic to extractors. For instance, the `Backup` extractor allows you to try one extractor, and provide a "backup" extractor if it doesn't return a value. + +## Why use this package? + +You will generally use `ianalyzer_readers` in places where you might also write a custom Python script to process your dataset. In some situations, that would be appropriate. We'll go over some reasons why `ianalyzer_readers` may be preferable in your situation.
### All readers have the same API + +If you write one reader for an XML dataset, and one reader for a CSV dataset, you will write different underlying functions for each of them. The definition of the reader is going to depend on the type of data it needs to read. + +However, once those readers are done, they will have the same API. For example, you can write a module that takes a reader as input, asks it to extract documents, and saves those documents in a database. That module won't need to check whether the reader is working with XML or CSV data: as far as it's concerned, all readers are the same. + +### High-level interface + +`ianalyzer_readers` uses generic libraries like `csv` and `beautifulsoup4`, but is designed for a narrower set of use cases. That means that if you're trying to do what the library is designed for, it's able to provide a more high-level interface. diff --git a/mkdocs.yml b/mkdocs.yml index 8e560cf..608c344 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,8 +4,9 @@ plugins: - mkdocstrings nav: - 'index.md' - - 'api.md' + - 'usage.md' - 'examples.md' + - 'api.md' watch: - docs - ianalyzer_readers From 84a64442b0c3da5e30c260c860209c91bd069131 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 4 Mar 2024 15:59:45 +0100 Subject: [PATCH 21/32] add module names to API documentation --- docs/api.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/api.md b/docs/api.md index 3225af9..eee3e38 100644 --- a/docs/api.md +++ b/docs/api.md @@ -2,24 +2,36 @@ ## Core classes +__Module:__ `ianalyzer_readers.readers.core` + ::: ianalyzer_readers.readers.core ## CSV reader +__Module:__ `ianalyzer_readers.readers.csv` + ::: ianalyzer_readers.readers.csv ## XLSX reader +__Module:__ `ianalyzer_readers.readers.xlsx` + ::: ianalyzer_readers.readers.xlsx ## XML reader +__Module:__ `ianalyzer_readers.readers.xml` + ::: ianalyzer_readers.readers.xml ## HTML reader +__Module:__ `ianalyzer_readers.readers.html` + :::
ianalyzer_readers.readers.html ## Extractors +__Module:__ `ianalyzer_readers.extract` + ::: ianalyzer_readers.extract From c747bbe9db45682c613437cfe05ec3c951d48633 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 4 Mar 2024 16:16:29 +0100 Subject: [PATCH 22/32] more typedefs --- ianalyzer_readers/extract.py | 38 +++++++++---------- ianalyzer_readers/readers/core.py | 63 +++++++++++++++---------------- tests/mock_csv_corpus.py | 4 +- 3 files changed, 52 insertions(+), 53 deletions(-) diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index 9282865..0b7b4f7 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -11,7 +11,7 @@ import re import logging import traceback -from typing import Any, Dict, Callable, Union, List, Pattern +from typing import Any, Dict, Callable, Union, List, Pattern, Optional logger = logging.getLogger() @@ -29,9 +29,8 @@ class Extractor(object): ''' def __init__(self, - applicable=None, # Predicate that takes metadata and decides whether - # this extractor is applicable. None means always. 
- transform=None # Optional function to postprocess extracted value + applicable: Optional[Callable[[Dict], bool]] = None, + transform: Optional[Callable] = None ): self.transform = transform self.applicable = applicable @@ -97,7 +96,7 @@ def __init__(self, *extractors: Extractor, **kwargs): self.extractors = list(extractors) super().__init__(**kwargs) - def _apply(self, metadata, *nargs, **kwargs): + def _apply(self, metadata: Dict, *nargs, **kwargs): for extractor in self.extractors: if extractor.applicable is None or extractor.applicable(metadata): return extractor.apply(metadata=metadata, *nargs, **kwargs) @@ -201,7 +200,7 @@ def __init__(self, key: str, *nargs, **kwargs): self.key = key super().__init__(*nargs, **kwargs) - def _apply(self, metadata, *nargs, **kwargs): + def _apply(self, metadata: Dict, *nargs, **kwargs): return metadata.get(self.key) class Pass(Extractor): @@ -242,7 +241,7 @@ class Order(Extractor): **kwargs: additional options to pass on to `Extractor`. ''' - def _apply(self, index=None, *nargs, **kwargs): + def _apply(self, index: int = None, *nargs, **kwargs): return index class XML(Extractor): @@ -318,9 +317,9 @@ class XML(Extractor): ''' def __init__(self, - tag: Union[str, Pattern, List[Union[str, Pattern]], None] =[], - parent_level: Union[int, None] = None, - attribute: Union[str, None] = None, + tag: Union[str, Pattern, List[Union[str, Pattern]], None] = [], + parent_level: Optional[int] = None, + attribute: Optional[str] = None, flatten: bool = False, toplevel: bool = False, recursive: bool = False, @@ -334,8 +333,8 @@ def __init__(self, 'xml_tag_toplevel': None, 'xml_tag_entry': None }, - transform_soup_func: Union[Callable, None] = None, - extract_soup_func: Union[Callable, None] = None, + transform_soup_func: Optional[Callable] = None, + extract_soup_func: Optional[Callable] = None, *nargs, **kwargs ): @@ -490,7 +489,7 @@ class FilterAttribute(XML): ''' def __init__(self, - attribute_filter={ + attribute_filter: Dict = { 
'attribute': None, 'value': None}, *nargs, @@ -535,7 +534,8 @@ class CSV(Extractor): It should be used in readers based on `CSVReader` or `XLSXReader`. Parameters: - multiple: Boolean. If a document spans multiple rows, the extracted value for a + column: The name of the column from which to extract the value. + multiple: If a document spans multiple rows, the extracted value for a field with `multiple = True` is a list of the value in each row. If `multiple = False` (default), only the value from the first row is extracted. convert_to_none: optional, default is `['']`. Listed values are converted to @@ -543,11 +543,11 @@ class CSV(Extractor): **kwargs: additional options to pass on to `Extractor`. ''' def __init__(self, - field, - multiple=False, - convert_to_none = [''], + column: str, + multiple: bool = False, + convert_to_none: List[str] = [''], *nargs, **kwargs): - self.field = field + self.field = column self.multiple = multiple self.convert_to_none = convert_to_none or [] super().__init__(*nargs, **kwargs) @@ -577,7 +577,7 @@ class ExternalFile(Extractor): **kwargs: additional options to pass on to `Extractor`. ''' - def __init__(self, stream_handler, *nargs, **kwargs): + def __init__(self, stream_handler: Callable, *nargs, **kwargs): super().__init__(*nargs, **kwargs) self.stream_handler = stream_handler diff --git a/ianalyzer_readers/readers/core.py b/ianalyzer_readers/readers/core.py index 8d6e703..7e7c20c 100644 --- a/ianalyzer_readers/readers/core.py +++ b/ianalyzer_readers/readers/core.py @@ -33,6 +33,34 @@ the Reader's `fields`, and the values are based on the extractor of each field. ''' +class Field(object): + ''' + Fields are the elements of information that you wish to extract from each document. + + Parameters: + name: a short hand name (name), which will be used as its key in the document + extractor: an Extractor object that defines how this field's data can be + extracted from source documents. + required: whether this field is required. 
The `Reader` class should skip the + document is the value for this Field is `None`, though this is not supported + for all readers. + skip: if `True`, this field will not be included in the results. + ''' + + def __init__(self, + name: str, + extractor: extract.Extractor = extract.Constant(None), + required: bool = False, + skip: bool = False, + **kwargs + ): + + self.name = name + self.extractor = extractor + self.required = required + self.skip = skip + + class Reader(object): ''' A base class for readers. Readers are objects that can generate documents @@ -66,7 +94,7 @@ def data_directory(self) -> str: @property - def fields(self) -> List: + def fields(self) -> List[Field]: ''' The list of fields that are extracted from documents. @@ -121,7 +149,7 @@ def source2dicts(self, source: Source) -> Iterable[Document]: ''' raise NotImplementedError('Reader missing source2dicts implementation') - def documents(self, sources:Iterable[Source]=None) -> Iterable[Document]: + def documents(self, sources:Iterable[Source] = None) -> Iterable[Document]: ''' Returns an iterable of extracted documents from source files. @@ -142,7 +170,7 @@ class will use the value of `self.sources()` instead. ) ) - def _reject_extractors(self, *inapplicable_extractors): + def _reject_extractors(self, *inapplicable_extractors: extract.Extractor): ''' Raise errors if any fields use any of the given extractors. @@ -157,32 +185,3 @@ def _reject_extractors(self, *inapplicable_extractors): if isinstance(field.extractor, inapplicable_extractors): raise RuntimeError( "Specified extractor method cannot be used with this type of data") - -# Fields ###################################################################### - -class Field(object): - ''' - Fields are the elements of information that you wish to extract from each document. 
- - Parameters: - name: a short hand name (name), which will be used as its key in the document - extractor: an Extractor object that defines how this field's data can be - extracted from source documents. - required: whether this field is required. The `Reader` class should skip the - document is the value for this Field is `None`, though this is not supported - for all readers. - skip: if `True`, this field will not be included in the results. - ''' - - def __init__(self, - name=None, - extractor=extract.Constant(None), - required=False, - skip=False, - **kwargs - ): - - self.name = name - self.extractor = extractor - self.required = required - self.skip = skip diff --git a/tests/mock_csv_corpus.py b/tests/mock_csv_corpus.py index 31ed7fe..689e979 100644 --- a/tests/mock_csv_corpus.py +++ b/tests/mock_csv_corpus.py @@ -22,12 +22,12 @@ def sources(self, **kwargs): fields = [ Field( name='character', - extractor=CSV(field='character') + extractor=CSV(column='character') ), Field( name='lines', extractor=CSV( - field='line', + column='line', multiple=True, ) ) From c7ec435ca8624a8fd90fc5ca667b9a5ee24b989c Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 4 Mar 2024 16:42:38 +0100 Subject: [PATCH 23/32] add basic XML test --- tests/test_xml_reader.py | 52 ++++++++++++++++++++++++++++++++++++ tests/xml_example/hamlet.xml | 37 +++++++++++++++++++++++++ tests/xml_reader.py | 39 +++++++++++++++++++++++++++ 3 files changed, 128 insertions(+) create mode 100644 tests/test_xml_reader.py create mode 100644 tests/xml_example/hamlet.xml create mode 100644 tests/xml_reader.py diff --git a/tests/test_xml_reader.py b/tests/test_xml_reader.py new file mode 100644 index 0000000..bc54fc8 --- /dev/null +++ b/tests/test_xml_reader.py @@ -0,0 +1,52 @@ +from .xml_reader import HamletXMLReader + +target_documents = [ + { + 'title': 'Hamlet', + 'character': 'HAMLET', + 'lines': "Whither wilt thou lead me? Speak, I\'ll go no further." 
+ }, + { + 'title': 'Hamlet', + 'character': 'GHOST', + 'lines': "Mark me." + }, + { + 'title': 'Hamlet', + 'character': 'HAMLET', + 'lines': "I will." + }, + { + 'title': 'Hamlet', + 'character': 'GHOST', + 'lines': + "My hour is almost come,\n" + "When I to sulph\'rous and tormenting flames\n" + "Must render up myself." + }, + { + 'title': 'Hamlet', + 'character': 'HAMLET', + 'lines': "Alas, poor ghost!" + }, + { + 'title': 'Hamlet', + 'character': 'GHOST', + 'lines': + "Pity me not, but lend thy serious hearing\n" + "To what I shall unfold." + }, + { + 'title': 'Hamlet', + 'character': 'HAMLET', + 'lines': "Speak, I am bound to hear." + }, +] + + +def test_xml(): + reader = HamletXMLReader() + docs = reader.documents() + + for doc, target in zip(docs, target_documents): + assert doc == target diff --git a/tests/xml_example/hamlet.xml b/tests/xml_example/hamlet.xml new file mode 100644 index 0000000..3086e92 --- /dev/null +++ b/tests/xml_example/hamlet.xml @@ -0,0 +1,37 @@ + + + + Hamlet + William Shakespeare + + + + + + Whither wilt thou lead me? Speak, I'll go no further. + + + Mark me. + + + I will. + + + My hour is almost come, + When I to sulph'rous and tormenting flames + Must render up myself. + + + Alas, poor ghost! + + + Pity me not, but lend thy serious hearing + To what I shall unfold. + + + Speak, I am bound to hear. 
+ + + + + \ No newline at end of file diff --git a/tests/xml_reader.py b/tests/xml_reader.py new file mode 100644 index 0000000..7162cc5 --- /dev/null +++ b/tests/xml_reader.py @@ -0,0 +1,39 @@ +import os + +from ianalyzer_readers.readers.xml import XMLReader +from ianalyzer_readers.readers.core import Field +from ianalyzer_readers.extract import XML + +here = os.path.abspath(os.path.dirname(__file__)) + +class HamletXMLReader(XMLReader): + """ + Example XML reader for testing + """ + + data_directory = os.path.join(here, 'xml_example') + + tag_toplevel = 'document' + tag_entry = 'lines' + + def sources(self, **kwargs): + for filename in os.listdir(self.data_directory): + full_path = os.path.join(self.data_directory, filename) + yield full_path, { + 'filename': filename + } + + title = Field( + 'title', + XML('title', toplevel=True, recursive=True) + ) + character = Field( + 'character', + XML(None, attribute='character') + ) + lines = Field( + 'lines', + XML('l', multiple=True, transform='\n'.join), + ) + + fields = [title, character, lines] \ No newline at end of file From 3f7ad29608608ce405f1a5296db1987792243ced Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 4 Mar 2024 17:39:52 +0100 Subject: [PATCH 24/32] add test for html reader --- tests/html_example/hamlet.html | 48 ++++++++++++++++++++++++++++++ tests/html_reader.py | 39 ++++++++++++++++++++++++ tests/test_html_reader.py | 54 ++++++++++++++++++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 tests/html_example/hamlet.html create mode 100644 tests/html_reader.py create mode 100644 tests/test_html_reader.py diff --git a/tests/html_example/hamlet.html b/tests/html_example/hamlet.html new file mode 100644 index 0000000..882a8b8 --- /dev/null +++ b/tests/html_example/hamlet.html @@ -0,0 +1,48 @@ + + + + + + Hamlet, by William Shakespeare + + +

Hamlet, Prince of Denmark

+
+

Act I

+
+

Scene V

+

+ HAMLET
+ Whither wilt thou lead me? Speak, I'll go no further. +

+

+ GHOST
+ Mark me. +

+

+ HAMLET
+ I will. +

+

+ GHOST
+ My hour is almost come,
+ When I to sulph'rous and tormenting flames
+ Must render up myself. +

+

+ HAMLET
+ Alas, poor ghost! +

+

+ GHOST
+ Pity me not, but lend thy serious hearing
+ To what I shall unfold. +

+

+ HAMLET
+ Speak, I am bound to hear. +

+
+
+ + \ No newline at end of file diff --git a/tests/html_reader.py b/tests/html_reader.py new file mode 100644 index 0000000..8f94a5f --- /dev/null +++ b/tests/html_reader.py @@ -0,0 +1,39 @@ +import os + +from ianalyzer_readers.readers.html import HTMLReader +from ianalyzer_readers.readers.core import Field +from ianalyzer_readers.extract import XML + +here = os.path.abspath(os.path.dirname(__file__)) + +class HamletHTMLReader(HTMLReader): + """ + Example XML reader for testing + """ + + data_directory = os.path.join(here, 'html_example') + + tag_toplevel = 'body' + tag_entry = 'p' + + def sources(self, **kwargs): + for filename in os.listdir(self.data_directory): + full_path = os.path.join(self.data_directory, filename) + yield full_path, { + 'filename': filename + } + + title = Field( + 'title', + XML('h1', toplevel=True) + ) + character = Field( + 'character', + XML('b') + ) + lines = Field( + 'lines', + XML(None, flatten=True), + ) + + fields = [title, character, lines] \ No newline at end of file diff --git a/tests/test_html_reader.py b/tests/test_html_reader.py new file mode 100644 index 0000000..3a02124 --- /dev/null +++ b/tests/test_html_reader.py @@ -0,0 +1,54 @@ +from .html_reader import HamletHTMLReader + +target_documents = [ + { + 'title': 'Hamlet, Prince of Denmark', + 'character': 'HAMLET', + 'lines': "HAMLET \n Whither wilt thou lead me? Speak, I\'ll go no further." + }, + { + 'title': 'Hamlet, Prince of Denmark', + 'character': 'GHOST', + 'lines': "GHOST \n Mark me." + }, + { + 'title': 'Hamlet, Prince of Denmark', + 'character': 'HAMLET', + 'lines': "HAMLET \n I will." + }, + { + 'title': 'Hamlet, Prince of Denmark', + 'character': 'GHOST', + 'lines': + "GHOST \n " + "My hour is almost come,\n " + "When I to sulph\'rous and tormenting flames\n " + "Must render up myself." + }, + { + 'title': 'Hamlet, Prince of Denmark', + 'character': 'HAMLET', + 'lines': "HAMLET \n Alas, poor ghost!" 
+ }, + { + 'title': 'Hamlet, Prince of Denmark', + 'character': 'GHOST', + 'lines': + "GHOST \n " + "Pity me not, but lend thy serious hearing\n " + "To what I shall unfold." + }, + { + 'title': 'Hamlet, Prince of Denmark', + 'character': 'HAMLET', + 'lines': "HAMLET \n Speak, I am bound to hear." + }, +] + + +def test_html(): + reader = HamletHTMLReader() + docs = reader.documents() + + for doc, target in zip(docs, target_documents): + assert doc == target From 9b7d1b4544a2eda5238afdc235419c879acec52f Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 4 Mar 2024 17:55:50 +0100 Subject: [PATCH 25/32] add xlsx reader test --- tests/test_xlsx_reader.py | 45 +++++++++++++++++++++++++++++++++ tests/xlsx_example/hamlet.xlsx | Bin 0 -> 5643 bytes tests/xlsx_reader.py | 28 ++++++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 tests/test_xlsx_reader.py create mode 100644 tests/xlsx_example/hamlet.xlsx create mode 100644 tests/xlsx_reader.py diff --git a/tests/test_xlsx_reader.py b/tests/test_xlsx_reader.py new file mode 100644 index 0000000..0b1422c --- /dev/null +++ b/tests/test_xlsx_reader.py @@ -0,0 +1,45 @@ +from .xlsx_reader import HamletXLSXReader + +target_documents = [ + { + 'character': 'HAMLET', + 'lines': "Whither wilt thou lead me? Speak, Iʼll go no further." + }, + { + 'character': 'GHOST', + 'lines': "Mark me." + }, + { + 'character': 'HAMLET', + 'lines': "I will." + }, + { + 'character': 'GHOST', + 'lines': + "My hour is almost come,\n" + "When I to sulphʼrous and tormenting flames\n" + "Must render up myself." + }, + { + 'character': 'HAMLET', + 'lines': "Alas, poor ghost!" + }, + { + 'character': 'GHOST', + 'lines': + "Pity me not, but lend thy serious hearing\n" + "To what I shall unfold." + }, + { + 'character': 'HAMLET', + 'lines': "Speak, I am bound to hear." 
+ }, +] + + +def test_xlsx(): + reader = HamletXLSXReader() + docs = reader.documents() + + for doc, target in zip(docs, target_documents): + assert doc == target diff --git a/tests/xlsx_example/hamlet.xlsx b/tests/xlsx_example/hamlet.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..f695c9deca6e4617d41c0ab3e11e9e6a729ee8b9 GIT binary patch literal 5643 zcmaJ_1z1$w)@JBt2nnTAdZa_7kuGTjhHi##h7?gGMUfFix&@@up&JS5h9N{6l#sZC z-~F##|NDLGJZI*d^PIKz*=xOf?RPCr6*P1*6l`p4l>JC61C$$rhrBj(w|4U2<+=VX zNo-MX1K~&R20Z6D7Mzp9&d6<0Rcv4opoKHL+^Kq&)gFHELynXLKjvdwtQ$1=(jsSG z)rD&w#HwQYOxN%kY7?tvP+oiArJEnG&_E`%lUU@THbj=gJx)XUsDDYiH#@*HUKw}e zP)%hRux`0!g(4yK)ZIkahwsuVYAwQW>jjCMtzPD2FnVKznzN&zixM5R{{#MIea4pz z-rdToS%G;XjU)utk+^>I)i=@7fIu91e4U+Q9`(4)fPhWMoapn;+@=n9UZd&9#-q#u^LkPe704Z*lXdUE z&b*o@w$5RnmL(Dr#an1h@iUrFb^J=xVvwQR-)auOuUW?vG2O8>V=4CL-e*RAjASF} zejf1Dz~ z0*2Qi7V+ei4&UT8`QuvCo?elt> zS8NxraLY4{ObnNkPwB;gj3#`RB0+Y`Jbx~AjTeZ&%69fXN@miXPZK(VAzh5KeLL7> zaBMzAPM!IwG_bnNwKI-<&QCY-i|3V!nzj|D*o6rDy_#? 
z$6A&AQQ*Dwrl6+@<#IL=oJx7MxtMfp5DQC3_0KBG+^87#xwVzm{&eW9+y(SD8oD|% zDjzE8$qVbTRdd*TUNGU%zG!nL!ER+%x2Oa1n-eIH$rzOlOLt2?utia4e9<^t&1s%&|rf-r6m!wn|yJO(ct^A>4U}(U>+7O9^TQTpK7}@ zFZ~4`Y!Jh+UIY|)Vn)|}ioYMhr+~|X^C>@9gl$%HR+?@s=5eS}zrBmJK}DvgMo}UB znL!acnI5%prTz{sC{S5c(noy7xEcBrD3^SC@rvs$u9L`m6Ssd(wM^mj3I0_%Siu4O zX>K2i*ivzP*!)-D+*dgT&O+X3Z1)Vk>7+)IlgDO+B`TYd-`NsuiRdsFA++LXuJ6V6UqS6eI2)Bl^6}YC2+Kr|f zrTc_$wq0cF^)0Txq1x@gZ(+02RK;KMf5)>FyWqD__N-yOx@XnB1HNx(sV{8#rLRtQ zhz^@P0xQ4XgQhI_%pV^nkDE=WK!L_bm6Puvt!ADM8jqcV-%T@J(O${hLG$(Fr#KDl zrGFNDGXZ<1H{tALbej;W?Nj*Rbp^DT@KDyMo*B8OyB0>!7hPtB1I(|qP|s*DHxyq2 z><{SeGX!7R&$$s|rEch_wixVUHnu(WwbjW3VbQGGN$u-D`UcAQ*bGvGqF#gEoCt1G zk4%zYxBdzAl)n?Dhn=;xrw8w^zkApCZ_=M}nGpdZ@ee#2$}92O7cAIfbc9r6j4&&X zaF;>0Sxko%$a*-^^)0T|PZ0maP}Hf9og#vp;2dB%gyw&=SI^cg%2 zOs--+ykUo9s!aHckT9Q*hg;n*c6k^+B{ELc@^^khSNee62G-3ij`0bsK^);>1B8nj z_K7e%Zimyq#pi2mEpRa5sBG>c>%wK5^AOI-1R|j6rJ~L>%p&qEx?&jEUGkO-GMygD z_M+^yh$n>Q?MV)>52TJ;`g@X8b%Xrtr$I07Mv8UpCJP_`v6Y#-&ZsG_K zJEwovOJ4y7y*q28r>>EaHf#oSi)}@s`BtQ7P{-u(pp+&xw;HPgAS9d9^iuLe0mQ+? 
zp@k!XjzR1k*&o2$0z5g(erX1}SudDzY#z43-iZ3c8Z^^WL5G}UrW1-qCUsOXq3Oi= z&mT8cDQ3tspA3ff0R=*fm}d;IE}NJ+jyS{KoRCX^OWM8`v8d^YaJAH%oFvqY;fzN~ zI?3&klvffjskHYh;2TF^^&6ipiH{PgSwPv!wc?s%D33R0qLh#qZ)o!*18vXRULHBi zr6X^@BcE5S96j7s4fafIj@cNkA->q0iGoWG2A|_zE%`3S1y`Z0yhNRhYDN*tG`qqj z_Qt@D#RK`RB?Updd)mcT#yc;QFc$k%y8Vm8b|3W5Na zD_6lk1}VVaiJxLN4v|CPftkXz;yz8-W8g}^sb}vGT(laBm(d8e0}DT0tAd2G);sIg z(tl26mrDkDf`{#$!0y`2)xwR1q*sycky2m>qsPIZG_@|Tg#ZH?TRYZLHxXw^xck5@ zos~&-o3qXE@Fb5h!+>ovP4b~Ya)a5IwsvWQQg!NCj=ABv`o+o4J{w}w9Wfv`l4eUH zDjQ)qIZp(=)VS(nkD5gp9J%);CXdJ@P0J7u6?DX1ML=d=u%DFvvn-##Z?Jj=50=Gp zW{;SB_ev>s8>@OnV)kR;Lc16j^$HE9_$WM_WeuYxR|%Dyqlza+L0x8qY_mOSKUlHi z-U4$^`b-;Vd6aLWj%EKcT5q(5%HhS5ZO>R97UlbF4}!K~n$N^PF9;!EF_N*|7L1J3 zx~RtRJKN>TYgc|>U?OX$mnlP_*59Cs*TS-ucydLdnUaRTVnS{`Z_N*u4cg7Vxk#G@!Bh4gJZ)teCwvZ3bn3)aju0I|GXQo6Gx^>QccuXE0ry^HNHdK2?F+f3h^%dh&4bUA-`A3T_ClUH z;l~*9>f?eu%WIat~s>2`+9;fx2p8`Av$t9&-3ZzzkK%&KKyDBF@L)c zlk~n(7o63`l@G928`>6i1_d;%v#l?ZG00K6!@gQ;ZSqc-5m}Q2r0wKb8BWh_ZT3!H zpu6Xwzm^iGTV?kiURnNL^nG)sqyclSi!AEn~X{NN(px1j5 z8ObiI#OHDwk$f>Fqm?1|I?>sf$a$kPn+@M(bB_3!!cA;a`nhabf$+hPm|i4(74!F6 zvxd2Z8MCaxqWHt#t)ane1k-J4fyH$g>xioe{ey&=g=^HuCz05}ken40iTuAuJux!2 zRuD@~cZjP8uO-CY`d6i+-Qv;;!f)E;ES#OGB&#lPh;C=~2H20h5!!yul516IpOj5@ z3AN9lxi0R&)c z!tAebN^adRG796PuC#p;p`PDA)Jd5FF8+$ik%`s^B!4_n$k>@%2u~d1W=Dx@-6Pir zV>3lAst+Zr6Q?m#c_n_-vgU=~jU?d!Dy-a%+9R3>DLfj(l5p=Vt`$|RRi=^Zns?lZ za-P9-o_(}^P7F2UG4(HcKryGUCFSRTn>hnF3b1AvikNhfnqPj}HAcOfU)$E9A*N>%iB?Vly`UQnCr8|D=qnkblmK>v=3>|b<6ye=kv9N4Y!|6Hznm%^AB)AS3&BKL=HNbvy+K)Exg} z_Z%-Nlvt0WqG(wE@ExxfIiqmm@wik>EN}=WF{}ZhImxHDdIpw>`EjrQ7K8Mb^kK-O znQV32rnP&opbwoU*Z~kd{M)L`)>=B~o!AOt{d2dXc-qGp9^k&xOWnbnS=%6q8^)~s z=7+A(!|m*Z2+9&D^Ghtn2y*+NAJ&Dyp2?>pqFcXZ6>94n+Sj9yV(#=QS=U&?)(6em ze~Chlxx(T=EZ=mEh2^-6)GweMrnX+I=tg@oaks9n_Yw(gEbKyxg{U*6SFXYS_dqAT z={HL+4^N2mFV_v0F*-^L&2-&zH7dawIEj^21+KBh7m|P9s6OWlNqIv+)T=b5jd1MYb zlY@%{$)c7#O!5%zcW&AOv4gECTg!l#Vmf15Q%^`w83RR6_csMER0ypWc2DLZ3<1tr zP}`YbMd#8uN=P#DtM`%TKhMip7UJS*?c!;skMeL(UX1JEw!tKwXU^L 
zj=Eq;#@4s{31OFVe+WdMoV^Ul7_&~P@-SR^1`qheYfAst*($BLMJz)%meuzKiKePY zMlW>}7ww3nz}wi59pfP?GNtaKg^j|(5h8k_*@TX096GqpEZ@x?RV?lrkC1e)`Uc9s z(W6b$2tr7WZV}?v7Y4b1!s#k;3CTiKZ%1p3W_ErF)1vET)>C2|l-c$VaqJ+P%`qEv zl@gNpK+DCvt?+8eW1qScZ`}CE@Tfd7Zod1 zF@?%kB$u;jjxVV%?TkDu}3=3K)WGq^`I(5K3ps}x|05kJH!?_0EvUm-V6%u$~Fs_3HKqW)@-FCa# zX}WH^{kPm`!u_duv)ypri25yT$f5pdOZ};QvuwNWNcbFoKy@33HzoA%v+PS%bT~m$Lk@;K-=_XV?6968=;B=JLHxKELGzY3x7Rzf;kl25v&@zYPo_ ZC-Xnnyrv2UGE68axX2^qx^Vxc^)E4keO~|o literal 0 HcmV?d00001 diff --git a/tests/xlsx_reader.py b/tests/xlsx_reader.py new file mode 100644 index 0000000..6c8e062 --- /dev/null +++ b/tests/xlsx_reader.py @@ -0,0 +1,28 @@ +from ianalyzer_readers.readers.xlsx import XLSXReader +from ianalyzer_readers.readers.core import Field +from ianalyzer_readers.extract import CSV +import os + +here = os.path.abspath(os.path.dirname(__file__)) + + +class HamletXLSXReader(XLSXReader): + data_directory = os.path.join(here, 'xlsx_example') + + def sources(self, **kwargs): + for filename in os.listdir(self.data_directory): + full_path = os.path.join(self.data_directory, filename) + yield full_path, { + 'filename': filename + } + + fields = [ + Field( + name='character', + extractor=CSV(column='Character') + ), + Field( + name='lines', + extractor=CSV(column='Lines') + ) + ] From bae6c286372623804ab3bcac88fe812e125dd28e Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 4 Mar 2024 18:03:46 +0100 Subject: [PATCH 26/32] remove WIP statement from documentation --- docs/index.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index af2b109..3128a36 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,7 +1,5 @@ # Getting started -**This documentation is a work in progress.** - `ianalyzer-readers` is a python module to extract data from XML, HTML, CSV or XLSX files. 
This module was originally created for [I-analyzer](https://github.com/UUDigitalHumanitieslab/I-analyzer), a web application that extracts data from a variety of datasets, indexes them and presents a search interface. To do this, we wanted a way to extract data from source files without having to write a new script "from scratch" for each dataset, and an API that would work the same regardless of the source file type. From 1ecf4de8056a8786777aa159d4a64fb2ce081c04 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 14 Mar 2024 11:13:56 +0100 Subject: [PATCH 27/32] generate requirements.txt close #3 --- requirements.txt | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1d6e98c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,91 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --extra=dev +# +beautifulsoup4==4.12.3 + # via ianalyzer_readers (setup.py) +click==8.1.7 + # via + # mkdocs + # mkdocstrings +colorama==0.4.6 + # via griffe +et-xmlfile==1.1.0 + # via openpyxl +exceptiongroup==1.2.0 + # via pytest +ghp-import==2.1.0 + # via mkdocs +griffe==0.42.0 + # via mkdocstrings-python +iniconfig==2.0.0 + # via pytest +jinja2==3.1.3 + # via + # mkdocs + # mkdocstrings +lxml==5.1.0 + # via ianalyzer_readers (setup.py) +markdown==3.5.2 + # via + # mkdocs + # mkdocs-autorefs + # mkdocstrings + # mkdocstrings-python + # pymdown-extensions +markupsafe==2.1.5 + # via + # jinja2 + # mkdocs + # mkdocs-autorefs + # mkdocstrings +mergedeep==1.3.4 + # via mkdocs +mkdocs==1.5.3 + # via + # ianalyzer_readers (setup.py) + # mkdocs-autorefs + # mkdocstrings +mkdocs-autorefs==1.0.1 + # via mkdocstrings +mkdocstrings==0.24.1 + # via mkdocstrings-python +mkdocstrings-python==1.9.0 + # via ianalyzer_readers (setup.py) +openpyxl==3.1.2 + # via ianalyzer_readers 
(setup.py) +packaging==24.0 + # via + # mkdocs + # pytest +pathspec==0.12.1 + # via mkdocs +platformdirs==4.2.0 + # via + # mkdocs + # mkdocstrings +pluggy==1.4.0 + # via pytest +pymdown-extensions==10.7.1 + # via mkdocstrings +pytest==8.1.1 + # via ianalyzer_readers (setup.py) +python-dateutil==2.9.0.post0 + # via ghp-import +pyyaml==6.0.1 + # via + # mkdocs + # pymdown-extensions + # pyyaml-env-tag +pyyaml-env-tag==0.1 + # via mkdocs +six==1.16.0 + # via python-dateutil +soupsieve==2.5 + # via beautifulsoup4 +tomli==2.0.1 + # via pytest +watchdog==4.0.0 + # via mkdocs From 030df7b59156c97a6c2c26c9a7dd763a95613491 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 14 Mar 2024 11:18:20 +0100 Subject: [PATCH 28/32] add readthedocs.yaml --- readthedocs.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 readthedocs.yaml diff --git a/readthedocs.yaml b/readthedocs.yaml new file mode 100644 index 0000000..db6cd0b --- /dev/null +++ b/readthedocs.yaml @@ -0,0 +1,13 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.9" + +mkdocs: + configuration: mkdocs.yml + +python: + install: + - requirements: requirements.txt \ No newline at end of file From 59330643963866dfd6e570a06d98413235121395 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 14 Mar 2024 11:25:34 +0100 Subject: [PATCH 29/32] link documentation in readme --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 86c890c..1ce65d4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # I-analyzer Readers +[![Documentation Status](https://readthedocs.org/projects/ianalyzer-readers/badge/?version=latest)](https://ianalyzer-readers.readthedocs.io/en/latest/?badge=latest) + `ianalyzer-readers` is a python module to extract data from XML, HTML, CSV or XLSX files. 
This module was originally created for [I-analyzer](https://github.com/UUDigitalHumanitieslab/I-analyzer), a web application that extracts data from a variety of datasets, indexes them and presents a search interface. To do this, we wanted a way to extract data from source files without having to write a new script "from scratch" for each dataset, and an API that would work the same regardless of the source file type. @@ -28,11 +30,11 @@ What we find especially useful is that all subclasses of `Reader` have the same ## Usage -*Usage documentation is not yet complete.* +Typical usage of this package would be to make a custom Python class for a dataset from which you want to extract a list of documents. We call this a `Reader`. This package provides the base classes to structure readers, and provides extraction utilities for several file types. -Typical use is that, for each dataset you want to extract, you create a subclass of `Reader` and define required properties. See the [CSV test corpus](./tests/mock_csv_corpus.py) for an example. +For detailed usage documention and examples, visit [ianalyzer-readers.readthedocs.io](https://ianalyzer-readers.readthedocs.io/en/latest/) -After defining the class for your dataset, you can call the `documents()` method to get a generator of document dictionaries. +If this site is unavailable, you can also generate the documentation site locally; see the [contributing guide](./CONTRIBUTING.md) for insttructions. 
## Licence From 3fa985ac9b650f5ca6d86d6323a416ef36b44b81 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 14 Mar 2024 11:26:53 +0100 Subject: [PATCH 30/32] add status badge to readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1ce65d4..a94df60 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # I-analyzer Readers +[![Python package](https://github.com/UUDigitalHumanitieslab/ianalyzer-readers/actions/workflows/python-package.yml/badge.svg)](https://github.com/UUDigitalHumanitieslab/ianalyzer-readers/actions/workflows/python-package.yml) [![Documentation Status](https://readthedocs.org/projects/ianalyzer-readers/badge/?version=latest)](https://ianalyzer-readers.readthedocs.io/en/latest/?badge=latest) `ianalyzer-readers` is a python module to extract data from XML, HTML, CSV or XLSX files. From 0af295901e5b55cda60a349d9b506bed7fea71c6 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 14 Mar 2024 11:28:50 +0100 Subject: [PATCH 31/32] add documention URL in pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index a78c4cb..d867293 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ classifiers = [ [project.urls] Repository = "https://github.com/UUDigitalHumanitieslab/I-analyzer" +Documentation = "https://ianalyzer-readers.readthedocs.io/" [project.optional-dependencies] dev = ['pytest', 'mkdocs', 'mkdocstrings-python'] From 69889ea4f4fcc727b4147699a12650499d057cd6 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 14 Mar 2024 11:28:59 +0100 Subject: [PATCH 32/32] bump version number --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d867293..b73ea48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ianalyzer_readers" -version = "0.0.0" +version = "0.1.0" authors = [ 
{name="Utrecht University, Centre for Digital Humanities - Research Software Lab", email="cdh@uu.nl"} ]