From b1eecc46b405f94db7820d1baf9e3e590be00865 Mon Sep 17 00:00:00 2001 From: CaptainOfHacks <39195263+CaptainOfHacks@users.noreply.github.com> Date: Tue, 12 Mar 2024 20:59:51 +0200 Subject: [PATCH] add eform xml checker --- .../fetch_eform_notices_by_date.py | 2 +- .../notebooks/eform_xml_saxon_indexer.ipynb | 171 +++++++++++++----- .../notice_fetcher/services/notice_fetcher.py | 7 +- 3 files changed, 129 insertions(+), 51 deletions(-) diff --git a/dags/eforms_notices/fetch_eform_notices_by_date.py b/dags/eforms_notices/fetch_eform_notices_by_date.py index 5b8e7e98..7c9f6598 100644 --- a/dags/eforms_notices/fetch_eform_notices_by_date.py +++ b/dags/eforms_notices/fetch_eform_notices_by_date.py @@ -42,7 +42,7 @@ def fetch_by_date_notice_from_ted(): notice-subtype IN (10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24) AND FT~"eforms-sdk-" AND PD={date_wildcard} """ - + print(f"START for date: {date_wildcard}") notice_ids = notice_fetcher_by_query_pipeline(query=query) if not notice_ids: log_warning("No notices has been fetched!") diff --git a/ted_sws/data_sampler/entrypoints/notebooks/eform_xml_saxon_indexer.ipynb b/ted_sws/data_sampler/entrypoints/notebooks/eform_xml_saxon_indexer.ipynb index 1f3d28e7..a26e83de 100644 --- a/ted_sws/data_sampler/entrypoints/notebooks/eform_xml_saxon_indexer.ipynb +++ b/ted_sws/data_sampler/entrypoints/notebooks/eform_xml_saxon_indexer.ipynb @@ -2,82 +2,155 @@ "cells": [ { "cell_type": "code", - "execution_count": 24, - "id": "initial_id", + "outputs": [], + "source": [ + "from ted_sws.notice_fetcher.adapters.ted_api import TedAPIAdapter, TedRequestAPI\n", + "\n" + ], "metadata": { - "collapsed": true, + "collapsed": false, "ExecuteTime": { - "end_time": "2024-02-19T19:22:32.755823Z", - "start_time": "2024-02-19T19:22:32.752040Z" + "end_time": "2024-02-23T18:59:38.621562Z", + "start_time": "2024-02-23T18:59:38.053490Z" } }, + "id": "dda6b7c4ceddae43", + "execution_count": 1 + }, + { + "cell_type": "code", "outputs": [], "source": [ - "from typing import Generator, Optional, Set\n", - "import xml.etree.ElementTree as XMLElementTree\n", - "import re\n", - "from io import StringIO\n", + "date_wildcard = \"20240223*\"\n", "\n", + "query = f\"\"\" TD NOT IN (C E G I D P M Q O R 0 1 2 3 4 5 6 7 8 9 B S Y V F A H J K) AND \n", + " notice-subtype IN (10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24) AND FT~\"eforms-sdk-\" AND\n", + " PD=20240223*\n", + " \"\"\"\n", "\n", - "INCLUDE_ATTRIBUTES_KEYS_VALUES = {\"schemeName\", \"unitCode\", \"listName\"}\n", - "EXCLUDE_ATTRIBUTES_KEYS_VALUES = {\"nuts\", \"country\", \"cpv\"}\n", + "eforms_query = {\"query\" : query}\n", "\n", - "\n", - "def get_unique_xpath_generator(xml_content: str,\n", - " remove_namespaces: bool = True,\n", - " include_values_by_attribute_names: Optional[Set[str]] = None,\n", - " exclude_attribute_values: Optional[Set[str]] = None\n", - " ) -> Generator[str, None, None]:\n", - " xml_file = StringIO(xml_content)\n", - " path = []\n", - " it = XMLElementTree.iterparse(xml_file, events=('start', 'end'))\n", - " for evt, el in it:\n", - " if evt == 'start':\n", - " if remove_namespaces:\n", - " ns_tag = re.split('[{}]', el.tag, 2)[1:]\n", - " path.append(ns_tag[1] if len(ns_tag) > 1 else el.tag)\n", - " else:\n", - " path.append(el.tag)\n", - " xpath = \"/\" + '/'.join(path)\n", - " for attribute_key, attribute_value in el.attrib.items():\n", - " if (attribute_key in include_values_by_attribute_names) and (\n", - " attribute_value not in exclude_attribute_values):\n", - " yield f\"{xpath}@{attribute_key}={attribute_value}\"\n", - " else:\n", - " yield f\"{xpath}@{attribute_key}\"\n", - " yield xpath\n", - " else:\n", - " path.pop()" - ] + "ted_api_adapter = TedAPIAdapter(TedRequestAPI())" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-26T19:14:28.950233Z", + "start_time": "2024-02-26T19:14:28.946442Z" + } + }, + "id": "2b65da8994567c48", + "execution_count": 27 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "documents_generator = ted_api_adapter.get_generator_by_query(query=eforms_query, load_content=None)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-26T19:14:30.558789Z", + "start_time": "2024-02-26T19:14:30.555013Z" + } + }, + "id": "e4823d454454f2c5", + "execution_count": 28 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "document = next(documents_generator)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-26T19:14:37.685403Z", + "start_time": "2024-02-26T19:14:32.157983Z" + } + }, + "id": "58ccdf5539bf3ee3", + "execution_count": 29 }, { "cell_type": "code", "outputs": [ { "data": { - "text/plain": "{'/PriorInformationNotice',\n '/PriorInformationNotice/ContractFolderID',\n '/PriorInformationNotice/ContractingParty',\n '/PriorInformationNotice/ContractingParty/BuyerProfileURI',\n '/PriorInformationNotice/ContractingParty/ContractingActivity',\n '/PriorInformationNotice/ContractingParty/ContractingActivity/ActivityTypeCode',\n '/PriorInformationNotice/ContractingParty/ContractingActivity/ActivityTypeCode@listName=authority-activity',\n '/PriorInformationNotice/ContractingParty/ContractingPartyType',\n '/PriorInformationNotice/ContractingParty/ContractingPartyType/PartyTypeCode',\n '/PriorInformationNotice/ContractingParty/ContractingPartyType/PartyTypeCode@listName=buyer-legal-type',\n '/PriorInformationNotice/ContractingParty/Party',\n '/PriorInformationNotice/ContractingParty/Party/PartyIdentification',\n '/PriorInformationNotice/ContractingParty/Party/PartyIdentification/ID',\n '/PriorInformationNotice/ContractingParty/Party/PartyIdentification/ID@schemeName=organization',\n '/PriorInformationNotice/ContractingParty/Party/ServiceProviderParty',\n '/PriorInformationNotice/ContractingParty/Party/ServiceProviderParty/Party',\n '/PriorInformationNotice/ContractingParty/Party/ServiceProviderParty/Party/PartyIdentification',\n '/PriorInformationNotice/ContractingParty/Party/ServiceProviderParty/Party/PartyIdentification/ID',\n '/PriorInformationNotice/ContractingParty/Party/ServiceProviderParty/Party/PartyIdentification/ID@schemeName=organization',\n '/PriorInformationNotice/ContractingParty/Party/ServiceProviderParty/ServiceTypeCode',\n '/PriorInformationNotice/ContractingParty/Party/ServiceProviderParty/ServiceTypeCode@listName=organisation-role',\n '/PriorInformationNotice/CustomizationID',\n '/PriorInformationNotice/ID',\n '/PriorInformationNotice/ID@schemeName=notice-id',\n '/PriorInformationNotice/IssueDate',\n '/PriorInformationNotice/IssueTime',\n '/PriorInformationNotice/NoticeLanguageCode',\n '/PriorInformationNotice/NoticeTypeCode',\n '/PriorInformationNotice/NoticeTypeCode@listName=competition',\n '/PriorInformationNotice/ProcurementProject',\n '/PriorInformationNotice/ProcurementProject/Description',\n '/PriorInformationNotice/ProcurementProject/Description@languageID',\n '/PriorInformationNotice/ProcurementProject/ID',\n '/PriorInformationNotice/ProcurementProject/MainCommodityClassification',\n '/PriorInformationNotice/ProcurementProject/MainCommodityClassification/ItemClassificationCode',\n '/PriorInformationNotice/ProcurementProject/MainCommodityClassification/ItemClassificationCode@listName',\n '/PriorInformationNotice/ProcurementProject/Name',\n '/PriorInformationNotice/ProcurementProject/Name@languageID',\n '/PriorInformationNotice/ProcurementProject/Note',\n '/PriorInformationNotice/ProcurementProject/Note@languageID',\n '/PriorInformationNotice/ProcurementProject/PlannedPeriod',\n '/PriorInformationNotice/ProcurementProject/PlannedPeriod/EndDate',\n '/PriorInformationNotice/ProcurementProject/ProcurementTypeCode',\n '/PriorInformationNotice/ProcurementProject/ProcurementTypeCode@listName=contract-nature',\n '/PriorInformationNotice/ProcurementProject/RealizedLocation',\n '/PriorInformationNotice/ProcurementProject/RealizedLocation/Address',\n '/PriorInformationNotice/ProcurementProject/RealizedLocation/Address/Country',\n '/PriorInformationNotice/ProcurementProject/RealizedLocation/Address/Country/IdentificationCode',\n '/PriorInformationNotice/ProcurementProject/RealizedLocation/Address/Country/IdentificationCode@listName',\n '/PriorInformationNotice/ProcurementProjectLot',\n '/PriorInformationNotice/ProcurementProjectLot/ID',\n '/PriorInformationNotice/ProcurementProjectLot/ID@schemeName=Lot',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/Description',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/Description@languageID',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/ID',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/MainCommodityClassification',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/MainCommodityClassification/ItemClassificationCode',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/MainCommodityClassification/ItemClassificationCode@listName',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/Name',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/Name@languageID',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/Note',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/Note@languageID',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/PlannedPeriod',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/PlannedPeriod/EndDate',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/ProcurementTypeCode',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/ProcurementTypeCode@listName=contract-nature',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/RealizedLocation',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/RealizedLocation/Address',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/RealizedLocation/Address/Country',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/RealizedLocation/Address/Country/IdentificationCode',\n '/PriorInformationNotice/ProcurementProjectLot/ProcurementProject/RealizedLocation/Address/Country/IdentificationCode@listName',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/AuctionTerms',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/AuctionTerms/AuctionConstraintIndicator',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/ContractingSystem',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/ContractingSystem/ContractingSystemTypeCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/ContractingSystem/ContractingSystemTypeCode@listName=dps-usage',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/ContractingSystem/ContractingSystemTypeCode@listName=framework-agreement',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/EconomicOperatorShortList',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/EconomicOperatorShortList/LimitationDescription',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/SubmissionMethodCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/SubmissionMethodCode@listName=esubmission',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/UBLExtensions',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/UBLExtensions/UBLExtension',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/UBLExtensions/UBLExtension/ExtensionContent',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/InterestExpressionReceptionPeriod',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/InterestExpressionReceptionPeriod/EndDate',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingProcess/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/InterestExpressionReceptionPeriod/EndTime',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AdditionalInformationParty',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AdditionalInformationParty/PartyIdentification',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AdditionalInformationParty/PartyIdentification/ID',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AdditionalInformationParty/PartyIdentification/ID@schemeName=organization',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AppealTerms',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AppealTerms/AppealReceiverParty',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AppealTerms/AppealReceiverParty/PartyIdentification',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AppealTerms/AppealReceiverParty/PartyIdentification/ID',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AppealTerms/AppealReceiverParty/PartyIdentification/ID@schemeName=organization',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AppealTerms/MediationParty',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AppealTerms/MediationParty/PartyIdentification',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AppealTerms/MediationParty/PartyIdentification/ID',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AppealTerms/MediationParty/PartyIdentification/ID@schemeName=organization',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion/AwardingCriterionTypeCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion/AwardingCriterionTypeCode@listName=award-criterion-type',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion/UBLExtensions',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion/UBLExtensions/UBLExtension',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion/UBLExtensions/UBLExtension/ExtensionContent',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/AwardCriterionParameter',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/AwardCriterionParameter/ParameterCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/AwardCriterionParameter/ParameterCode@listName=number-weight',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/AwardingTerms/AwardingCriterion/SubordinateAwardingCriterion/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/AwardCriterionParameter/ParameterNumeric',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/CallForTendersDocumentReference',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/CallForTendersDocumentReference/Attachment',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/CallForTendersDocumentReference/Attachment/ExternalReference',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/CallForTendersDocumentReference/Attachment/ExternalReference/URI',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/CallForTendersDocumentReference/DocumentType',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/CallForTendersDocumentReference/ID',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/CallForTendersDocumentReference/LanguageID',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/ContractExecutionRequirement',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/ContractExecutionRequirement/ExecutionRequirementCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/ContractExecutionRequirement/ExecutionRequirementCode@listName=ecatalog-submission',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/ContractExecutionRequirement/ExecutionRequirementCode@listName=einvoicing',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/ContractExecutionRequirement/ExecutionRequirementCode@listName=reserved-execution',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/FundingProgramCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/FundingProgramCode@listName=eu-funded',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/Language',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/Language/ID',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/PostAwardProcess',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/PostAwardProcess/ElectronicOrderUsageIndicator',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/PostAwardProcess/ElectronicPaymentUsageIndicator',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/RequiredFinancialGuarantee',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/RequiredFinancialGuarantee/GuaranteeTypeCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/RequiredFinancialGuarantee/GuaranteeTypeCode@listName=tender-guarantee-required',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/TenderRecipientParty',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/TenderRecipientParty/EndpointID',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/TenderRecipientParty/PartyIdentification',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/TenderRecipientParty/PartyIdentification/ID',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/TenderRecipientParty/PartyIdentification/ID@schemeName=organization',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/TendererQualificationRequest',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/TendererQualificationRequest/CompanyLegalFormCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/TendererQualificationRequest/SpecificTendererRequirement',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/TendererQualificationRequest/SpecificTendererRequirement/TendererRequirementTypeCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/TendererQualificationRequest/SpecificTendererRequirement/TendererRequirementTypeCode@listName=reserved-procurement',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension/ExtensionContent',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/SelectionCriteria',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/SelectionCriteria/CalculationExpressionCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/SelectionCriteria/CalculationExpressionCode@listName=usage',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/SelectionCriteria/CriterionTypeCode',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/SelectionCriteria/CriterionTypeCode@listName=selection-criterion',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/SelectionCriteria/Description',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/SelectionCriteria/Description@languageID',\n '/PriorInformationNotice/ProcurementProjectLot/TenderingTerms/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/SelectionCriteria/SecondStageIndicator',\n '/PriorInformationNotice/RegulatoryDomain',\n '/PriorInformationNotice/TenderingProcess',\n '/PriorInformationNotice/TenderingProcess/Description',\n '/PriorInformationNotice/TenderingProcess/Description@languageID',\n '/PriorInformationNotice/TenderingProcess/ProcedureCode',\n '/PriorInformationNotice/TenderingProcess/ProcedureCode@listName=procurement-procedure-type',\n '/PriorInformationNotice/TenderingTerms',\n '/PriorInformationNotice/TenderingTerms/TendererQualificationRequest',\n '/PriorInformationNotice/TenderingTerms/TendererQualificationRequest/SpecificTendererRequirement',\n '/PriorInformationNotice/TenderingTerms/TendererQualificationRequest/SpecificTendererRequirement/Description',\n '/PriorInformationNotice/TenderingTerms/TendererQualificationRequest/SpecificTendererRequirement/Description@languageID',\n '/PriorInformationNotice/TenderingTerms/TendererQualificationRequest/SpecificTendererRequirement/TendererRequirementTypeCode',\n '/PriorInformationNotice/TenderingTerms/TendererQualificationRequest/SpecificTendererRequirement/TendererRequirementTypeCode@listName=exclusion-ground',\n '/PriorInformationNotice/UBLExtensions',\n '/PriorInformationNotice/UBLExtensions/UBLExtension',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Changes',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Changes/Change',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Changes/Change/ChangedSectionIdentifier',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Changes/ChangeReason',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Changes/ChangeReason/ReasonCode',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Changes/ChangeReason/ReasonCode@languageID',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Changes/ChangeReason/ReasonCode@listName=change-corrig-justification',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/NoticeSubType',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/NoticeSubType/SubTypeCode',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/Contact',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/Contact/ElectronicMail',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/Contact/Name',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/Contact/Telephone',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PartyIdentification',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PartyIdentification/ID',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PartyIdentification/ID@schemeName=organization',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PartyLegalEntity',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PartyLegalEntity/CompanyID',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PartyName',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PartyName/Name',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PartyName/Name@languageID',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PostalAddress',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PostalAddress/CityName',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PostalAddress/Country',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PostalAddress/Country/IdentificationCode',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PostalAddress/Country/IdentificationCode@listName',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PostalAddress/PostalZone',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/PostalAddress/StreetName',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Organizations/Organization/Company/WebsiteURI',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Publication',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Publication/GazetteID',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Publication/GazetteID@schemeName=ojs-id',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Publication/NoticePublicationID',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Publication/NoticePublicationID@schemeName=ojs-notice-id',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/Publication/PublicationDate',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/TransmissionDate',\n '/PriorInformationNotice/UBLExtensions/UBLExtension/ExtensionContent/EformsExtension/TransmissionTime',\n '/PriorInformationNotice/UBLVersionID',\n '/PriorInformationNotice/VersionID'}" + "text/plain": "{'PD': '2024-02-23T00:00:00Z',\n 'publication-number': '112393-2024',\n 'ND': '112393-2024',\n 'links': {'xml': {'MUL': 'https://ted.europa.eu/en/notice/112393-2024/xml'},\n 'pdf': {'BUL': 'https://ted.europa.eu/bg/notice/112393-2024/pdf',\n 'SPA': 'https://ted.europa.eu/es/notice/112393-2024/pdf',\n 'CES': 'https://ted.europa.eu/cs/notice/112393-2024/pdf',\n 'DAN': 'https://ted.europa.eu/da/notice/112393-2024/pdf',\n 'DEU': 'https://ted.europa.eu/de/notice/112393-2024/pdf',\n 'EST': 'https://ted.europa.eu/et/notice/112393-2024/pdf',\n 'ELL': 'https://ted.europa.eu/el/notice/112393-2024/pdf',\n 'ENG': 'https://ted.europa.eu/en/notice/112393-2024/pdf',\n 'FRA': 'https://ted.europa.eu/fr/notice/112393-2024/pdf',\n 'GLE': 'https://ted.europa.eu/ga/notice/112393-2024/pdf',\n 'HRV': 'https://ted.europa.eu/hr/notice/112393-2024/pdf',\n 'ITA': 'https://ted.europa.eu/it/notice/112393-2024/pdf',\n 'LAV': 'https://ted.europa.eu/lv/notice/112393-2024/pdf',\n 'LIT': 'https://ted.europa.eu/lt/notice/112393-2024/pdf',\n 'HUN': 'https://ted.europa.eu/hu/notice/112393-2024/pdf',\n 'MLT': 'https://ted.europa.eu/mt/notice/112393-2024/pdf',\n 'NLD': 'https://ted.europa.eu/nl/notice/112393-2024/pdf',\n 'POL': 'https://ted.europa.eu/pl/notice/112393-2024/pdf',\n 'POR': 'https://ted.europa.eu/pt/notice/112393-2024/pdf',\n 'RON': 'https://ted.europa.eu/ro/notice/112393-2024/pdf',\n 'SLK': 'https://ted.europa.eu/sk/notice/112393-2024/pdf',\n 'SLV': 'https://ted.europa.eu/sl/notice/112393-2024/pdf',\n 'FIN': 'https://ted.europa.eu/fi/notice/112393-2024/pdf',\n 'SWE': 'https://ted.europa.eu/sv/notice/112393-2024/pdf'},\n 'pdfs': {'ELL': 'https://ted.europa.eu/el/notice/112393-2024/pdfs'},\n 'html': {'BUL': 'https://ted.europa.eu/bg/notice/-/detail/112393-2024',\n 'SPA': 'https://ted.europa.eu/es/notice/-/detail/112393-2024',\n 'CES': 'https://ted.europa.eu/cs/notice/-/detail/112393-2024',\n 'DAN': 'https://ted.europa.eu/da/notice/-/detail/112393-2024',\n 'DEU': 'https://ted.europa.eu/de/notice/-/detail/112393-2024',\n 'EST': 'https://ted.europa.eu/et/notice/-/detail/112393-2024',\n 'ELL': 'https://ted.europa.eu/el/notice/-/detail/112393-2024',\n 'ENG': 'https://ted.europa.eu/en/notice/-/detail/112393-2024',\n 'FRA': 'https://ted.europa.eu/fr/notice/-/detail/112393-2024',\n 'GLE': 'https://ted.europa.eu/ga/notice/-/detail/112393-2024',\n 'HRV': 'https://ted.europa.eu/hr/notice/-/detail/112393-2024',\n 'ITA': 'https://ted.europa.eu/it/notice/-/detail/112393-2024',\n 'LAV': 'https://ted.europa.eu/lv/notice/-/detail/112393-2024',\n 'LIT': 'https://ted.europa.eu/lt/notice/-/detail/112393-2024',\n 'HUN': 'https://ted.europa.eu/hu/notice/-/detail/112393-2024',\n 'MLT': 'https://ted.europa.eu/mt/notice/-/detail/112393-2024',\n 'NLD': 'https://ted.europa.eu/nl/notice/-/detail/112393-2024',\n 'POL': 'https://ted.europa.eu/pl/notice/-/detail/112393-2024',\n 'POR': 'https://ted.europa.eu/pt/notice/-/detail/112393-2024',\n 'RON': 'https://ted.europa.eu/ro/notice/-/detail/112393-2024',\n 'SLK': 'https://ted.europa.eu/sk/notice/-/detail/112393-2024',\n 'SLV': 'https://ted.europa.eu/sl/notice/-/detail/112393-2024',\n 'FIN': 'https://ted.europa.eu/fi/notice/-/detail/112393-2024',\n 'SWE': 'https://ted.europa.eu/sv/notice/-/detail/112393-2024'},\n 'htmlDirect': {'BUL': 'https://ted.europa.eu/bg/notice/112393-2024/html',\n 'SPA': 'https://ted.europa.eu/es/notice/112393-2024/html',\n 'CES': 'https://ted.europa.eu/cs/notice/112393-2024/html',\n 'DAN': 'https://ted.europa.eu/da/notice/112393-2024/html',\n 'DEU': 'https://ted.europa.eu/de/notice/112393-2024/html',\n 'EST': 'https://ted.europa.eu/et/notice/112393-2024/html',\n 'ELL': 'https://ted.europa.eu/el/notice/112393-2024/html',\n 'ENG': 'https://ted.europa.eu/en/notice/112393-2024/html',\n 'FRA': 'https://ted.europa.eu/fr/notice/112393-2024/html',\n 'GLE': 'https://ted.europa.eu/ga/notice/112393-2024/html',\n 'HRV': 'https://ted.europa.eu/hr/notice/112393-2024/html',\n 'ITA': 'https://ted.europa.eu/it/notice/112393-2024/html',\n 'LAV': 'https://ted.europa.eu/lv/notice/112393-2024/html',\n 'LIT': 'https://ted.europa.eu/lt/notice/112393-2024/html',\n 'HUN': 'https://ted.europa.eu/hu/notice/112393-2024/html',\n 'MLT': 'https://ted.europa.eu/mt/notice/112393-2024/html',\n 'NLD': 'https://ted.europa.eu/nl/notice/112393-2024/html',\n 'POL': 'https://ted.europa.eu/pl/notice/112393-2024/html',\n 'POR': 'https://ted.europa.eu/pt/notice/112393-2024/html',\n 'RON': 'https://ted.europa.eu/ro/notice/112393-2024/html',\n 'SLK': 'https://ted.europa.eu/sk/notice/112393-2024/html',\n 'SLV': 'https://ted.europa.eu/sl/notice/112393-2024/html',\n 'FIN': 'https://ted.europa.eu/fi/notice/112393-2024/html',\n 'SWE': 'https://ted.europa.eu/sv/notice/112393-2024/html'}}}" }, - "execution_count": 25, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from tests.test_data import EFORMS_SAMPLE_FILE_PATH\n", - "\n", - "xml_content = EFORMS_SAMPLE_FILE_PATH.read_text(encoding=\"utf-8\")\n", - "xpaths = set(get_unique_xpath_generator(xml_content))\n", - "xpaths" + "document" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-26T19:14:37.696903Z", + "start_time": "2024-02-26T19:14:37.690174Z" + } + }, + "id": "e26fbea0e097342a", + "execution_count": 30 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "from dateutil import rrule\n", + "from datetime import datetime\n", + "def generate_wildcards_foreach_day_in_range(start_date: str, end_date: str) -> list:\n", + " \"\"\"\n", + " Given a date range returns all daily dates in that range\n", + " :param start_date:\n", + " :param end_date:\n", + " :return:\n", + " \"\"\"\n", + " return [dt.strftime('%Y%m%d*')\n", + " for dt in rrule.rrule(rrule.DAILY,\n", + " dtstart=datetime.strptime(start_date, '%Y%m%d'),\n", + " until=datetime.strptime(end_date, '%Y%m%d'))]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-23T19:07:30.688571Z", + "start_time": "2024-02-23T19:07:30.686751Z" + } + }, + "id": "a6bd930a2c0b4862", + "execution_count": 15 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "['20231130*',\n '20231201*',\n '20231202*',\n '20231203*',\n '20231204*',\n '20231205*',\n '20231206*',\n '20231207*',\n '20231208*',\n '20231209*',\n '20231210*',\n '20231211*',\n '20231212*',\n '20231213*',\n '20231214*',\n '20231215*',\n '20231216*',\n '20231217*',\n '20231218*',\n '20231219*',\n '20231220*',\n '20231221*',\n '20231222*',\n '20231223*',\n '20231224*',\n '20231225*',\n '20231226*',\n '20231227*',\n '20231228*',\n '20231229*',\n '20231230*',\n '20231231*',\n '20240101*',\n '20240102*',\n '20240103*',\n '20240104*',\n '20240105*']" + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate_wildcards_foreach_day_in_range(start_date=\"20231130\", end_date=\"20240105\")" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-02-19T19:22:36.132395Z", - "start_time": "2024-02-19T19:22:36.119736Z" + "end_time": "2024-02-23T19:08:21.108888Z", + "start_time": "2024-02-23T19:08:21.106249Z" } }, - "id": "7cf5a47a4e2ae402", - "execution_count": 25 + "id": "1ade3194dfb8d958", + "execution_count": 18 } ], "metadata": { diff --git a/ted_sws/notice_fetcher/services/notice_fetcher.py b/ted_sws/notice_fetcher/services/notice_fetcher.py index 10310e88..8f6713ed 100644 --- a/ted_sws/notice_fetcher/services/notice_fetcher.py +++ b/ted_sws/notice_fetcher/services/notice_fetcher.py @@ -7,6 +7,7 @@ from ted_sws.core.model.notice import Notice from ted_sws.data_manager.adapters.repository_abc import NoticeRepositoryABC from ted_sws.notice_fetcher.adapters.ted_api_abc import TedAPIAdapterABC +from ted_sws.notice_metadata_processor.services.metadata_normalizer import check_if_xml_manifestation_is_eform class NoticeFetcherABC(abc.ABC): @@ -112,7 +113,11 @@ def fetch_notices_by_query(self, query: dict) -> List[str]: notice_ids = set() for document in documents: notice_ids.add(document["ND"]) - self.notice_repository.add(notice=self._create_notice(notice_data=document)) + notice = self._create_notice(notice_data=document) + if check_if_xml_manifestation_is_eform(notice.xml_manifestation.object_data): + self.notice_repository.add(notice=notice) + else: + print(f"Notice [{notice.ted_id}] is standard form!") return list(notice_ids) def fetch_notices_by_date_range(self, start_date: date, end_date: date) -> List[str]: