stats recompute fixes: (#2022)
- fix stats_recompute_last() and stats_recompute_all() to not update the
  lastCrawl* properties of a crawl workflow if a crawl is running, as
  those stats now point to the running crawl
- refactor _add_running_curr_crawl_stats() to make it clear stats are only
  updated if a crawl is running
- stats_recompute_all(): change sort order to ascending to actually get the
  last crawl, not the first!
ikreymer authored Aug 26, 2024
1 parent 135c974 commit a1df689
Showing 1 changed file with 36 additions and 46 deletions.
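In short: the workflow's lastCrawl* fields now mirror the currently running crawl, so a recompute that unconditionally resets them would clobber live state. A minimal sketch of the intended behavior, using plain dicts in place of the Mongo documents (field names are from the diff; ids and values are hypothetical):

```python
# Sketch of the clobbering bug fixed in this commit. Plain dicts stand in
# for the workflow document and crawl records.

def recompute_last(workflow: dict, finished_crawls: list, crawl_running: bool) -> None:
    """New behavior: leave lastCrawl* untouched while a crawl is running."""
    if crawl_running:
        # lastCrawl* already reflect the running crawl; do not overwrite
        return
    if not finished_crawls:
        return
    last = max(finished_crawls, key=lambda c: c["finished"])  # most recent
    workflow.update(
        lastCrawlId=last["id"],
        lastCrawlState=last["state"],
        lastCrawlTime=last["finished"],
        lastCrawlStopping=False,
        isCrawlRunning=False,
    )

workflow = {"lastCrawlId": "running-crawl", "lastCrawlState": "running", "isCrawlRunning": True}
done = [{"id": "old-crawl", "state": "complete", "finished": "2024-08-01"}]

recompute_last(workflow, done, crawl_running=True)
assert workflow["lastCrawlId"] == "running-crawl"  # running stats preserved

recompute_last(workflow, done, crawl_running=False)
assert workflow["lastCrawlId"] == "old-crawl"      # finished stats applied
```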
backend/btrixcloud/crawlconfigs.py
```diff
@@ -529,7 +529,7 @@ async def get_crawl_configs(
             config = CrawlConfigOut.from_dict(res)
             # pylint: disable=invalid-name
             if not config.inactive:
-                self._add_curr_crawl_stats(config, await self.get_running_crawl(config))
+                await self._add_running_curr_crawl_stats(config)
             configs.append(config)
 
         return configs, total
```
```diff
@@ -554,14 +554,10 @@ async def get_crawl_config_info_for_profile(
 
         return results
 
-    async def get_running_crawl(
-        self, crawlconfig: Union[CrawlConfig, CrawlConfigOut]
-    ) -> Optional[CrawlOut]:
+    async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
         """Return the id of currently running crawl for this config, if any"""
         # crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id)
-        crawls, _ = await self.crawl_ops.list_crawls(
-            cid=crawlconfig.id, running_only=True
-        )
+        crawls, _ = await self.crawl_ops.list_crawls(cid=cid, running_only=True)
 
         if len(crawls) == 1:
             return crawls[0]
```
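get_running_crawl() now takes just the workflow id rather than a whole config object, so callers that only hold a cid no longer need to fetch the config first. A stand-alone sketch of its contract, with an in-memory stub in place of crawl_ops.list_crawls (the stub and sample ids are hypothetical):

```python
import asyncio
from typing import Optional
from uuid import UUID, uuid4

# Hypothetical in-memory stand-in for crawl_ops.list_crawls(cid=..., running_only=True)
RUNNING: dict = {}

async def list_crawls(cid: UUID, running_only: bool = True) -> tuple:
    crawls = RUNNING.get(cid, [])
    return crawls, len(crawls)

async def get_running_crawl(cid: UUID) -> Optional[str]:
    """Mirrors the refactored helper: at most one crawl runs per workflow."""
    crawls, _ = await list_crawls(cid=cid, running_only=True)
    if len(crawls) == 1:
        return crawls[0]
    return None

cid = uuid4()
RUNNING[cid] = ["crawl-123"]
assert asyncio.run(get_running_crawl(cid)) == "crawl-123"
assert asyncio.run(get_running_crawl(uuid4())) is None  # nothing running
```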
```diff
@@ -570,21 +566,22 @@ async def get_running_crawl(
 
     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         """recompute stats by incrementing size counter and number of crawls"""
-        update_query: dict[str, object] = {
-            "lastCrawlId": None,
-            "lastCrawlStartTime": None,
-            "lastStartedBy": None,
-            "lastCrawlTime": None,
-            "lastCrawlState": None,
-            "lastCrawlSize": None,
-            "lastCrawlStopping": False,
-            "isCrawlRunning": False,
-        }
+        update_query: dict[str, object] = {}
 
-        match_query = {"cid": cid, "finished": {"$ne": None}, "inactive": {"$ne": True}}
-        last_crawl = await self.crawls.find_one(
-            match_query, sort=[("finished", pymongo.DESCENDING)]
-        )
+        running_crawl = await self.get_running_crawl(cid)
+        # only look up last finished crawl if no crawls running, otherwise
+        # lastCrawl* stats are already for running crawl
+        if not running_crawl:
+            match_query = {
+                "cid": cid,
+                "finished": {"$ne": None},
+                "inactive": {"$ne": True},
+            }
+            last_crawl = await self.crawls.find_one(
+                match_query, sort=[("finished", pymongo.DESCENDING)]
+            )
+        else:
+            last_crawl = None
 
         if last_crawl:
             last_crawl_finished = last_crawl.get("finished")
```
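Note that the single-record lookup here still sorts DESCENDING, which is correct for find_one: the most recently finished crawl comes first. Only the full-collection loop in stats_recompute_all further down needed its sort flipped. A plain-Python rendering of what this query selects, over hypothetical records:

```python
# What find_one(match_query, sort=[("finished", DESCENDING)]) is asked to
# select, expressed over plain dicts. Records are hypothetical.
crawls = [
    {"cid": "w1", "finished": "2024-08-01", "inactive": False},
    {"cid": "w1", "finished": "2024-08-20", "inactive": False},
    {"cid": "w1", "finished": None, "inactive": False},  # still running: excluded
]

matching = [
    c for c in crawls
    if c["cid"] == "w1" and c["finished"] is not None and not c["inactive"]
]
last_crawl = max(matching, key=lambda c: c["finished"], default=None)
assert last_crawl is not None and last_crawl["finished"] == "2024-08-20"
```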
```diff
@@ -598,6 +595,8 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
             update_query["lastCrawlSize"] = sum(
                 file_.get("size", 0) for file_ in last_crawl.get("files", [])
             )
+            update_query["lastCrawlStopping"] = False
+            update_query["isCrawlRunning"] = False
 
             if last_crawl_finished:
                 update_query["lastRun"] = last_crawl_finished
```
```diff
@@ -616,16 +615,16 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
 
         return result is not None
 
-    def _add_curr_crawl_stats(
-        self, crawlconfig: CrawlConfigOut, crawl: Optional[CrawlOut]
-    ):
+    async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
         """Add stats from current running crawl, if any"""
+        crawl = await self.get_running_crawl(crawlconfig.id)
         if not crawl:
             return
 
         crawlconfig.lastCrawlState = crawl.state
         crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0
         crawlconfig.lastCrawlStopping = crawl.stopping
+        crawlconfig.isCrawlRunning = True
 
     async def get_crawl_config_out(self, cid: UUID, org: Organization):
         """Return CrawlConfigOut, including state of currently running crawl, if active
```
```diff
@@ -636,9 +635,7 @@ async def get_crawl_config_out(self, cid: UUID, org: Organization):
         )
 
         if not crawlconfig.inactive:
-            self._add_curr_crawl_stats(
-                crawlconfig, await self.get_running_crawl(crawlconfig)
-            )
+            await self._add_running_curr_crawl_stats(crawlconfig)
 
         if crawlconfig.profileid:
             crawlconfig.profileName = await self.profiles.get_profile_name(
```
```diff
@@ -715,7 +712,7 @@ async def make_inactive_or_delete(
 
         query = {"inactive": True}
 
-        is_running = await self.get_running_crawl(crawlconfig) is not None
+        is_running = await self.get_running_crawl(crawlconfig.id) is not None
 
         if is_running:
             raise HTTPException(status_code=400, detail="crawl_running_cant_deactivate")
```
```diff
@@ -829,7 +826,7 @@ async def run_now_internal(
         """run new crawl for specified crawlconfig now"""
         self.org_ops.can_write_data(org)
 
-        if await self.get_running_crawl(crawlconfig):
+        if await self.get_running_crawl(crawlconfig.id):
             raise HTTPException(status_code=400, detail="crawl_already_running")
 
         profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
```
```diff
@@ -924,20 +921,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
     Should only be called when a crawl completes from operator or on migration
     when no crawls are running.
     """
-    update_query: dict[str, object] = {
-        "crawlCount": 0,
-        "crawlSuccessfulCount": 0,
-        "totalSize": 0,
-        "lastCrawlId": None,
-        "lastCrawlStartTime": None,
-        "lastStartedBy": None,
-        "lastStartedByName": None,
-        "lastCrawlTime": None,
-        "lastCrawlState": None,
-        "lastCrawlSize": None,
-        "lastCrawlStopping": False,
-        "isCrawlRunning": False,
-    }
+    update_query: dict[str, object] = {}
 
     match_query = {"cid": cid, "finished": {"$ne": None}}
     count = await crawls.count_documents(match_query)
```
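Starting from an empty update_query reads as: write nothing unless a finished crawl is actually selected. The old prefilled dict force-reset the counters and lastCrawl* fields even when those stats belonged to a running crawl. In dict terms (values hypothetical):

```python
# Only keys placed into update_query get written back; an empty dict
# leaves the stored workflow untouched. Values here are hypothetical.
stored = {"crawlCount": 5, "totalSize": 900, "lastCrawlId": "running-crawl"}

update_query: dict = {}      # new behavior: start empty
# ...fields are added only when a finished crawl exists and none is running...

stored.update(update_query)  # $set-like application over plain dicts
assert stored["crawlCount"] == 5  # nothing was reset
```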
```diff
@@ -950,7 +934,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
     last_crawl: Optional[dict[str, object]] = None
     last_crawl_size = 0
 
-    async for res in crawls.find(match_query).sort("finished", pymongo.DESCENDING):
+    async for res in crawls.find(match_query).sort("finished", pymongo.ASCENDING):
         files = res.get("files", [])
         crawl_size = 0
         for file in files:
```
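This is the sort-order bug called out in the commit message: the loop overwrites last_crawl on every iteration, so whatever the cursor yields last wins. Under DESCENDING that was the oldest finished crawl; ASCENDING makes it the newest. A quick self-contained demonstration (timestamps hypothetical):

```python
# last_crawl ends up as the *final* record the loop sees.
finished = ["2024-08-01", "2024-08-10", "2024-08-20"]  # hypothetical

def last_seen(order_desc: bool) -> str:
    last_crawl = None
    for ts in sorted(finished, reverse=order_desc):
        last_crawl = ts  # overwritten every iteration, as in the diff
    return last_crawl

assert last_seen(order_desc=True) == "2024-08-01"   # old bug: oldest crawl won
assert last_seen(order_desc=False) == "2024-08-20"  # fix: actually the last crawl
```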
```diff
@@ -964,7 +948,11 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
             last_crawl = res
             last_crawl_size = crawl_size
 
-    if last_crawl:
+    # only update last_crawl if no crawls running, otherwise
+    # lastCrawl* stats are already for running crawl
+    running_crawl = await crawl_configs.get_running_crawl(cid)
+
+    if last_crawl and not running_crawl:
         update_query["totalSize"] = total_size
         update_query["crawlSuccessfulCount"] = successful_count
 
```
```diff
@@ -974,6 +962,8 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID):
         update_query["lastStartedByName"] = last_crawl.get("userName")
         update_query["lastCrawlState"] = last_crawl.get("state")
         update_query["lastCrawlSize"] = last_crawl_size
+        update_query["lastCrawlStopping"] = False
+        update_query["isCrawlRunning"] = False
 
         last_crawl_finished = last_crawl.get("finished")
         update_query["lastCrawlTime"] = last_crawl_finished
```