-
Notifications
You must be signed in to change notification settings - Fork 108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Clean input data rules #10224
Merged
amaltaro
merged 2 commits into
dmwm:master
from
todor-ivanov:feature_MSRuleCleaner_DeleteInputRules_fix-10017
Feb 23, 2021
Merged
Clean input data rules #10224
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,6 +34,17 @@ | |
from Utils.Pipeline import Pipeline, Functor | ||
from WMCore.WMException import WMException | ||
from WMCore.Services.LogDB.LogDB import LogDB | ||
from WMCore.Services.WMStatsServer.WMStatsServer import WMStatsServer | ||
from WMCore.MicroService.Unified.Common import findParent | ||
|
||
|
||
class MSRuleCleanerResolveParentError(WMException): | ||
""" | ||
WMCore exception class for the MSRuleCleaner module, in WMCore MicroServices, | ||
used to signal if an error occurred during parent dataset resolution step. | ||
""" | ||
def __init__(self, message): | ||
super(MSRuleCleanerResolveParentError, self).__init__(message) | ||
|
||
|
||
class MSRuleCleanerArchivalError(WMException): | ||
|
@@ -81,15 +92,20 @@ def __init__(self, msConfig, logger=None): | |
self.logDB = LogDB(self.msConfig["logDBUrl"], | ||
self.msConfig["logDBReporter"], | ||
logger=self.logger) | ||
self.wmstatsSvc = WMStatsServer(self.msConfig['wmstatsUrl'], logger=self.logger) | ||
|
||
# Building all the Pipelines: | ||
pName = 'plineMSTrCont' | ||
self.plineMSTrCont = Pipeline(name=pName, | ||
funcLine=[Functor(self.setPlineMarker, pName), | ||
Functor(self.setParentDatasets), | ||
todor-ivanov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Functor(self.getRucioRules, 'container', self.msConfig['rucioMStrAccount']), | ||
Functor(self.cleanRucioRules)]) | ||
pName = 'plineMSTrBlock' | ||
self.plineMSTrBlock = Pipeline(name=pName, | ||
funcLine=[Functor(self.setPlineMarker, pName), | ||
Functor(self.setParentDatasets), | ||
Functor(self.getRucioRules, 'block', self.msConfig['rucioMStrAccount']), | ||
Functor(self.cleanRucioRules)]) | ||
pName = 'plineAgentCont' | ||
self.plineAgentCont = Pipeline(name=pName, | ||
|
@@ -130,6 +146,32 @@ def __init__(self, msConfig, logger=None): | |
self.wfCounters = {'cleaned': {}, | ||
'archived': {'normalArchived': 0, | ||
'forceArchived': 0}} | ||
self.globalLocks = set() | ||
|
||
def getGlobalLocks(self): | ||
""" | ||
Fetches the list of 'globalLocks' from wmstats server and the list of | ||
'parentLocks' from request manager. Stores/updates the unified set in | ||
the 'globalLocks' instance variable. Returns the resultant unified set. | ||
:return: A union set of the 'globalLocks' and the 'parentLocks' lists | ||
""" | ||
self.logger.info("Fetching globalLocks list from wmstats server.") | ||
try: | ||
globalLocks = set(self.wmstatsSvc.getGlobalLocks()) | ||
except Exception as ex: | ||
msg = "Failed to refresh global locks list for the current polling cycle. Error: %s " | ||
msg += "Skipping this polling cycle." | ||
self.logger.error(msg, str(ex)) | ||
raise ex | ||
self.logger.info("Fetching parentLocks list from reqmgr2 server.") | ||
try: | ||
parentLocks = set(self.reqmgr2.getParentLocks()) | ||
except Exception as ex: | ||
msg = "Failed to refresh parent locks list for the current poling cycle. Error: %s " | ||
msg += "Skipping this polling cycle." | ||
self.logger.error(msg, str(ex)) | ||
raise ex | ||
self.globalLocks = globalLocks | parentLocks | ||
|
||
def resetCounters(self): | ||
""" | ||
|
@@ -152,7 +194,6 @@ def execute(self, reqStatus): | |
self.currThreadIdent = self.currThread.name | ||
self.updateReportDict(summary, "thread_id", self.currThreadIdent) | ||
self.resetCounters() | ||
|
||
self.logger.info("MSRuleCleaner is running in mode: %s.", self.mode) | ||
|
||
# Build the list of workflows to work on: | ||
|
@@ -167,6 +208,7 @@ def execute(self, reqStatus): | |
|
||
# Call _execute() and feed the relevant pipeline with the objects popped from requestRecords | ||
try: | ||
self.getGlobalLocks() | ||
totalNumRequests, cleanNumRequests, normalArchivedNumRequests, forceArchivedNumRequests = self._execute(requestRecords) | ||
msg = "\nNumber of processed workflows: %s." | ||
msg += "\nNumber of properly cleaned workflows: %s." | ||
|
@@ -280,6 +322,11 @@ def _dispatchWflow(self, wflow): | |
for pline in self.cleanuplines: | ||
try: | ||
pline.run(wflow) | ||
except MSRuleCleanerResolveParentError as ex: | ||
msg = "%s: Parentage Resolve Error: %s. " | ||
msg += "Will retry again in the next cycle." | ||
self.logger.error(msg, pline.name, ex.message()) | ||
continue | ||
except Exception as ex: | ||
msg = "%s: General error from pipeline. Workflow: %s. Error: \n%s. " | ||
msg += "\nWill retry again in the next cycle." | ||
|
@@ -528,30 +575,75 @@ def getMSOutputTransferInfo(self, wflow): | |
wflow['TransferDone'] = True | ||
return wflow | ||
|
||
def setParentDatasets(self, wflow): | ||
""" | ||
Used to resolve parent datasets for a workflow. | ||
:param wflow: A MSRuleCleaner workflow representation | ||
:return: The workflow object | ||
""" | ||
if wflow['InputDataset'] and wflow['IncludeParents']: | ||
childDataset = wflow['InputDataset'] | ||
parentDataset = findParent([childDataset], self.msConfig['dbsUrl']) | ||
todor-ivanov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# NOTE: If findParent() returned None then the DBS service failed to | ||
# resolve the request (it is considered an ERROR outside WMCore) | ||
if parentDataset.get(childDataset, None) is None: | ||
msg = "Failed to resolve parent dataset for: %s in workflow: %s" % (childDataset, wflow['RequestName']) | ||
raise MSRuleCleanerResolveParentError(msg) | ||
elif parentDataset: | ||
wflow['ParentDataset'] = [parentDataset[childDataset]] | ||
msg = "Found parent %s for input dataset %s in workflow: %s " | ||
self.logger.info(msg, parentDataset, wflow['InputDataset'], wflow['RequestName']) | ||
else: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This else block is no longer needed. But that's a minor and I won't ask you to update this code once again. |
||
msg = "Could not find parent for input dataset: %s in workflows: %s" | ||
self.logger.error(msg, wflow['InputDataset'], wflow['RequestName']) | ||
return wflow | ||
|
||
def getRucioRules(self, wflow, gran, rucioAcct): | ||
""" | ||
Queries Rucio and builds the relevant list of blocklevel rules for | ||
the given workflow | ||
:param wflow: A MSRuleCleaner workflow representation | ||
:param gran: Data granularity to search for Rucio rules. Possible values: | ||
'block' || 'container' | ||
'block' or 'container' | ||
:return: The workflow object | ||
""" | ||
currPline = wflow['PlineMarkers'][-1] | ||
# Find all the output placement rules created by the agents | ||
for dataCont in wflow['OutputDatasets']: | ||
if gran == 'container': | ||
for rule in self.rucio.listDataRules(dataCont, account=rucioAcct): | ||
wflow['RulesToClean'][currPline].append(rule['id']) | ||
elif gran == 'block': | ||
try: | ||
blocks = self.rucio.getBlocksInContainer(dataCont) | ||
for block in blocks: | ||
for rule in self.rucio.listDataRules(block, account=rucioAcct): | ||
wflow['RulesToClean'][currPline].append(rule['id']) | ||
except WMRucioDIDNotFoundException: | ||
msg = "Container: %s not found in Rucio for workflow: %s." | ||
|
||
# Create the container list to the rucio account map and set the checkGlobalLocks flag. | ||
mapRuleType = {self.msConfig['rucioWmaAccount']: ["OutputDatasets"], | ||
self.msConfig['rucioMStrAccount']: ["InputDataset", "MCPileup", | ||
"DataPileup", "ParentDataset"]} | ||
if rucioAcct == self.msConfig['rucioMStrAccount']: | ||
checkGlobalLocks = True | ||
else: | ||
checkGlobalLocks = False | ||
|
||
# Find all the data placement rules created by the components: | ||
for dataType in mapRuleType[rucioAcct]: | ||
dataList = wflow[dataType] if isinstance(wflow[dataType], list) else [wflow[dataType]] | ||
for dataCont in dataList: | ||
self.logger.debug("getRucioRules: dataCont: %s", pformat(dataCont)) | ||
if checkGlobalLocks and dataCont in self.globalLocks: | ||
msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the " | ||
msg += "RulesToClean list for both container and block level Rules for workflow: %s!" | ||
self.logger.info(msg, dataCont, wflow['RequestName']) | ||
continue | ||
if gran == 'container': | ||
for rule in self.rucio.listDataRules(dataCont, account=rucioAcct): | ||
wflow['RulesToClean'][currPline].append(rule['id']) | ||
msg = "Found %s container-level rule to be deleted for container %s" | ||
self.logger.info(msg, rule['id'], dataCont) | ||
elif gran == 'block': | ||
try: | ||
blocks = self.rucio.getBlocksInContainer(dataCont) | ||
for block in blocks: | ||
for rule in self.rucio.listDataRules(block, account=rucioAcct): | ||
wflow['RulesToClean'][currPline].append(rule['id']) | ||
msg = "Found %s block-level rule to be deleted for container %s" | ||
self.logger.info(msg, rule['id'], dataCont) | ||
except WMRucioDIDNotFoundException: | ||
msg = "Container: %s not found in Rucio for workflow: %s." | ||
self.logger.info(msg, dataCont, wflow['RequestName']) | ||
return wflow | ||
|
||
def cleanRucioRules(self, wflow): | ||
|
@@ -581,12 +673,6 @@ def cleanRucioRules(self, wflow): | |
|
||
# Set the cleanup flag: | ||
wflow['CleanupStatus'][currPline] = all(delResults) | ||
# ---------------------------------------------------------------------- | ||
# FIXME : To be removed once the plineMSTrBlock && plineMSTrCont are | ||
# developed | ||
if wflow['CleanupStatus'][currPline] in ['plineMSTrBlock', 'plineMSTrCont']: | ||
wflow['CleanupStatus'][currPline] = True | ||
# ---------------------------------------------------------------------- | ||
return wflow | ||
|
||
def getRequestRecords(self, reqStatus): | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And I just noticed these default values don't match the default values in the unit test
testTaskChainDefaults
.DataPileup, MCPileup and ParentDataset are inconsistent.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I have not yet started paying attention to the final version of the unit tests. I was about to do that in a later stage. Once we agree on the implementation of all the rest. Because we kind of depend on that for the unit tests step.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Isn't the DataPileup/MCPileup/ParentDataset default value supposed to be a list instead of None?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think everything is as it should be, you just forget about this:
https://github.com/dmwm/WMCore/pull/10224/files/2f4a6275b7e0af14e3f42aabbca6f40470503ca9#diff-414c04814f1133dc334c7718010c11e70809389d23ec30e9d00d5e8874b84bf5R149-R152
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's precisely my point. This data structure defines the schema of the workflow object used throughout MSRuleCleaner. By just looking at this, I'd expect those keys to have a value equal to
None
, not an empty list.Does the whole logic - besides that explicit code you just referred to - change if you set default value to
[]
instead of None? If not and we get that for free, then I'd suggest to update the default value to make it consistent with the outcome of this workflow parser.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should work, but what would happen is that this conversion from a string to list will happen silently in the workflow parser class here: [1] instead of explicitly in the three lines I previously mentioned. I have tested it both ways, and now I do not remember which exactly but there was a corner case in which the silent conversion of the data structure was not preferable. So I preferred in the
-wfParser()
method to make the best effort to revert the structure back to the original one as described in the document template (which reflects the values as we expect them from reqmgr), and only then make the explicit conversion to lists only for those I really want and make them obvious in theinit()
method of MSRuleCleanerWflow.[1]
https://github.com/dmwm/WMCore/pull/10224/files/2f4a6275b7e0af14e3f42aabbca6f40470503ca9#diff-414c04814f1133dc334c7718010c11e70809389d23ec30e9d00d5e8874b84bf5R110-R111
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, I just tested it again.... And not even a corner case but a rather general one. It tries to create a list of all letters present in all the MCPilup dataset names it can find and explodes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, let it be then. Thanks for checking this out Todor.