Commit 098f49de authored by Juan Antonio Navarro Perez's avatar Juan Antonio Navarro Perez Committed by Commit Bot

[pinboard] Only process/upload data from the latest 6 months

Helps avoid mixing recent data from results "without patch" (which are
shifted to one year in the past), with old data that actually ran a
year ago.

Also helps ensure that the size of the CSV file with the uploaded
dataset does not grow without bounds.

Bug: 1006723
Change-Id: I9e5f847563e5ab62c937ef7236fd10094fd003eb
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1819501
Reviewed-by: Ross McIlroy <rmcilroy@chromium.org>
Commit-Queue: Juan Antonio Navarro Pérez <perezju@chromium.org>
Cr-Commit-Position: refs/heads/master@{#698914}
parent 88f8b24c
...@@ -169,9 +169,12 @@ def AggregateAndUploadResults(state): ...@@ -169,9 +169,12 @@ def AggregateAndUploadResults(state):
cached_results = CachedFilePath(DATASET_PKL_FILE) cached_results = CachedFilePath(DATASET_PKL_FILE)
dfs = [] dfs = []
keep_revisions = set(item['revision'] for item in state)
if os.path.exists(cached_results): if os.path.exists(cached_results):
# To speed things up, we take the cache computed from previous results. # To speed things up, we take the cache computed from previous results.
df = pd.read_pickle(cached_results) df = pd.read_pickle(cached_results)
# Drop possible old data from revisions no longer in recent state.
df = df[df['revision'].isin(keep_revisions)]
dfs.append(df) dfs.append(df)
known_revisions = set(df['revision']) known_revisions = set(df['revision'])
else: else:
...@@ -329,8 +332,8 @@ def LoadJsonFile(filename): ...@@ -329,8 +332,8 @@ def LoadJsonFile(filename):
return json.load(f) return json.load(f)
def Yesterday(): def TimeAgo(**kwargs):
return pd.Timestamp.now(TZ) - pd.DateOffset(days=1) return pd.Timestamp.now(TZ) - pd.DateOffset(**kwargs)
def SetUpLogging(level): def SetUpLogging(level):
...@@ -350,6 +353,12 @@ def SetUpLogging(level): ...@@ -350,6 +353,12 @@ def SetUpLogging(level):
logger.addHandler(h2) logger.addHandler(h2)
def SelectRecentRevisions(state, months=6):
  """Filter out old revisions from state, keeping only recent data.

  Args:
    state: A list of dicts, each with a 'timestamp' key. Assumes the
        timestamp is an ISO-format date/time string ('YYYY-MM-DD...'),
        so lexicographic comparison is a valid chronological comparison
        — TODO confirm against the producer of state.
    months: How far back (in months) revisions are considered recent.
        Defaults to 6, the original hard-coded window.

  Returns:
    A new list containing only the items of state whose timestamp is
    more recent than `months` months ago.
  """
  # TimeAgo(...) yields a pd.Timestamp; .date() then str() gives a
  # 'YYYY-MM-DD' cutoff comparable with the ISO timestamp strings.
  from_date = str(TimeAgo(months=months).date())
  return [item for item in state if item['timestamp'] > from_date]
def Main(): def Main():
SetUpLogging(level=logging.INFO) SetUpLogging(level=logging.INFO)
actions = ('start', 'collect', 'upload') actions = ('start', 'collect', 'upload')
...@@ -360,7 +369,7 @@ def Main(): ...@@ -360,7 +369,7 @@ def Main():
"results, 'upload' aggregated data, or 'auto' to do all in " "results, 'upload' aggregated data, or 'auto' to do all in "
"sequence.")) "sequence."))
parser.add_argument( parser.add_argument(
'--date', type=lambda s: pd.Timestamp(s, tz=TZ), default=Yesterday(), '--date', type=lambda s: pd.Timestamp(s, tz=TZ), default=TimeAgo(days=1),
help=('Run jobs for the last commit landed on the given date (assuming ' help=('Run jobs for the last commit landed on the given date (assuming '
'MTV time). Defaults to the last commit landed yesterday.')) 'MTV time). Defaults to the last commit landed yesterday.'))
args = parser.parse_args() args = parser.parse_args()
...@@ -372,10 +381,11 @@ def Main(): ...@@ -372,10 +381,11 @@ def Main():
try: try:
if 'start' in args.actions: if 'start' in args.actions:
StartPinpointJobs(state, args.date) StartPinpointJobs(state, args.date)
recent_state = SelectRecentRevisions(state)
if 'collect' in args.actions: if 'collect' in args.actions:
CollectPinpointResults(state) CollectPinpointResults(recent_state)
finally: finally:
UpdateJobsState(state) UpdateJobsState(state)
if 'upload' in args.actions: if 'upload' in args.actions:
AggregateAndUploadResults(state) AggregateAndUploadResults(recent_state)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment