slapos.grid.promise: prevent json file corruption while updating data

history and statistics files are sometimes corrupted because of invalid json We do 2 things to improve this: 1. we use json.load to read the previous file (this will detect if the file is corrupted) 2. we use json.dump to write the entire file at once in a temporary file and then we move this file to be atomic. Probably this will consume more resources. See merge request !728

slapos.grid.promise: prevent json file corruption while updating data
history and statistics files are sometimes corrupted because of invalid json We do 2 things to improve this: 1. we use json.load to read the previous file (this will detect if the file is corrupted) 2. we use json.dump to write the entire file at once in a temporary file and then we move this file to be atomic. Probably this will consume more resources. See merge request !728
d42de18b · Alain Takoudjou · Thomas Gambier · 6e26288e · d42de18b · d42de18b
Commit d42de18b authored Dec 17, 2024 by Alain Takoudjou Committed by Thomas Gambier Jan 13, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 92 additions and 25 deletions

slapos/grid/promise/__init__.py slapos/grid/promise/__init__.py +47 -25

slapos/tests/test_promise.py slapos/tests/test_promise.py +45 -0

No files found.
--- a/slapos/grid/promise/__init__.py
+++ b/slapos/grid/promise/__init__.py
@@ -41,6 +41,7 @@ import traceback
 import psutil
 import inspect
 import hashlib
+import errno
 from datetime import datetime
 from multiprocessing import Process, Queue as MQueue
 from six.moves import queue, reload_module
@@ -57,6 +58,7 @@ from slapos.grid.promise.wrapper import WrapPromise
 from slapos.version import version

 PROMISE_CACHE_FOLDER_NAME = '.slapgrid/promise/cache'
+TMP_FOLDER_NAME = '.slapgrid/promise/tmp'

 class PromiseError(Exception):
  pass
@@ -354,6 +356,11 @@ class PromiseLauncher(object):
      mkdir_p(self.promise_history_output_dir)
      self._updateFolderOwner()

+    self.tmp_dir = os.path.join(self.partition_folder, TMP_FOLDER_NAME)
+    if not os.path.exists(self.tmp_dir):
+      mkdir_p(self.tmp_dir)
+      self._updateFolderOwner(self.tmp_dir)
+
  def _generatePromiseResult(self, promise_process, promise_name, promise_path,
      message, execution_time=0):
    if self.check_anomaly:
@@ -450,6 +457,7 @@ class PromiseLauncher(object):
      self.promise_history_output_dir,
      '%s.history.json' % result.title
    )
+    tmp_history_file = os.path.join(self.tmp_dir, '%s.history.json' % result.title)

    # Remove useless informations
    result_dict = state_dict.pop('result')
@@ -458,12 +466,20 @@ class PromiseLauncher(object):
    state_dict.pop('path', '')
    state_dict.pop('type', '')
    state_dict["status"] = "ERROR" if result.item.hasFailed() else "OK"
+    history_dict = {
+      "date": time.time(),
+      "data": [state_dict]
+    }

-    if not os.path.exists(history_file) or not os.stat(history_file).st_size:
-      with open(history_file, 'w') as f:
-        f.write("""{
-          "date": %s,
-          "data": %s}""" % (time.time(), json.dumps([state_dict])))
+    try:
+      with open(history_file) as f:
+        history_dict = json.load(f)
+    except OSError as e:
+      if e.errno != errno.ENOENT:
+        raise
+    except ValueError:
+      # Broken json, use default history_dict
+      pass
    else:
      previous_state_list = previous_state_dict.get(result.name, None)
      if previous_state_list is not None:
@@ -473,36 +489,42 @@ class PromiseLauncher(object):
            current_sum == checksum:
          # Only save the changes and not the same info
          return
-  
+
      state_dict.pop('title', '')
      state_dict.pop('name', '')
-      with open (history_file, mode="r+") as f:
-        f.seek(0,2)
-        f.seek(f.tell() -2)
-        f.write('%s}' % ',{}]'.format(json.dumps(state_dict)))
+      history_dict['data'].append(state_dict)
+
+    with open(tmp_history_file, mode="w") as f:
+      json.dump(history_dict, f)
+    os.rename(tmp_history_file, history_file)

  def _saveStatisticsData(self, stat_file_path, date, success, error):
    # csv-like document for success/error statictics
-    if not os.path.exists(stat_file_path) or os.stat(stat_file_path).st_size == 0:
-      with open(stat_file_path, 'w') as fstat:
-        fstat.write("""{
-          "date": %s,
-          "data": ["Date, Success, Error, Warning"]}""" % time.time())
-  
+    stat_dict = {
+      "date": time.time(),
+      "data": ["Date, Success, Error, Warning"]
+    }
+    try:
+      with open(stat_file_path) as f:
+        stat_dict = json.load(f)
+    except OSError as e:
+      if e.errno != errno.ENOENT:
+        raise
+    except ValueError:
+      # Broken json, we use default
+      pass
+
+    tmp_stat_file = os.path.join(self.tmp_dir,
+                                 os.path.basename(stat_file_path))
    current_state = '%s, %s, %s, %s' % (
        date,
        success,
        error,
        '')
-  
-    # append to file
-    # XXX this is bad, it is getting everywhere.
-    if current_state:
-      with open (stat_file_path, mode="r+") as fstat:
-        fstat.seek(0,2)
-        position = fstat.tell() -2
-        fstat.seek(position)
-        fstat.write('%s}' % ',"{}"]'.format(current_state))
+    stat_dict['data'].append(current_state)
+    with open (tmp_stat_file, mode="w") as f:
+      json.dump(stat_dict, f)
+    os.rename(tmp_stat_file, stat_file_path)

  def _loadPromiseResult(self, promise_title):
    promise_output_file = os.path.join(

--- a/slapos/tests/test_promise.py
+++ b/slapos/tests/test_promise.py
@@ -566,6 +566,51 @@ class RunPromise(GenericPromise):
 }""")


+  def test_runpromise_with_currupted_history(self):
+    promise_name = 'my_promise'
+    self.configureLauncher()
+    self.generatePromiseScript('my_promise.py', success=True, periodicity=0.03)
+
+    self.launcher.run()
+    self.assertSuccessResult(promise_name)
+
+    # will fail if history json is broken
+    self.assertSuccessHistoryResult(promise_name)
+    history_file = os.path.join(self.partition_dir,
+                                PROMISE_HISTORY_RESULT_FOLDER_NAME,
+                                '%s.history.json' % promise_name)
+    stats_file = os.path.join(self.partition_dir,
+                              PROMISE_STATE_FOLDER_NAME,
+                              'promise_stats.json')
+    broken_history = """{
+	"data": [{
+		"failed": false,
+		"message": "success",
+		"name": "%(name)s.py",
+		"status": "OK",
+		"title": "%(name)s"
+	}, {]
+}""" % {'name': promise_name}
+
+    with open(history_file, 'w') as f:
+      f.write(broken_history)
+    with open(stats_file, 'w') as f:
+      f.write("""{
+	"data": ["Date, Success,
+""")
+    time.sleep(4)
+
+    with self.assertRaises(ValueError):
+      self.assertSuccessHistoryResult(promise_name)
+    with self.assertRaises(ValueError):
+      self.assertSuccessStatsResult(1)
+
+    self.launcher.run()
+    self.assertSuccessResult(promise_name)
+    # Promise history json is not broken anymore
+    self.assertSuccessHistoryResult(promise_name)
+    # Statistics data file is not broken anymore
+    self.assertSuccessStatsResult(1)

  def test_runpromise_no_logdir(self):
    promise_name = 'my_promise.py'