Commit 397e9cd0 authored by Lisa Casino's avatar Lisa Casino

promise: new promise

The new promise reports the three biggest partitions when free disk space
is low, and the three fastest-growing partitions when the disk is predicted to fill up soon.
parent d652a769
from __future__ import division
from zope.interface import implementer
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
import os
import sys
import pwd
import sqlite3
import argparse
import datetime
import psutil
from slapos.collect.db import Database
@implementer(interface.IPromise)
class RunPromise(GenericPromise):
    """Promise reporting low free disk space, a predicted disk fill-up and
    high inode usage, based on the data stored in the collector database.

    When one of the disk-space problems is detected, the partitions using the
    most space (or growing the fastest) are logged as well.
    """

    # text looked for in sqlite3.OperationalError to recognise a locked database
    _LOCKED_MESSAGE = "database is locked"
    # format used to parse the timestamps returned by the queries
    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
    # divisor used to report byte counts as "Giga" (float: true division
    # even without `from __future__ import division`)
    _GIGA = float(1024 * 1024 * 1024)
    _SECONDS_PER_DAY = 3600. * 24

    def __init__(self, config):
        super(RunPromise, self).__init__(config)
        # check disk space at least every 3 minutes
        self.setPeriodicity(minute=3)

    def _isToleratedLock(self, error):
        """Tell whether `error` is a "database is locked" failure to skip.

        If the database is still locked after timeout expiration (another
        process is using it), the check is skipped and retried at next run,
        until raiseOnDatabaseLocked() decides the lock lasted too long.
        """
        return (self._LOCKED_MESSAGE in str(error) and
                not self.raiseOnDatabaseLocked(self._LOCKED_MESSAGE))

    @staticmethod
    def _closeQuietly(database):
        """Close `database`, ignoring any failure (best-effort cleanup)."""
        try:
            database.close()
        except Exception:
            pass

    def biggestPartitions(self, db_path, date, time, limit=3):
        """Return the `limit` partitions with the highest disk usage.

        Only rows of the "folder" table recorded during the 24 hours preceding
        `date` `time` are considered; for each partition the most recent row
        is used.

        Returns a list of (partition, disk_used*1024, datetime string) rows
        ordered by decreasing usage, or None when there is no data or when the
        database is locked (and the lock is still tolerated).
        Raises sqlite3.OperationalError for any other database failure.
        """
        database = Database(db_path, create=False, timeout=10)
        try:
            database.connect()
            date_time = date + ' ' + time
            where_query = (
                "datetime(date || ' ' || time) >= datetime('%s', '-1 days') "
                "AND datetime(date || ' ' || time) <= datetime('%s')")
            result = database.select(
                "folder",
                columns="partition, disk_used*1024, max(datetime(date || ' ' || time))",
                where=where_query % (date_time, date_time),
                group="partition",
                order="disk_used DESC",
                limit=limit).fetchall()
            if not result or not result[0]:
                self.logger.info("No result from collector database: skipped")
                return None
        except sqlite3.OperationalError as e:
            if self._isToleratedLock(e):
                return None
            raise
        finally:
            self._closeQuietly(database)
        return result

    def fastestPartitions(self, db_path, disk_partition, date, time, day_range, limit=3):
        """Return the `limit` partitions whose disk usage grows the fastest.

        For each partition two rows of the "folder" table are fetched: the
        most recent one and the oldest one within `day_range` days. The slope
        between them is the growth per day. Partitions whose two rows are
        identical, or span one day or less, are ignored.

        `disk_partition` is not used; it is kept for interface compatibility.

        Returns a list of (partition, slope, date_min, date_max, delta_days)
        tuples sorted by decreasing slope, or None when there is no data or
        when the database is locked (and the lock is still tolerated).
        Raises sqlite3.OperationalError for any other database failure.
        """
        database = Database(db_path, create=False, timeout=10)
        try:
            database.connect()
            date_time = date + ' ' + time
            where_query = "datetime(date) >= datetime('%s', '-1 days') AND datetime(date) <= datetime('%s')"
            # max(...) makes SQLite take the other columns from the most
            # recent row of each partition instead of an arbitrary one
            result_max = database.select(
                "folder",
                columns="partition, disk_used*1024, max(datetime(date || ' ' || time))",
                where=where_query % (date_time, date_time),
                group="partition",
                order="partition").fetchall()
            if not result_max or not result_max[0]:
                self.logger.info("No result (max) from collector database: skipped")
                return None
            result_min = database.select(
                "folder",
                columns="partition, disk_used*1024, min(datetime(date || ' ' || time))",
                where="datetime(date || ' ' || time) >= datetime('%s', '-%s days')" % (result_max[0][2], day_range),
                group="partition",
                order="partition").fetchall()
            if not result_min or (result_min == result_max):
                self.logger.info("No result (min) from collector database: skipped")
                return None
            # the two queries may not return the same set of partitions, so
            # rows are matched by partition name rather than by position
            oldest_by_partition = dict((row[0], row) for row in result_min)
            ranked_results = []
            for newest in result_max:
                oldest = oldest_by_partition.get(newest[0])
                # no older data for this partition OR information are the same
                if oldest is None or oldest == newest:
                    continue
                user, size_max, date_max = newest
                size_min, date_min = oldest[1], oldest[2]
                timespan = (
                    datetime.datetime.strptime(date_max, self._TIME_FORMAT) -
                    datetime.datetime.strptime(date_min, self._TIME_FORMAT))
                delta_days = timespan.total_seconds() / self._SECONDS_PER_DAY
                # not enough history to compute a meaningful slope
                if int(delta_days) <= 1:
                    continue
                # slope/(1024*1024*1024) = number of giga per day
                slope = (size_max - size_min) / delta_days
                ranked_results.append((user, slope, date_min, date_max, delta_days))
            ranked_results.sort(key=lambda tup: tup[1], reverse=True)
        except sqlite3.OperationalError as e:
            if self._isToleratedLock(e):
                return None
            raise
        finally:
            self._closeQuietly(database)
        return ranked_results[:limit]

    def getDaysUntilFull(self, disk_partition, database, date, time, day_range):
        """Returns estimation of days until the disk_partition would become full

        It uses date and time in order to find current disk free percentage, then rewinds
        day_range back in history and calculates average speed of losing free space, which
        is assumed constant and used to predict in how many days the disk would become full.

        Returns a tuple (days_until_full, min_date, min_free_ratio, max_date,
        max_free_ratio, delta_days), or None when data is missing, when free
        space increased, when no change can be measured, or when the database
        is locked (and the lock is still tolerated).
        Raises sqlite3.OperationalError for any other database failure.
        """
        database = Database(database, create=False, timeout=10)
        try:
            database.connect()
            result_max = database.select(
                "disk",
                date=date,
                columns="free*1.0/(used+free) AS percent, max(datetime(date || ' ' || time))",
                where="time between '%s:00' and '%s:30' and partition='%s'" % (time, time, disk_partition),
                limit=1).fetchone()
            if not result_max or not result_max[1]:
                return None
            result_min = database.select(
                "disk",
                columns="free*1.0/(used+free) AS percent, min(datetime(date || ' ' || time))",
                where="datetime(date || ' ' || time) >= datetime('%s', '-%s days') and partition='%s'" % (result_max[1], day_range, disk_partition),
                limit=1).fetchone()
            if not result_min or not result_min[1] or result_min == result_max:
                return None
            free_now, date_now = result_max[0], result_max[1]
            free_before, date_before = result_min[0], result_min[1]
            change = free_now - free_before
            # free space increased: the disk is not filling up
            if change > 0.:
                return None
            timespan = (
                datetime.datetime.strptime(date_now, self._TIME_FORMAT) -
                datetime.datetime.strptime(date_before, self._TIME_FORMAT))
            delta_days = timespan.total_seconds() / self._SECONDS_PER_DAY
            try:
                days_until_full = -free_now / (change / delta_days)
            except ZeroDivisionError:
                # no data
                return None
            return (days_until_full, date_before, free_before, date_now,
                    free_now, delta_days)
        except sqlite3.OperationalError as e:
            if self._isToleratedLock(e):
                return None
            raise
        finally:
            self._closeQuietly(database)

    def getDiskSize(self, disk_partition, database):
        """Return free+used of the most recent "disk" row of `disk_partition`.

        Returns None when there is no data or when the database is locked
        (and the lock is still tolerated).
        Raises sqlite3.OperationalError for any other database failure.
        """
        database = Database(database, create=False, timeout=10)
        try:
            # fetch disk size
            database.connect()
            result = database.select(
                "disk",
                columns="free+used",
                where="partition='%s'" % (disk_partition),
                order="datetime(date || ' ' || time) DESC",
                limit=1).fetchone()
            if not result or not result[0]:
                return None
            disk_size = result[0]
        except sqlite3.OperationalError as e:
            if self._isToleratedLock(e):
                return None
            raise
        finally:
            self._closeQuietly(database)
        return disk_size

    def getFreeSpace(self, disk_partition, database, date, time):
        """Return the free space recorded for `disk_partition` at `date` `time`.

        `time` is an 'HH:MM' string; rows recorded between second 00 and
        second 30 of that minute are looked up.

        Returns the value as an int, or 0 when there is no data or when the
        database is locked (and the lock is still tolerated).
        Raises sqlite3.OperationalError for any other database failure.
        """
        database = Database(database, create=False, timeout=10)
        try:
            # fetch free disk space
            database.connect()
            where_query = "time between '%s:00' and '%s:30' and partition='%s'" % (time, time, disk_partition)
            result = database.select(
                "disk",
                date=date,
                columns="free",
                where=where_query).fetchone()
            if not result or not result[0]:
                self.logger.info("No result from collector database: disk check skipped")
                return 0
            disk_free = result[0]
        except sqlite3.OperationalError as e:
            if self._isToleratedLock(e):
                return 0
            raise
        finally:
            self._closeQuietly(database)
        return int(disk_free)

    def raiseOnDatabaseLocked(self, locked_message):
        """Decide whether a locked database must now be reported as a failure.

        Looks at the last 10 promise results. Returns True when the latest
        one already holds an ERROR mentioning `locked_message`, or when all
        10 hold a WARNING mentioning it. Otherwise returns False; a warning
        is logged in that case, except when fewer than 10 results exist.
        """
        max_warn = 10
        latest_result_list = self.getLastPromiseResultList(result_count=max_warn)
        if len(latest_result_list) < max_warn:
            return False

        for result in latest_result_list[0]:
            if result['status'] == "ERROR" and locked_message in result["message"]:
                return True

        # count consecutive results (in list order) holding the lock warning
        warning_count = 0
        for result_list in latest_result_list:
            warned = any(
                result['status'] == "WARNING" and locked_message in result["message"]
                for result in result_list)
            if not warned:
                break
            warning_count += 1

        if warning_count == max_warn:
            # too many warning on database locked, now fail.
            return True

        # Logger.warn is a deprecated alias of Logger.warning
        self.logger.warning("collector database is locked by another process")
        return False

    @staticmethod
    def _checkInodeUsage(path):
        """Return an error message when 98% or more of the inodes of the
        filesystem holding `path` are used, None otherwise."""
        stat = os.statvfs(path)
        total_inode = stat.f_files
        # filesystems reporting 0 inodes are skipped (avoids dividing by zero)
        if total_inode:
            usage = 100 * (total_inode - stat.f_ffree) / total_inode
            if usage >= 98:
                return "Disk Inodes usage is really high: %.4f%%" % usage

    def getInodeUsage(self, path):
        """Return the inode usage error message for `path` or, when /tmp is
        a mount point, for /tmp; an empty string when both are fine."""
        message = self._checkInodeUsage(path)
        if not message and os.path.ismount('/tmp'):
            message = self._checkInodeUsage('/tmp')
        return message or ""

    def sense(self):
        """Run the disk checks and log the outcome.

        Logs an error when the disk hosting the partition folder cannot be
        found, when the disk is predicted to become full in less than
        `threshold-days` days, when free space is below `threshold` (in
        Giga), or when inode usage is too high.
        """
        # find if a disk is mounted on the path
        disk_partition = ""
        db_path = self.getConfig('collectordb')
        check_date = self.getConfig('test-check-date')
        path = os.path.join(self.getPartitionFolder(), "") + "extrafolder"
        partitions = psutil.disk_partitions()
        # walk up the directory tree until a mount point is found
        while not disk_partition and path != '/':
            parent = os.path.dirname(path)
            if parent == path:
                # nothing left to strip (e.g. relative path): avoid looping forever
                break
            path = parent
            for p in partitions:
                if p.mountpoint == path:
                    disk_partition = p.device
                    break
        if not disk_partition:
            self.logger.error("Couldn't find disk partition")
            return

        if db_path.endswith("collector.db"):
            db_path = db_path[:-len("collector.db")]

        if check_date:
            # testing mode
            currentdate = check_date
            currenttime = self.getConfig('test-check-time', '09:17')
            disk_partition = self.getConfig('test-disk-partition', '/dev/sda1')
        else:
            # get last minute; date and time come from the same instant so
            # that they stay consistent right after midnight
            last_minute = datetime.datetime.utcnow() - datetime.timedelta(minutes=1)
            currentdate = last_minute.strftime('%Y-%m-%d')
            currenttime = last_minute.time().strftime('%H:%M')

        disk_size = self.getDiskSize(disk_partition, db_path)
        default_threshold = None
        if disk_size is not None:
            # default threshold: 5% of the disk size, in Giga
            default_threshold = round(disk_size / self._GIGA * 0.05, 2)
        threshold = float(self.getConfig('threshold', default_threshold) or 2.0)
        threshold_days = float(self.getConfig('threshold-days', '20'))

        free_space = self.getFreeSpace(disk_partition, db_path, currentdate, currenttime)
        days_until_full_tuple = self.getDaysUntilFull(
            disk_partition, db_path, currentdate, currenttime, threshold_days / 2)
        if days_until_full_tuple is not None:
            days_until_full, min_date, min_free, max_date, max_free, day_span = days_until_full_tuple
            message = "Disk will become full in %.2f days (threshold: %.2f days), checked from %s to %s, %.2f days span" % (
                days_until_full, threshold_days, min_date, max_date, day_span)
            if days_until_full < threshold_days:
                self.logger.error(message + ', free space dropped from %.1f%% to %.1f%%: ERROR' % (min_free*100, max_free*100))
                # display the 3 partitions with the highest usage rate in the last few days (threshold_days/2)
                # (fastestPartitions returns None when it has nothing to report)
                fast_partitions = self.fastestPartitions(
                    db_path, disk_partition, currentdate, currenttime, threshold_days / 2) or []
                for partition in fast_partitions:
                    user_name, slope, date_min, date_max, delta_days = partition
                    self.logger.info("The partition %s has used %s Giga per day for the last %s days (from %s to %s)" % (
                        user_name, slope / self._GIGA, delta_days, date_min, date_max))
            else:
                self.logger.info(message + ': OK')

        if free_space == 0:
            return
        elif free_space > threshold * self._GIGA:
            inode_usage = self.getInodeUsage(self.getPartitionFolder())
            if inode_usage:
                self.logger.error(inode_usage)
            else:
                self.logger.info("Disk usage: OK")
            return

        free_space = round(free_space / self._GIGA, 2)
        self.logger.error('Free disk space low: remaining %s G (threshold: %s G)' % (
            free_space, threshold))
        # display the 3 partitions that have the most storage capacity on the disk
        # (biggestPartitions returns None when it has nothing to report)
        big_partitions = self.biggestPartitions(db_path, currentdate, currenttime) or []
        for partition in big_partitions:
            user_name, size_partition, date_checked = partition
            # the size is converted so that it matches the "Giga" unit of the
            # message (NOTE(review): assumes disk_used*1024 is a byte count,
            # as the slope computation in fastestPartitions does)
            self.logger.info("The partition %s use %s Giga (date checked: %s)" % (
                user_name, round(size_partition / self._GIGA, 2), date_checked))

    def test(self):
        """Fail as soon as the latest result is a failure."""
        return self._test(result_count=1, failure_amount=1)

    def anomaly(self):
        """Report an anomaly when the last 3 results are all failures."""
        return self._test(result_count=3, failure_amount=3)
......@@ -6,7 +6,6 @@ from slapos.grid.promise.generic import GenericPromise
import os
import sys
import pwd
import sqlite3
import argparse
......@@ -23,50 +22,7 @@ class RunPromise(GenericPromise):
# check disk space at least every 3 minutes
self.setPeriodicity(minute=3)
def getPartitionSize(self, user, database, date, day_range):
database = Database(database, create=False, timeout=10)
try:
database.connect()
# if "du" didn't start (or didn't finish), we take the date of yesterday
result_max = database.select(
"folder",
columns = "disk_used*1024, max(datetime(date || ' ' || time))",
where = "datetime(date) >= datetime('%s', '-1 days') AND partition='%s'" % (date, user),
order = "time DESC",
limit = 1).fetchone()
if not result_max or not result_max[0]:
self.logger.info("No result from collector database in partition %s: skipped", user)
return 0
partition_size = result_max[0]
result_min = database.select(
"folder",
columns = "disk_used*1024, min(datetime(date || ' ' || time))",
where = "datetime(date || ' ' || time) >= datetime('%s', '-%s days') and partition='%s'" % (result_max[1], day_range, user),
limit = 1).fetchone()
if not result_min or (result_min == result_max) or not result_min[1]:
return None
timep = '%Y-%m-%d %H:%M:%S'
timespan = datetime.datetime.strptime(result_max[1], timep) - \
datetime.datetime.strptime(result_min[1], timep)
delta_days = timespan.total_seconds() / (3600.*24)
# if we don't have enough information (beginning)
if int(delta_days) <= 3:
self.logger.info("Not enough recent data")
return None
except sqlite3.OperationalError as e:
# if database is still locked after timeout expiration (another process is using it)
# we print warning message and try the promise at next run until max warn count
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
finally:
try:
database.close()
except Exception:
pass
return partition_size, result_max[1], result_min[0], result_min[1], delta_days
def getDiskSize(self, disk_partition, database):
database = Database(database, create=False, timeout=10)
......@@ -122,35 +78,6 @@ class RunPromise(GenericPromise):
pass
return int(disk_free)
def getDaysUntilFullPartition(self, user, disk_partition, db_path, date, time, day_range):
database = Database(db_path, create=False, timeout=10)
try:
disk_size = self.getDiskSize(disk_partition, db_path)
part_size_now, date_max, part_size_min, date_min, delta_days = self.getPartitionSize(user, db_path, date, day_range)
free_space_min = -((part_size_min - disk_size)/disk_size)
free_space_max = -((part_size_now - disk_size)/disk_size)
change = free_space_min - free_space_max
disk_free_percent = self.getFreeSpace(disk_partition, db_path, date, time)/disk_size
try:
return (disk_free_percent / (change / delta_days), date_min, part_size_min, date_max, part_size_now, delta_days)
except ZeroDivisionError as e:
# no data
return None
except sqlite3.OperationalError as e:
# if database is still locked after timeout expiration (another process is using it)
# we print warning message and try the promise at next run until max warn count
locked_message = "database is locked"
if locked_message in str(e) and \
not self.raiseOnDatabaseLocked(locked_message):
return None
raise
finally:
try:
database.close()
except Exception:
pass
def raiseOnDatabaseLocked(self, locked_message):
max_warn = 10
latest_result_list = self.getLastPromiseResultList(result_count=max_warn)
......@@ -195,7 +122,6 @@ class RunPromise(GenericPromise):
def sense(self):
# find if a disk is mounted on the path
disk_partition = ""
user = pwd.getpwuid(os.getuid()).pw_name
db_path = self.getConfig('collectordb')
check_date = self.getConfig('test-check-date')
path = os.path.join(self.getPartitionFolder(), "") + "extrafolder"
......@@ -223,7 +149,7 @@ class RunPromise(GenericPromise):
disk_partition = self.getConfig('test-disk-partition', '/dev/sda1')
else:
# get last minute
now = datetime.datetime.utcnow()
now = datetime.datetime.now()
currentdate = now.strftime('%Y-%m-%d')
currenttime = now - datetime.timedelta(minutes=1)
currenttime = currenttime.time().strftime('%H:%M')
......@@ -237,17 +163,6 @@ class RunPromise(GenericPromise):
free_space = self.getFreeSpace(disk_partition, db_path, currentdate,
currenttime)
days_until_full_tuples = self.getDaysUntilFullPartition(user, disk_partition, db_path, currentdate, currenttime, threshold_days/2)
if days_until_full_tuples is not None:
days_until_full, min_date, min_free, max_date, max_free, day_span = days_until_full_tuples
message = "The partition %s will become full in %.2f days (threshold: %.2f days), checked from %s to %s, %.2f days span" % (
user, days_until_full, threshold_days, min_date, max_date, day_span)
if days_until_full < threshold_days:
self.logger.error(message + ', free space dropped from %.1f%% to %.1f%%: ERROR' % (min_free*100, max_free*100))
else:
self.logger.info(message + ': OK')
if free_space == 0:
return
elif free_space > threshold*1024*1024*1024:
......@@ -266,4 +181,4 @@ class RunPromise(GenericPromise):
return self._test(result_count=1, failure_amount=1)
def anomaly(self):
return self._test(result_count=3, failure_amount=3)
return self._test(result_count=3, failure_amount=3)
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment