Commit 640c0130 authored by Kirill Smelkov

slapos/promise/plugin/check_cpri_lock: Don't check whatever device blindly

Currently this promise is implemented by grepping the whole rf_info output
for the "HW" and "SW" strings. But this does not work correctly in the
presence of multiple CPRI devices. Imagine, for example, that one device
has CPRI lock while the other does not:

    PCIe CPRI /dev/sdr2@1:
      Hardware ID: 0x4b12
      DNA: [0x0048248a334a7054]
      Serial: ''
      FPGA revision: 2023-06-23  10:05:24
      FPGA vccint: 0.98 V
      FPGA vccaux: 1.76 V
      FPGA vccbram: 0.98 V
      FPGA temperature: 71.9 °C
      Clock tune: 0.0 ppm
      NUMA: 0
      CPRI_option: '5' (x8) lock=no                     <-- NOTE
      DMA0: TX fifo: 66.67us  Usage=16/32768 (0%)
      DMA0: RX fifo: 66.67us  Usage=16/32768 (0%)
      DMA0 Underflows: 0
      DMA0 Overflows: 0
    PCIe CPRI /dev/sdr3@1:
      Hardware ID: 0x4b12
      DNA: [0x0048248a334a7054]
      Serial: ''
      FPGA revision: 2023-06-23  10:05:24
      FPGA vccint: 0.98 V
      FPGA vccaux: 1.77 V
      FPGA vccbram: 0.98 V
      FPGA temperature: 71.7 °C
      Clock tune: 0.0 ppm
      NUMA: 0
      CPRI_option: '5' (x8) lock=HW+SW rx/tx=46.606us   <-- NOTE
        Port #0: T14=46.606us
      DMA0: TX fifo: 66.67us  Usage=16/32768 (0%)
      DMA0: RX fifo: 66.67us  Usage=16/32768 (0%)
      DMA0 Underflows: 0
      DMA0 Overflows: 0

the old code would still report "CPRI locked", as if everything were ok,
and it would do so globally, without indicating which CPRI channel is locked.

-> Fix it by adjusting check_cpri_lock to parse the rf_info text more
precisely: detect the devices listed there and determine which device has
CPRI lock and which does not.

For now this change is accompanied by the following change in
ors-amarisoft SR to keep it working:

    --- a/software/ors-amarisoft/instance-enb.jinja2.cfg
    +++ b/software/ors-amarisoft/instance-enb.jinja2.cfg
    @@ -35,7 +35,6 @@ parts =
       check-lopcomm-sync.py
       check-lopcomm-config-log.py
       check-lopcomm-stats-log.py
    -  check-cpri-lock.py
     {% endif %}
     {% if slapparameter_dict.get("dnsmasq", None) %}
       dnsmasq-service
    @@ -48,6 +47,7 @@ parts =
     {% endif %}
       monitor-base
       publish-connection-information
    +{% set extra_part_list = [] %}

     extends = {{ monitor_template }}

    @@ -688,12 +688,21 @@ config-testing = {{ slapparameter_dict.get("testing", False) }}
     config-config-log = ${lopcomm-rrh-config-template:log-output}
     config-stats-period = {{ slapparameter_dict.get("enb_stats_fetch_period", 60) }}

    -[check-cpri-lock.py]
    +{%  if ru == "lopcomm" %}
    +{%-   set cell_list = slapparameter_dict.get('cell_list', {'default': {}}) %}
    +{%-   for i, k in enumerate(cell_list) %}
    +{%-     set sfp_port = cell_list[k].get('cpri_port_number', i) %}
    +{%-     do extra_part_list.append('SFP{{sfp_port}}-cpri-lock.py') %}
    +[SFP{{sfp_port}}-cpri-lock.py]
     <= macro.promise
     promise = check_cpri_lock
     config-testing = {{ slapparameter_dict.get("testing", False) }}
    +config-sdr_dev  = {{ slapparameter_dict.get('sdr_number', 0) }}
    +config-sfp_port = {{ sfp_port }}
     config-amarisoft-rf-info-log = ${amarisoft-rf-info-template:log-output}
     config-stats-period = {{ slapparameter_dict.get("enb_stats_fetch_period", 60) }}
    +{%-  endfor %}
    +{% endif %}

     [check-rx-saturated.py]
     <= macro.promise
    @@ -702,3 +711,9 @@ config-testing = {{ slapparameter_dict.get("testing", False) }}
     config-amarisoft-stats-log = ${amarisoft-stats-template:log-output}
     config-stats-period = {{ slapparameter_dict.get("enb_stats_fetch_period", 60) }}
     config-max-rx-sample-db = {{ slapparameter_dict.get("max_rx_sample_db", 0) }}
    +
    +[buildout]
    +parts +=
    +{%- for part in extra_part_list %}
    +    {{ part }}
    +{%- endfor %}

(posted in slapos!1461)

The way the rf_info text is parsed could also be useful in the future,
e.g. to detect the FPGA revision of the boards and report their recency
status via a promise.

/cc @jhuge, @tomo, @xavier_thompson, @Daetalus
/reviewed-by @lu.xu
/reviewed-on !127
parent 7c3b240f
import errno import re
import json
import logging
import os
from dateutil import parser
from .util import JSONPromise, get_json_log_data_interval from .util import JSONPromise, get_json_log_data_interval
from zope.interface import implementer from zope.interface import implementer
...@@ -15,6 +10,7 @@ class RunPromise(JSONPromise): ...@@ -15,6 +10,7 @@ class RunPromise(JSONPromise):
super(RunPromise, self).__init__(config) super(RunPromise, self).__init__(config)
self.setPeriodicity(minute=1) self.setPeriodicity(minute=1)
self.amarisoft_rf_info_log = self.getConfig('amarisoft-rf-info-log') self.amarisoft_rf_info_log = self.getConfig('amarisoft-rf-info-log')
self.sdr_devchan = "/dev/sdr%s@%s" % (self.getConfig('sdr_dev'), self.getConfig('sfp_port'))
self.stats_period = int(self.getConfig('stats-period')) self.stats_period = int(self.getConfig('stats-period'))
self.testing = self.getConfig('testing') == "True" self.testing = self.getConfig('testing') == "True"
...@@ -23,22 +19,76 @@ class RunPromise(JSONPromise): ...@@ -23,22 +19,76 @@ class RunPromise(JSONPromise):
self.logger.info("skipping promise") self.logger.info("skipping promise")
return return
def error(msg): self.logger.error("%s: %s", self.sdr_devchan, msg)
def info(msg): self.logger.info ("%s: %s", self.sdr_devchan, msg)
data_list = get_json_log_data_interval(self.amarisoft_rf_info_log, self.stats_period * 2) data_list = get_json_log_data_interval(self.amarisoft_rf_info_log, self.stats_period * 2)
if len(data_list) < 1: if len(data_list) < 1:
self.logger.error("rf_info: stale data") error("rf_info: stale data")
return return
rf_info_text = data_list[0]['rf_info'] rf_info_text = data_list[0]['rf_info']
if "CPRI" not in rf_info_text: rf_info = self._parse_rf_info(rf_info_text)
self.logger.info("No CPRI feature") if self.sdr_devchan not in rf_info:
else: error("rf_info: no device entry")
if "HW" in rf_info_text and "SW" in rf_info_text: return
self.logger.info("CPRI locked")
else: rf_info = rf_info[self.sdr_devchan]
if "HW" not in rf_info_text: icpri = rf_info.get('CPRI_option')
self.logger.error("HW Lock is missing") if icpri is None:
if "SW" not in rf_info_text: error("no CPRI feature")
self.logger.error("SW Lock is missing") return
hw = ("HW" in icpri)
sw = ("SW" in icpri)
if not hw:
error("HW Lock is missing")
if not sw:
error("SW Lock is missing")
if hw and sw:
info("CPRI locked")
@staticmethod
def _parse_rf_info(rf_info_text):  # -> {} /dev/sdrX@Y -> {key: value}
    """Parse rf_info output into per-SDR-device key->value dictionaries.

    For example:

        TRX SDR driver 2023-09-07, API v15/18
        PCIe CPRI /dev/sdr1@2:
          FPGA vccint: 0.98 V
          FPGA vccaux: 1.77 V
        PCIe CPRI /dev/sdr3@4:
          ABC: 123
          DEF: 4567

    is parsed as {'/dev/sdr1@2': {'FPGA vccint': '0.98 V', 'FPGA vccaux': '1.77 V'},
                  '/dev/sdr3@4': {'ABC': '123', 'DEF': '4567'}}

    Parsing rules:

    - A line that does not start with a space ends the current device
      section. If it ends with " /dev/sdr<...>:" it opens a new section
      keyed by that device path; otherwise it is ignored.
    - A line that starts with a space belongs to the currently open section
      (if any). It is split on its first ':' into key and value, both
      stripped of surrounding whitespace. Deeper-indented lines are stored
      flat in the same section.
    - An indented line without any ':' (for example a whitespace-only
      line) is skipped instead of raising ValueError.
    - When a key repeats inside one section the last value wins.

    :param rf_info_text: text of rf_info output.
    :return: dict mapping device path to a dict of that device's entries.
    """
    rf_info = {}
    cur = None  # entries of the device section being filled, or None
    for line in rf_info_text.splitlines():
        if not line.startswith(' '):  # possibly start of new /dev entry
            cur = None
            m = re.search(r' (/dev/sdr[^\s]+):$', line)
            if m is not None:
                cur = {}
                rf_info[m.group(1)] = cur
            continue

        # indented line - it populates current section if there is one
        if cur is None:
            continue

        key, sep, value = line.partition(':')
        if not sep:  # no "key: value" structure - nothing to record
            continue
        cur[key.strip()] = value.strip()

    return rf_info
def test(self): def test(self):
""" """
......
...@@ -44,6 +44,47 @@ class TestCheckCpriLock(TestPromisePluginMixin): ...@@ -44,6 +44,47 @@ class TestCheckCpriLock(TestPromisePluginMixin):
super(TestCheckCpriLock, self).setUp() super(TestCheckCpriLock, self).setUp()
self.amarisoft_rf_info_log = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'amarisoft_rf_info.json.log') self.amarisoft_rf_info_log = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'amarisoft_rf_info.json.log')
rf_info = \
"""
TRX SDR driver 2023-09-07, API v15/18
PCIe CPRI /dev/sdr2@1:
Hardware ID: 0x4b12
DNA: [0x0048248a334a7054]
Serial: ''
FPGA revision: 2023-06-23 10:05:24
FPGA vccint: 0.98 V
FPGA vccaux: 1.76 V
FPGA vccbram: 0.98 V
FPGA temperature: 71.9 °C
Clock tune: 0.0 ppm
NUMA: 0
CPRI_option: '5' (x8) lock=no
DMA0: TX fifo: 66.67us Usage=16/32768 (0%)
DMA0: RX fifo: 66.67us Usage=16/32768 (0%)
DMA0 Underflows: 0
DMA0 Overflows: 0
PCIe CPRI /dev/sdr3@1:
Hardware ID: 0x4b12
DNA: [0x0048248a334a7054]
Serial: ''
FPGA revision: 2023-06-23 10:05:24
FPGA vccint: 0.98 V
FPGA vccaux: 1.77 V
FPGA vccbram: 0.98 V
FPGA temperature: 71.7 °C
Clock tune: 0.0 ppm
NUMA: 0
CPRI_option: '5' (x8) lock=HW+SW rx/tx=46.606us
Port #0: T14=46.606us
DMA0: TX fifo: 66.67us Usage=16/32768 (0%)
DMA0: RX fifo: 66.67us Usage=16/32768 (0%)
DMA0 Underflows: 0
DMA0 Overflows: 0
PCIe SDR /dev/sdr4@0:
AAA: bbb
"""
self.rf_info_data = {'message': 'rf', 'rf_info': rf_info}
def writeLog(self, data, ago=5): def writeLog(self, data, ago=5):
with open(self.amarisoft_rf_info_log, 'w') as f: with open(self.amarisoft_rf_info_log, 'w') as f:
...@@ -59,21 +100,35 @@ class TestCheckCpriLock(TestPromisePluginMixin): ...@@ -59,21 +100,35 @@ class TestCheckCpriLock(TestPromisePluginMixin):
% (RunPromise.__module__, RunPromise.__name__, kw)) % (RunPromise.__module__, RunPromise.__name__, kw))
def test_locked_ok(self): def test_locked_ok(self):
self.writeLog({'rf_info': "CPRI: x16 HW SW"}) self.writeLog(self.rf_info_data)
self.writePromise() self.writePromise(sdr_dev='3', sfp_port='1')
self.configureLauncher() self.configureLauncher()
self.launcher.run() self.launcher.run()
def test_no_lock(self): def test_no_lock(self):
self.writeLog({'rf_info': "CPRI: x16\\n"}) self.writeLog(self.rf_info_data)
self.writePromise() self.writePromise(sdr_dev='2', sfp_port='1')
self.configureLauncher()
with self.assertRaisesRegex(PromiseError, r'(?m)HW Lock is missing\n.*SW Lock is missing'):
self.launcher.run()
def test_no_device(self):
    """The promise must fail when rf_info has no section for the device.

    The promise is configured for sdr_dev='1', sfp_port='0', while the
    rf_info sample written to the log describes only /dev/sdr2@1,
    /dev/sdr3@1 and /dev/sdr4@0.
    """
    self.writeLog(self.rf_info_data)
    self.writePromise(sdr_dev='1', sfp_port='0')
    self.configureLauncher()
    # callable form of assertRaisesRegex: runs the launcher and checks the error text
    self.assertRaisesRegex(PromiseError, 'no device entry', self.launcher.run)
def test_no_cpri_entry(self):
self.writeLog(self.rf_info_data)
self.writePromise(sdr_dev='4', sfp_port='0')
self.configureLauncher() self.configureLauncher()
with self.assertRaises(PromiseError): with self.assertRaisesRegex(PromiseError, 'no CPRI feature'):
self.launcher.run() self.launcher.run()
def test_stale_data(self): def test_stale_data(self):
self.writeLog({'rf_info': "CPRI: x16 HW SW"}, ago=500) self.writeLog(self.rf_info_data, ago=500)
self.writePromise() self.writePromise(sdr_dev='3', sfp_port='1')
self.configureLauncher() self.configureLauncher()
with self.assertRaisesRegex(PromiseError, 'rf_info: stale data'): with self.assertRaisesRegex(PromiseError, 'rf_info: stale data'):
self.launcher.run() self.launcher.run()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment