Commit 562b18bc authored by Leo Le Bouter

Initial commit

test
test_stderr
/env
/.vscode
/test_dir
# metadata-collect-agent
In the context of the project [GNU/Linux System files on-boot Tamper Detection System](https://www.erp5.com/group_section/forum/GNU-Linux-System-files-on-boot-Tamper-Detection-System-94DGdYfmx1), we need to create an agent that runs inside an initramfs and collects as much metadata as is useful while keeping system boot times acceptable. It must then report that metadata to a remote service for later analysis.
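How the metadata gets reported is left open here; below is a minimal sketch, assuming a plain HTTPS POST of the encoded JSON to a collection endpoint (the URL and the `report_metadata` name are hypothetical, not part of the agent):

```python
import json
import urllib.request


def report_metadata(metadata, url="https://collect.example.org/report"):
    # Hypothetical endpoint and payload shape, for illustration only.
    body = json.dumps(metadata).encode("utf-8")
    req = urllib.request.Request(
        url, data=body, headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req) as resp:
        return resp.status
```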
## Current performance properties
- Reads file system metadata (stat, xattrs such as SELinux labels, POSIX ACLs) from the main thread
- Reads and hashes file contents across multiple worker processes (as many as there are CPU cores) with the `multiprocessing` Python module
- Successfully maximizes disk I/O utilization: the disk, not the Python code, is the bottleneck (a good sign)
## Desired performance properties
- Reduce memory usage
  - Avoid storing all the collected data in memory at the same time
    - encode and output JSON as the program runs (incompatible with the current tree-like in-memory data structure; see the sketch after this list)
    - discard data after output so that memory usage stays deterministic
- Beware of stack overflows
  - The file system traversal function is currently recursive, and Python does not optimize tail calls, so it could in principle overflow the stack. But since file system paths are limited in length (is that always true? is it file system specific?), it is probably unlikely to ever happen. The sketch after this list is iterative and sidesteps the question.
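A minimal sketch of how both concerns could be addressed together, assuming an explicit-stack traversal and one JSON object per line (JSON Lines) as the output format (`walk_and_emit` is a hypothetical name, not part of the current agent):

```python
import json
import os
import stat
import sys


def walk_and_emit(root="/"):
    # Explicit stack instead of recursion: traversal depth is bounded by
    # this list, not by the interpreter's call stack.
    stack = [root]
    while stack:
        path = stack.pop()
        try:
            st = os.stat(path, follow_symlinks=False)
        except OSError:
            continue
        # Emit one JSON object per entry and discard it right away, so
        # memory only holds the paths still waiting to be visited.
        json.dump({"path": path, "stat": tuple(st)}, sys.stdout)
        sys.stdout.write("\n")
        if stat.S_ISDIR(st.st_mode):
            try:
                with os.scandir(path) as it:
                    stack.extend(entry.path for entry in it)
            except OSError:
                pass


if __name__ == "__main__":
    walk_and_emit("/")
```

The pending-path list replaces the call stack, so path length limits no longer matter for stack safety, and nothing is retained in memory after a line is written.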
#!/usr/bin/env python3
import argparse
import sys
import os
import stat
import traceback
import hashlib
import io
import multiprocessing
import codecs
from json import JSONEncoder
import psutil
import posix1e # pylibacl


def compute_hashes(entry_path):
    # Read the file once and feed every buffer to all four digests.
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    sha256 = hashlib.sha256()
    sha512 = hashlib.sha512()
    with open(entry_path, mode="rb") as f:
        while True:
            data = f.read(io.DEFAULT_BUFFER_SIZE)
            if not data:  # EOF
                break
            md5.update(data)
            sha1.update(data)
            sha256.update(data)
            sha512.update(data)
    return {"md5": md5.hexdigest(),
            "sha1": sha1.hexdigest(),
            "sha256": sha256.hexdigest(),
            "sha512": sha512.hexdigest()}


def construct_fs_tree(mp_pool=None, mp_tasks=None, cur_dict=None, path="/", dev_whitelist=None):
    # The top-level call owns the worker pool; recursive calls reuse it.
    is_first_call = mp_pool is None
    if mp_pool is None:
        mp_pool = multiprocessing.Pool()
    if mp_tasks is None:  # avoid a shared mutable default argument
        mp_tasks = []
    if cur_dict is None:
        cur_dict = {"stat": os.stat(path, follow_symlinks=False),
                    "childs": dict()}
    if dev_whitelist is not None:
        # Stay on whitelisted devices only (skips virtual file systems).
        path_stat = cur_dict["stat"]
        if path_stat.st_dev not in dev_whitelist:
            return cur_dict
    try:
        with os.scandir(path) as it:
            for entry in it:
                try:
                    entry_path = os.fsdecode(entry.path)
                    entry_name = os.fsdecode(entry.name)
                    try:
                        entry_stat = os.stat(entry_path, follow_symlinks=False)
                    except OSError:
                        entry_stat = None
                    cur_dict["childs"][entry_name] = {"stat": entry_stat,
                                                      "childs": dict(),
                                                      "xattrs": dict()}
                    try:
                        for k in os.listxattr(entry_path, follow_symlinks=False):
                            cur_dict["childs"][entry_name]["xattrs"][k] = str(
                                os.getxattr(entry_path, k, follow_symlinks=False))
                    except OSError:
                        pass
                    try:
                        cur_dict["childs"][entry_name]["posix_acls"] = codecs.decode(
                            posix1e.ACL(file=entry_path)
                            .to_any_text(options=posix1e.TEXT_ALL_EFFECTIVE),
                            "utf-8")
                    except OSError:
                        pass
                    if entry_stat is None:
                        continue
                    if stat.S_ISDIR(entry_stat.st_mode):
                        construct_fs_tree(mp_pool=mp_pool, mp_tasks=mp_tasks,
                                          cur_dict=cur_dict["childs"][entry_name],
                                          path=entry_path, dev_whitelist=dev_whitelist)
                    elif stat.S_ISREG(entry_stat.st_mode):
                        # Hash regular files asynchronously on the pool and
                        # remember where to merge each result later.
                        mp_tasks.append({"result": mp_pool.apply_async(compute_hashes, [entry_path]),
                                         "merge_into": cur_dict["childs"][entry_name]})
                    elif stat.S_ISLNK(entry_stat.st_mode):
                        cur_dict["childs"][entry_name]["symlink_target"] = os.readlink(entry_path)
                except Exception:
                    traceback.print_exc()
    except Exception:
        traceback.print_exc()
    if is_first_call:
        # Collect the hashing results and merge them into the tree.
        for task in mp_tasks:
            try:
                task["merge_into"].update(task["result"].get())
            except Exception:
                traceback.print_exc()
        mp_pool.close()
        mp_pool.join()
    return cur_dict


def main(argv):
    # Only traverse devices backing real, mounted partitions so that
    # virtual file systems (proc, sys, ...) are skipped.
    parts = psutil.disk_partitions(all=False)
    dev_whitelist = [os.stat(part.mountpoint, follow_symlinks=False).st_dev
                     for part in parts]
    tree = construct_fs_tree(path='/', dev_whitelist=dev_whitelist)
    # os.stat_result is a tuple subclass, so the encoder serializes it as
    # a JSON array.
    print(JSONEncoder(separators=(',', ':')).encode(tree))


if __name__ == "__main__":
    main(sys.argv)