Files
OS-LABS/3/exporter.py
T
2026-05-12 19:13:20 +07:00

708 lines
20 KiB
Python

#!/usr/bin/env python3
"""
Single-file Lab3 exporter.
What it does:
1. Runs benchmark series by recursion depth.
2. Runs benchmark series by min_size threshold.
3. Runs one logged execution with --log.
4. Draws old benchmark graphs.
5. Draws old timeline/depth histogram graphs.
6. Adds DFS process-tree graph:
each process has its children directly below it;
only after a whole subtree is drawn, the next sibling is drawn.
Generated output:
out/benchmark_depth.csv
out/benchmark_min_size.csv
out/logs/timeline.log
out/pics/time_by_depth.png
out/pics/speedup_by_depth.png
out/pics/process_count_by_depth.png
out/pics/time_by_min_size.png
out/pics/timeline.png
out/pics/depth_hist.png
out/pics/process_tree_dfs.png
Usage:
python3 exporter.py
python3 exporter.py --bin ./lab3 --out out
python3 exporter.py --fast
python3 exporter.py --skip-run
"""
from __future__ import annotations
import argparse
import csv
import re
import shutil
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
STAT_RE = re.compile(
r"STAT:.*size=(\d+).*depth=(\d+).*min_size=(\d+).*processes=(\d+).*valid=(\d+).*time=([\d.]+)"
)
EVENT_RE = re.compile(
r"(START|END) PID=(\d+) PPID=(\d+) depth=(\d+) size=(\d+) time=([\d.]+)"
)
def seed_for(size: int, depth: int, min_size: int, salt: int) -> int:
# Для каждой точки используется свой seed, поэтому вход всегда случайный,
# но результаты можно воспроизвести.
return 2026 + salt * 1_000_003 + size * 17 + depth * 1009 + min_size * 31
def run_once(
bin_path: str,
size: int,
depth: int,
min_size: int,
seed: int,
log_path: Path | None = None,
) -> Dict[str, int | float | str]:
cmd = [
bin_path,
"--size",
str(size),
"--depth",
str(depth),
"--min-size",
str(min_size),
"--seed",
str(seed),
]
if log_path is not None:
cmd.append("--log")
p = subprocess.run(
cmd,
stdout=subprocess.PIPE if log_path is not None else subprocess.DEVNULL,
stderr=subprocess.PIPE,
text=True,
)
if log_path is not None:
log_path.parent.mkdir(parents=True, exist_ok=True)
log_path.write_text(p.stdout, encoding="utf-8")
if p.returncode != 0:
raise RuntimeError(
"Command failed:\n"
+ " ".join(cmd)
+ "\n\nSTDOUT:\n"
+ (p.stdout or "")
+ "\nSTDERR:\n"
+ (p.stderr or "")
)
m = STAT_RE.search(p.stderr)
if not m:
raise RuntimeError(f"STAT not found:\n{p.stderr}")
if m.group(5) != "1":
raise RuntimeError(f"sort validation failed:\n{p.stderr}")
return {
"size": int(m.group(1)),
"depth": int(m.group(2)),
"min_size": int(m.group(3)),
"processes": int(m.group(4)),
"valid": int(m.group(5)),
"time": float(m.group(6)),
"seed": seed,
"logfile": str(log_path) if log_path is not None else "",
}
def save_csv(path: Path, rows: List[dict], header: List[str]) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("w", encoding="utf-8", newline="") as f:
w = csv.DictWriter(f, fieldnames=header)
w.writeheader()
for row in rows:
w.writerow({key: row.get(key, "") for key in header})
def read_csv(path: Path) -> List[dict]:
result = []
with path.open("r", encoding="utf-8", newline="") as f:
for row in csv.DictReader(f):
converted = dict(row)
for key in ["size", "depth", "min_size", "processes", "valid", "seed"]:
if key in converted and converted[key] != "":
converted[key] = int(converted[key])
if "time" in converted and converted["time"] != "":
converted["time"] = float(converted["time"])
result.append(converted)
return result
def save_plot(path: Path) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
plt.tight_layout()
plt.savefig(path, dpi=140)
plt.close()
def clean_output(out_dir: Path) -> None:
if out_dir.exists():
shutil.rmtree(out_dir)
(out_dir / "pics").mkdir(parents=True, exist_ok=True)
(out_dir / "logs").mkdir(parents=True, exist_ok=True)
def plot_depth_scaling(
bin_path: str,
out_dir: Path,
fast: bool = False,
skip_run: bool = False,
) -> List[dict]:
pics = out_dir / "pics"
csv_path = out_dir / "benchmark_depth.csv"
if skip_run:
rows = read_csv(csv_path)
else:
if fast:
depths = list(range(10))
sizes = [20_000, 50_000]
min_size = 2048
else:
# ВАЖНО: на графиках по глубине ровно 30 точек по оси X: 0..29.
#
# depth идет до 29, но реально число процессов не взорвется бесконечно,
# потому что дальнейшее деление останавливает min_size.
depths = list(range(30))
# Несколько размеров дают несколько линий на одном графике.
sizes = [50_000, 100_000, 200_000]
min_size = 4096
rows = []
for size in sizes:
for d in depths:
seed = seed_for(size, d, min_size, salt=1)
r = run_once(bin_path, size, d, min_size, seed)
row = {**r, "seed": seed}
rows.append(row)
print(
f"depth_scaling: "
f"size={size} depth={d} min_size={min_size} "
f"seed={seed} processes={r['processes']} time={r['time']:.6f}",
flush=True,
)
save_csv(
csv_path,
rows,
[
"size",
"depth",
"min_size",
"seed",
"processes",
"valid",
"time",
"logfile",
],
)
plt.figure(figsize=(12, 6))
for size in sorted(set(r["size"] for r in rows)):
cur = [r for r in rows if r["size"] == size]
cur.sort(key=lambda r: r["depth"])
plt.plot(
[r["depth"] for r in cur],
[r["time"] for r in cur],
marker="o",
label=f"N={size}",
)
plt.xlabel("Глубина порождения процессов")
plt.ylabel("Время, сек")
plt.title("Зависимость времени сортировки от глубины fork-рекурсии")
plt.grid(True)
plt.legend()
save_plot(pics / "time_by_depth.png")
plt.figure(figsize=(12, 6))
for size in sorted(set(r["size"] for r in rows)):
cur = [r for r in rows if r["size"] == size]
cur.sort(key=lambda r: r["depth"])
base_time = cur[0]["time"]
speedup = [base_time / r["time"] if r["time"] > 0 else 0 for r in cur]
plt.plot(
[r["depth"] for r in cur],
speedup,
marker="s",
label=f"N={size}",
)
plt.xlabel("Глубина порождения процессов")
plt.ylabel("Ускорение относительно depth=0")
plt.title("Ускорение при использовании процессов")
plt.grid(True)
plt.legend()
save_plot(pics / "speedup_by_depth.png")
plt.figure(figsize=(12, 6))
for size in sorted(set(r["size"] for r in rows)):
cur = [r for r in rows if r["size"] == size]
cur.sort(key=lambda r: r["depth"])
plt.plot(
[r["depth"] for r in cur],
[r["processes"] for r in cur],
marker="^",
label=f"N={size}",
)
plt.xlabel("Глубина порождения процессов")
plt.ylabel("Количество процессов")
plt.title("Размер дерева процессов")
plt.grid(True)
plt.legend()
save_plot(pics / "process_count_by_depth.png")
return rows
def plot_threshold_effect(
bin_path: str,
out_dir: Path,
fast: bool = False,
skip_run: bool = False,
) -> List[dict]:
pics = out_dir / "pics"
csv_path = out_dir / "benchmark_min_size.csv"
if skip_run:
rows = read_csv(csv_path)
else:
size = 200_000
depth = 5
if fast:
# Быстрый режим: меньше точек.
min_sizes = [round(2 ** (7 + i * (8 / 14))) for i in range(15)]
else:
# ВАЖНО: на графике по min_size ровно 30 точек по оси X.
#
# 30 значений порога от 128 до 131072, примерно равномерно по log2-шкале.
min_sizes = [round(2 ** (7 + i * (10 / 29))) for i in range(30)]
rows = []
for m in min_sizes:
seed = seed_for(size, depth, m, salt=2)
r = run_once(bin_path, size, depth, m, seed)
row = {**r, "seed": seed}
rows.append(row)
print(
f"min_size_effect: "
f"size={size} depth={depth} min_size={m} "
f"seed={seed} processes={r['processes']} time={r['time']:.6f}",
flush=True,
)
save_csv(
csv_path,
rows,
[
"size",
"depth",
"min_size",
"seed",
"processes",
"valid",
"time",
"logfile",
],
)
rows.sort(key=lambda r: r["min_size"])
plt.figure(figsize=(12, 6))
plt.plot(
[r["min_size"] for r in rows],
[r["time"] for r in rows],
marker="o",
)
plt.xscale("log", base=2)
plt.xlabel("Минимальный размер части для fork")
plt.ylabel("Время, сек")
plt.title("Влияние порога min_size на производительность")
plt.grid(True)
save_plot(pics / "time_by_min_size.png")
return rows
def parse_events(log_path: Path) -> List[dict]:
events = defaultdict(dict)
if not log_path.exists():
return []
with log_path.open("r", encoding="utf-8", errors="ignore") as f:
for line in f:
m = EVENT_RE.search(line)
if not m:
continue
typ, pid, ppid, depth, size, t = m.groups()
key = (int(pid), int(depth), int(size))
events[key][typ] = float(t)
events[key]["pid"] = int(pid)
events[key]["ppid"] = int(ppid)
events[key]["depth"] = int(depth)
events[key]["size"] = int(size)
rows = []
for v in events.values():
if "START" in v and "END" in v:
rows.append(v)
rows.sort(key=lambda r: (r["START"], r["depth"], r["pid"]))
return rows
def plot_timeline(log_path: Path, out_dir: Path) -> None:
pics = out_dir / "pics"
rows = parse_events(log_path)
# Exclude root process: depth=0.
rows = [r for r in rows if r["depth"] != 0]
if not rows:
print("В логе нет полных START/END событий без корневого процесса depth=0")
return
base = log_path.stem
t0 = min(r["START"] for r in rows)
# Временная диаграмма: видно параллельность и время жизни дочерних процессов.
plt.figure(figsize=(12, max(5, len(rows) * 0.35)))
for y, r in enumerate(rows):
start = r["START"] - t0
end = r["END"] - t0
plt.plot([start, end], [y, y], linewidth=5)
plt.text(
end,
y,
f" pid={r['pid']} d={r['depth']} n={r['size']}",
va="center",
fontsize=8,
)
plt.xlabel("Время от старта первого дочернего процесса, сек")
plt.ylabel("Дочерние процессы / задачи сортировки")
plt.title(f"Временная диаграмма дочерних процессов: {base}")
plt.grid(True)
save_plot(pics / "timeline.png")
# Гистограмма глубин без depth=0.
by_depth = defaultdict(int)
for r in rows:
by_depth[r["depth"]] += 1
plt.figure(figsize=(8, 5))
xs = sorted(by_depth)
plt.bar(xs, [by_depth[x] for x in xs])
plt.xlabel("Глубина рекурсии")
plt.ylabel("Количество дочерних процессов")
plt.title(f"Распределение дочерних процессов по глубине: {base}")
plt.grid(True, axis="y")
save_plot(pics / "depth_hist.png")
def plot_process_tree_dfs(
log_path: Path, out_dir: Path, hide_root: bool = False
) -> None:
"""
Draw process tree in DFS order.
Root process depth=0 is always excluded.
If process 2 has children 3 and 4, and process 3 has children 5 and 6,
the order is:
2
3
5
6
4
Only after the whole subtree of process 2 is drawn, the next sibling is drawn.
"""
pics = out_dir / "pics"
rows = parse_events(log_path)
if not rows:
return
# Always exclude root process: depth=0.
rows = [r for r in rows if r["depth"] != 0]
if not rows:
return
by_pid = {r["pid"]: r for r in rows}
children = defaultdict(list)
for r in rows:
parent_pid = r["ppid"]
if parent_pid in by_pid:
children[parent_pid].append(r["pid"])
for parent_pid in children:
children[parent_pid].sort(key=lambda pid: by_pid[pid]["START"])
# These are direct children of hidden root or processes whose parent is not present.
roots = [r["pid"] for r in rows if r["ppid"] not in by_pid]
roots.sort(key=lambda pid: by_pid[pid]["START"])
ordered = []
def dfs(pid: int, level: int) -> None:
ordered.append((pid, level))
for child_pid in children.get(pid, []):
dfs(child_pid, level + 1)
for root_pid in roots:
dfs(root_pid, 0)
if not ordered:
return
y_by_pid = {pid: y for y, (pid, _) in enumerate(ordered)}
t0 = min(by_pid[pid]["START"] for pid, _ in ordered)
plt.figure(figsize=(14, max(5, len(ordered) * 0.45)))
for y, (pid, level) in enumerate(ordered):
r = by_pid[pid]
start = r["START"] - t0
end = r["END"] - t0
plt.plot([start, end], [y, y], linewidth=5)
indent = " " * level
label = f"{indent}pid={pid} d={r['depth']} n={r['size']}"
plt.text(
end,
y,
" " + label,
va="center",
fontsize=8,
)
parent_pid = r["ppid"]
if parent_pid in y_by_pid:
parent_y = y_by_pid[parent_pid]
parent_start = by_pid[parent_pid]["START"] - t0
plt.plot(
[start, start],
[parent_y, y],
linewidth=1,
linestyle="--",
)
plt.plot(
[parent_start, start],
[parent_y, parent_y],
linewidth=1,
linestyle="--",
)
plt.gca().invert_yaxis()
plt.xlabel("Время от старта первого дочернего процесса, сек")
plt.ylabel("Дерево дочерних процессов в DFS-порядке")
plt.title("Дерево процессов без корня: потомки расположены сразу под родителем")
plt.grid(True)
save_plot(pics / "process_tree_dfs.png")
def run_timeline_case(
bin_path: str,
out_dir: Path,
size: int,
depth: int,
min_size: int,
) -> Path:
log_path = out_dir / "logs" / "timeline.log"
seed = seed_for(size, depth, min_size, salt=5)
r = run_once(
bin_path=bin_path,
size=size,
depth=depth,
min_size=min_size,
seed=seed,
log_path=log_path,
)
print(
f"timeline: "
f"size={size} depth={depth} min_size={min_size} "
f"seed={seed} processes={r['processes']} time={r['time']:.6f} "
f"log={log_path}",
flush=True,
)
return log_path
def generate_report(out_dir: Path) -> None:
report = out_dir / "REPORT.md"
text = """# Lab3 benchmark report
Скрипт `exporter.py` запускает серию тестов и строит графики.
## CSV
- `benchmark_depth.csv` — серия по глубине рекурсии.
- `benchmark_min_size.csv` — серия по порогу локальной сортировки.
## Графики
- `pics/time_by_depth.png` — зависимость времени сортировки от глубины.
- `pics/speedup_by_depth.png` — ускорение относительно `depth=0`.
- `pics/process_count_by_depth.png` — количество процессов.
- `pics/time_by_min_size.png` — влияние `min_size`.
- `pics/timeline.png` — временная диаграмма процессов.
- `pics/depth_hist.png` — распределение процессов по глубине.
- `pics/process_tree_dfs.png` — дерево процессов в DFS-порядке: потомки идут сразу под родителем.
## Запуск
```bash
python3 exporter.py --bin ./lab3 --out out
Быстрый режим:
python3 exporter.py --bin ./lab3 --out out --fast
"""
report.write_text(text, encoding="utf-8")
def main() -> int:
parser = argparse.ArgumentParser(description="Single-file Lab3 benchmark exporter")
parser.add_argument("--bin", default="./lab3", help="Путь к бинарнику lab3")
parser.add_argument("--out", default="out", help="Каталог вывода")
parser.add_argument("--fast", action="store_true", help="Быстрый режим проверки")
parser.add_argument(
"--skip-run",
action="store_true",
help="Не запускать benchmark, а построить графики из существующих CSV",
)
parser.add_argument(
"--hide-root",
action="store_true",
help="Скрыть корневой процесс на DFS-графике дерева",
)
parser.add_argument("--timeline-size", type=int, default=8192)
parser.add_argument("--timeline-depth", type=int, default=3)
parser.add_argument("--timeline-min-size", type=int, default=64)
args = parser.parse_args()
bin_path = args.bin
out_dir = Path(args.out)
if not args.skip_run:
clean_output(out_dir)
else:
(out_dir / "pics").mkdir(parents=True, exist_ok=True)
(out_dir / "logs").mkdir(parents=True, exist_ok=True)
plot_depth_scaling(
bin_path=bin_path,
out_dir=out_dir,
fast=args.fast,
skip_run=args.skip_run,
)
plot_threshold_effect(
bin_path=bin_path,
out_dir=out_dir,
fast=args.fast,
skip_run=args.skip_run,
)
log_path = out_dir / "logs" / "timeline.log"
if not args.skip_run:
log_path = run_timeline_case(
bin_path=bin_path,
out_dir=out_dir,
size=args.timeline_size,
depth=args.timeline_depth,
min_size=args.timeline_min_size,
)
if log_path.exists():
plot_timeline(log_path, out_dir)
plot_process_tree_dfs(log_path, out_dir, hide_root=args.hide_root)
else:
print(f"Timeline log not found: {log_path}", file=sys.stderr)
generate_report(out_dir)
print(f"Графики сохранены в {out_dir / 'pics'}.")
print(f"CSV сохранены в {out_dir}.")
print(f"Лог сохранен в {out_dir / 'logs' / 'timeline.log'}.")
print(f"Отчет сохранен в {out_dir / 'REPORT.md'}.")
return 0
if __name__ == "__main__":
raise SystemExit(main())