prefix-filter/scripts/Generate-median-csv.py
2022-03-18 12:17:23 +02:00

339 lines
10 KiB
Python
Executable File

#!/usr/bin/env python3
from Naming import name_dict, get_time
from typing import *
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
# import matplotlib
# import pandas as pd
import os
import sys
# https://github.com/bendichter/brokenaxes
from brokenaxes import brokenaxes
from statistics import median
USE_AVERAGE = 0
USE_MEDIAN = 1
assert (USE_AVERAGE ^ USE_MEDIAN)
def get_lines(dir: str) -> list:
# file_name = os.path.basename(dir)
f = open(dir, "r")
lines = f.readlines()
f.close()
return lines
def find_line_index(lines: list, phrase: str) -> int:
for i in range(len(lines)):
line = lines[i].strip()
if (len(line) == 0) or (line.startswith("#")):
continue
if line.startswith(phrase):
return i
return -1
def get_filter_name_from_lines(lines):
i = find_line_index(lines, "NAME")
if (i == -1):
print(lines)
assert 0
assert i != -1
name = lines[i][5:].strip()
return name
def get_all_values_from_perf_list(lines: List[str]):
def find_all_partial_matches_in_lines(lines: List[str], pattern: str):
indexes = []
for i in range(len(lines)):
if lines[i].startswith(pattern):
indexes.append(i)
return indexes
beg_list = find_all_partial_matches_in_lines(lines, "NAME")
end_list = find_all_partial_matches_in_lines(lines, "BENCH_END")
def get_performance_list(lines: List[str]):
start_line = find_line_index(lines, "BENCH_START") + 1
end_line = find_line_index(lines, "BENCH_END") - 1 # inclusive.
performance_list = [[] for _ in range(4)]
for line in lines[start_line:end_line]:
if not (len(line.strip())):
continue
temp_list0 = line.split(",")
temp_list1 = [int(i.strip()) for i in temp_list0]
assert(len(temp_list1) == 4)
for i in range(4):
performance_list[i].append(temp_list1[i])
return performance_list
assert(len(beg_list) == len(end_list))
perfs_lists = []
for i in range(len(beg_list)):
temp_lines = lines[beg_list[i]: end_list[i] + 1]
assert temp_lines[0].startswith("NAME")
assert temp_lines[-1].startswith("BENCH")
temp_perf = get_performance_list(temp_lines)
perfs_lists.append(temp_perf)
def built_temp_list(single_perf_list: List, op: int, k: int):
"""[summary]
Args:
single_perf_list (List): [description]
op (int): [description]
k (int): [description]
Returns:
[Lists of lists]: [Each cell in the list, is ]
"""
# single_perf_list[Run][Operation][Round]
# temp_l == [Operation][Round][Run(0);Run(last)]
runs = len(single_perf_list)
assert len(single_perf_list) == runs
temp_l = [single_perf_list[i][op][k] for i in range(runs)]
return temp_l
assert len(perfs_lists[0]) == 4
operations = len(perfs_lists[0])
rounds = len(perfs_lists[0][0])
def built_raw_single_list():
fl = [[0] * rounds for _ in range(operations)]
assert operations == 4
for op in range(operations - 1):
for k in range(rounds):
temp_med = built_temp_list(perfs_lists, op, k)
fl[op][k] = temp_med
return fl
med_fl_list = built_raw_single_list()
return med_fl_list
def get_median_performance_list(lines: List[str]) -> List[List[float]]:
"""Returns a List of Lists: [operation][round(load)]
Args:
lines (List[str]): [description]
Returns:
List[List[float]]: [description]
"""
def find_all_partial_matches_in_lines(lines: List[str], pattern: str):
indexes = []
for i in range(len(lines)):
if lines[i].startswith(pattern):
indexes.append(i)
return indexes
beg_list = find_all_partial_matches_in_lines(lines, "NAME")
end_list = find_all_partial_matches_in_lines(lines, "BENCH_END")
def get_performance_list(lines: List[str]):
start_line = find_line_index(lines, "BENCH_START") + 1
end_line = find_line_index(lines, "BENCH_END") - 1 # inclusive.
performance_list = [[] for _ in range(4)]
for line in lines[start_line:end_line]:
if not (len(line.strip())):
continue
temp_list0 = line.split(",")
temp_list1 = [int(i.strip()) for i in temp_list0]
assert(len(temp_list1) == 4)
for i in range(4):
performance_list[i].append(temp_list1[i])
return performance_list
assert(len(beg_list) == len(end_list))
perfs_lists = []
for i in range(len(beg_list)):
temp_lines = lines[beg_list[i]: end_list[i] + 1]
assert temp_lines[0].startswith("NAME")
assert temp_lines[-1].startswith("BENCH")
temp_perf = get_performance_list(temp_lines)
perfs_lists.append(temp_perf)
def built_temp_list(single_perf_list: List, op: int, k: int) -> List[float]:
# single_perf_list[Run][Operation][Round]
runs = len(single_perf_list)
assert len(single_perf_list) == runs
temp_l = [single_perf_list[i][op][k] for i in range(runs)]
return temp_l
assert len(perfs_lists[0]) == 4
operations = len(perfs_lists[0])
rounds = len(perfs_lists[0][0])
def built_median_single_list() -> List[List[float]]:
fl = [[0] * rounds for _ in range(operations)]
assert operations == 4
for op in range(operations - 1):
for k in range(rounds):
temp_med = median(built_temp_list(perfs_lists, op, k))
fl[op][k] = temp_med
return fl
med_fl_list = built_median_single_list()
return med_fl_list
def get_op_divisors(lines: List[str]):
# a = find_line_index(lines, "NAME")
b = find_line_index(lines, "FILTER_MAX_CAPACITY")
c = find_line_index(lines, "NUMBER_OF_LOOKUP")
# name = lines[a].split()[1]
filter_max_cap = int(lines[b].split()[1])
lookup_reps = int(lines[c].split()[1])
return filter_max_cap, lookup_reps
def get_raw_all_data(f_list: list):
lines_list = []
names_list = []
for temp_file in f_list:
temp_lines = get_lines(temp_file)
lines_list.append(temp_lines)
names_list.append(get_filter_name_from_lines(temp_lines))
perf_lists = []
for temp_lines in lines_list:
# temp_perf == [Operation][Round][Run(0);Run(last)]
temp_perf = get_all_values_from_perf_list(temp_lines)
perf_lists.append(temp_perf)
def get_all_diviate_list(l):
def ratio(x, denom):
assert(denom != 0)
return (x-denom)/denom
t_med = median(l)
fl = [ratio(i, t_med) for i in l]
return fl
flat_div_list = []
filters_num = len(perf_lists)
operation_num = len(perf_lists[0])
rounds = len(perf_lists[0][0])
for op in range(3):
for t_filter in range(filters_num):
for t_round in range(rounds):
temp_l = perf_lists[t_filter][op][t_round]
# print(op, t_filter, t_round, end=":\t")
# print(temp_l)
# if temp_l == 0:
temp_res = get_all_diviate_list(temp_l)
flat_div_list += temp_res
# print()
return flat_div_list
def final_diver(f_list: list) -> None:
flat_diviate = get_raw_all_data(f_list)
s_fd = sorted(flat_diviate)
above_05 = [i for i in s_fd if abs(i) > 0.005]
above_1 = [i for i in s_fd if abs(i) > 0.01]
r1 = len(above_1)/len(s_fd)
r05 = len(above_05)/len(s_fd)
print(
"fraction of elements that are at most 1% away from median (in thier category) \t{:<.5f}".format(1-r1))
print(
"fraction of elements that are at most 0.5% from median (in thier category)\t{:<.5f}".format(1-r05))
print("min & max diviations:", s_fd[0], s_fd[-1])
def generate_csvs(f_list: list):
lines_list = []
names_list = []
for temp_file in f_list:
temp_lines = get_lines(temp_file)
lines_list.append(temp_lines)
names_list.append(get_filter_name_from_lines(temp_lines))
perf_lists = []
for temp_lines in lines_list:
temp_perf = []
if (USE_MEDIAN):
assert(not USE_AVERAGE)
temp_perf = get_median_performance_list(temp_lines)
elif (USE_AVERAGE):
temp_perf = get_average_performance_list(temp_lines)
else:
assert(0)
perf_lists.append(temp_perf)
filter_max_cap, lookup_reps = get_op_divisors(lines_list[0])
rounds_num = len(perf_lists[0][0])
add_step = round(filter_max_cap / rounds_num)
find_step = round(lookup_reps / rounds_num)
###########################################
ins_list = [temp_perf[0] for temp_perf in perf_lists]
uni_list = [temp_perf[1] for temp_perf in perf_lists]
yes_list = [temp_perf[2] for temp_perf in perf_lists]
###########################################
curr_time = get_time()
names = ["add-med-({:}).csv", "uni-med-({:}).csv", "yes-med-({:}).csv"]
names = [nm.format(curr_time) for nm in names]
f_add = open(names[0], "a")
f_uni = open(names[1], "a")
f_yes = open(names[2], "a")
files = [f_add, f_uni, f_yes]
lists = [ins_list, uni_list, yes_list]
filters_names = [name_dict(t) for t in names_list]
size = len(filters_names)
assert size == len(ins_list)
for op in range(len(files)):
t_file = files[op]
t_list = lists[op]
for i in range(size):
t_file.write(filters_names[i] + ",")
temp_line = str(t_list[i])[1:-1]
t_file.write(temp_line + "\n")
f_add.close()
f_uni.close()
f_yes.close()
def main_helper(path):
chosen_files = os.listdir(path)
files_list = [os.path.join(path, i)
for i in chosen_files if not i.endswith(".csv")]
files_list.sort()
final_diver(files_list)
generate_csvs(files_list)
def main():
"""
sys.argv[1] = path
"""
argc: int = len(sys.argv)
if argc == 1:
path = os.path.abspath(os.getcwd())
path = os.path.join(path, "Inputs")
assert os.path.isdir(path)
# name = "bench{:}".get_time()
main_helper(path)
elif argc == 2:
path = sys.argv[1]
# name = "bench{:}".format(get_time())
main_helper(path)
else:
print("Too many arguments where given ({:})".format(argc))
main_helper("./Inputs/")