TRexDataAnalysis.py revision 44972cd9
1#!/scratch/Anaconda2.4.0/bin/python
2import pandas as pd
3import numpy as np
4import matplotlib
5
6matplotlib.use('Agg')
7from matplotlib import pyplot as plt
8from matplotlib import dates as matdates
9from matplotlib import lines as matlines
10import os
11import time
12from datetime import datetime
13
"""
This module is structured to work with raw data in the following JSON format:

 {'setup_name': {'test1_name': [QUERY1, QUERY2, QUERY3],
                 'test2_name': [QUERY1, QUERY2, QUERY3]
                 },
  'setup_name2': {'test1_name': [QUERY1, QUERY2, QUERY3],
                  'test2_name': [QUERY1, QUERY2, QUERY3]
                  }
 }

 The query structure is (currently) set to this:

 (test_name, state, date, hour, minute, mpps_result, mpps_min, mpps_max, build_id), for example:

 ["syn attack - 64 bytes, single CPU", "stl", "20161226", "01", "39", "9.631898", "9.5", "11.5", "54289"]

 NOTE: the TestQuery class below reads a timestamp at index 1, the MPPS result at index 2 and the
 build id at index 3, which does not match the example above. The example may be out of date;
 TestQuery is what the code actually uses.

 It can be changed to support other query formats: simply change the TestQuery class to match your
 desired structure. The TestQuery class specifies the indexes of the data within the query tuple.

"""
35
36
class TestQuery(object):
    """Layout of a single raw query record.

    Each ``QUERY_*`` index is the position of a field inside the query
    sequence; ``QUERY_TIMEFORMAT`` is the ``strptime`` pattern of the
    timestamp field.
    """

    # Positions of the fields inside one query record.
    QUERY_TIMESTAMP = 1
    QUERY_MPPS_RESULT = 2
    QUERY_BUILD_ID = 3

    # Date format of the timestamp field in the query.
    QUERY_TIMEFORMAT = "%Y-%m-%d %H:%M:%S"
42
43
class Test(object):
    """Parsed results and summary statistics of one test on one setup.

    Attributes are filled in by :meth:`analyze_all_test_data`:

    * ``results_df`` -- DataFrame with the columns ``'Build Id'``, ``'Setup'``,
      ``<name>`` (MPPS results as floats) and ``'<name> Date'``, in that order.
    * ``stats`` -- tuple ``(avg, min, max, std, error %, number of results)``
      where ``error % = ((max - min) / avg) * 100``.
    * ``latest_result`` / ``latest_result_date`` -- value and formatted
      timestamp of the last query in the raw data.
    """

    def __init__(self, name, setup_name, end_date):
        self.name = name
        self.setup_name = setup_name
        self.end_date = end_date
        self.stats = []  # tuple
        self.results_df = []  # dataFrame
        self.latest_result = []  # float
        self.latest_result_date = ''  # string

    def analyze_all_test_data(self, raw_test_data):
        """Parse the raw queries of this test and compute its statistics.

        :param raw_test_data: iterable of query records; the field positions
            and the timestamp format are taken from ``TestQuery``.

        When ``raw_test_data`` is empty, ``results_df`` is an empty frame with
        the usual columns, every statistic is NaN (with a count of 0),
        ``latest_result`` is NaN and ``latest_result_date`` is ``''``.
        """
        date_column = self.name + ' Date'
        test_results = []
        test_dates = []
        test_build_ids = []
        for query in raw_test_data:
            time_of_query = time.strptime(query[TestQuery.QUERY_TIMESTAMP], TestQuery.QUERY_TIMEFORMAT)
            test_dates.append(time.strftime("%d-%m-%Y-%H:%M", time_of_query))
            test_results.append(float(query[TestQuery.QUERY_MPPS_RESULT]))
            test_build_ids.append(query[TestQuery.QUERY_BUILD_ID])

        # The column order is pinned explicitly (instead of relying on how
        # pandas orders dict keys) and no blanket ``dtype='str'`` is applied,
        # so the results column stays numeric for mean/min/max/std.
        test_results_df = pd.DataFrame(
            {'Build Id': test_build_ids,
             'Setup': [self.setup_name] * len(test_results),
             self.name: test_results,
             date_column: test_dates},
            columns=['Build Id', 'Setup', self.name, date_column])
        self.results_df = test_results_df

        if not test_results:
            # No queries for this test: report "no data" instead of failing
            # on the last-row lookup.
            nan = float('nan')
            self.stats = (nan, nan, nan, nan, nan, 0)
            self.latest_result = nan
            self.latest_result_date = ''
            return

        results = test_results_df[self.name]
        stats_avg = float(results.mean())
        stats_min = float(results.min())
        stats_max = float(results.max())
        # A zero average would make the relative error undefined.
        if stats_avg:
            stats_error = float(((stats_max - stats_min) / stats_avg) * 100)
        else:
            stats_error = float('nan')
        # stats = (avg_mpps, min, max, std, error, no of test_results)
        self.stats = (stats_avg, stats_min, stats_max, float(results.std()), stats_error, len(test_results))
        self.latest_result = float(results.iloc[-1])
        self.latest_result_date = str(test_results_df[date_column].iloc[-1])
83
84
class Setup(object):
    """All tests of one setup: analysis tables and plots.

    :param name: setup name, used in plot titles and output file names.
    :param end_date: end date of the report period, as a string.
    :param raw_setup_data: dictionary mapping test name -> list of raw queries.
    """

    def __init__(self, name, end_date, raw_setup_data):
        self.name = name
        self.end_date = end_date  # string of date
        self.tests = []  # list of test objects
        self.all_tests_data_table = pd.DataFrame()  # dataframe
        self.setup_trend_stats = pd.DataFrame()  # dataframe
        self.latest_test_results = pd.DataFrame()  # dataframe
        self.raw_setup_data = raw_setup_data  # dictionary
        self.test_names = list(raw_setup_data)  # list of names (a real list on Python 3 as well)

    def analyze_all_tests(self):
        """Build and analyze one ``Test`` object per test name."""
        # Start from scratch so that calling this twice does not duplicate tests.
        self.tests = []
        for test_name in self.test_names:
            t = Test(test_name, self.name, self.end_date)
            t.analyze_all_test_data(self.raw_setup_data[test_name])
            self.tests.append(t)

    def analyze_latest_test_results(self):
        """Collect the latest result of every test into ``latest_test_results``.

        The table is indexed from 1 and has the columns name | MPPS | date.
        """
        mpps_column = 'MPPS\\Core (Norm)'  # the column name contains a literal backslash
        # Columns are ordered by name rather than by integer position, which
        # only worked while pandas sorted dict keys and accepted positional
        # integers for string-labelled columns.
        self.latest_test_results = pd.DataFrame(
            {'Test Name': [test.name for test in self.tests],
             mpps_column: [test.latest_result for test in self.tests],
             'Date': [test.latest_result_date for test in self.tests]},
            columns=['Test Name', mpps_column, 'Date'],
            index=range(1, len(self.tests) + 1))

    def analyze_all_tests_stats(self):
        """Build ``setup_trend_stats``: one row of statistics per test."""
        test_names = [test.name for test in self.tests]
        all_test_stats = [test.stats for test in self.tests]
        self.setup_trend_stats = pd.DataFrame(all_test_stats, index=test_names,
                                              columns=['Avg MPPS/Core (Norm)', 'Min', 'Max', 'Std', 'Error (%)',
                                                       'Total Results'])
        self.setup_trend_stats.index.name = 'Test Name'

    def analyze_all_tests_trend(self):
        """Outer-merge the result tables of all tests into ``all_tests_data_table``.

        With no tests the table is an empty DataFrame.
        """
        from functools import reduce  # ``reduce`` is a builtin on Python 2 only

        all_tests_trend_data = [test.results_df for test in self.tests]
        if not all_tests_trend_data:
            self.all_tests_data_table = pd.DataFrame()
            return
        self.all_tests_data_table = reduce(lambda x, y: pd.merge(x, y, how='outer'), all_tests_trend_data)

    def plot_trend_graph_all_tests(self, save_path='', file_name='_trend_graph.png'):
        """Plot the results of every test over time on one graph.

        :param save_path: directory to save into; when empty nothing is written
            and the figure is left open.
        :param file_name: suffix appended to the setup name for the image file.

        When ``save_path`` is given, the rounded trend statistics are also
        written to ``<setup name>_trend_stats.csv`` (if there are any).
        Tests without results are skipped; if nothing can be plotted the
        method returns without creating any output.
        """
        time_format1 = '%d-%m-%Y-%H:%M'
        time_format2 = '%Y-%m-%d-%H:%M'
        start_date = None
        for test in self.tests:
            # Columns are looked up by name instead of by position.
            test_data = [float(value) for value in test.results_df[test.name].tolist()]
            test_time_stamps = test.results_df[test.name + ' Date'].tolist()
            if not test_data:
                continue
            # NOTE: the x-axis label ends up using the first timestamp of the
            # last test plotted.
            start_date = test_time_stamps[0]
            # Extend the line flat up to the end of the report period.
            test_time_stamps.append(self.end_date + '-23:59')
            test_data.append(test_data[-1])
            float_test_time_stamps = []
            for ts in test_time_stamps:
                try:
                    parsed = datetime.strptime(ts, time_format1)
                except ValueError:
                    # Fall back to the year-first layout (presumably the one
                    # end_date arrives in -- confirm with the caller).
                    parsed = datetime.strptime(ts, time_format2)
                float_test_time_stamps.append(matdates.date2num(parsed))
            plt.plot_date(x=float_test_time_stamps, y=test_data, label=test.name, fmt='.-', xdate=True)
            plt.legend(fontsize='small', loc='best')
        if start_date is None:
            return
        plt.ylabel('MPPS/Core (Norm)')
        plt.title('Setup: ' + self.name)
        # Booleans instead of 'off': newer matplotlib treats any non-empty
        # string as true, which would leave the ticks switched on.
        plt.tick_params(
            axis='x',
            which='both',
            bottom=False,
            top=False,
            labelbottom=False)
        # start_date[:-6] drops the trailing "-HH:MM".
        plt.xlabel('Time Period: ' + start_date[:-6] + ' - ' + self.end_date)
        if save_path:
            plt.savefig(os.path.join(save_path, self.name + file_name))
            if not self.setup_trend_stats.empty:
                (self.setup_trend_stats.round(2)).to_csv(os.path.join(save_path, self.name +
                                                                      '_trend_stats.csv'))
            plt.close('all')

    def plot_latest_test_results_bar_chart(self, save_path='', img_file_name='_latest_test_runs.png',
                                           stats_file_name='_latest_test_runs_stats.csv'):
        """Plot the latest result of every test as a bar chart.

        :param save_path: directory to save into; when empty nothing is written.
        :param img_file_name: suffix appended to the setup name for the image.
        :param stats_file_name: suffix appended to the setup name for the CSV
            holding the rounded latest results.

        All figures are closed before returning.
        """
        colors_for_bars = ['b', 'g', 'r', 'c', 'm', 'y']
        # Plot only the MPPS data, selected by column name. DataFrame.plot
        # opens its own figure, so no figure is created beforehand.
        self.latest_test_results[['MPPS\\Core (Norm)']].plot(kind='bar', legend=False,
                                                              color=colors_for_bars)
        plt.xticks(rotation='horizontal')
        plt.xlabel('Index of Tests')
        plt.ylabel('MPPS/Core (Norm)')
        plt.title("Test Runs for Setup: " + self.name)
        if save_path:
            plt.savefig(os.path.join(save_path, self.name + img_file_name))
            (self.latest_test_results.round(2)).to_csv(
                os.path.join(save_path, self.name + stats_file_name))
        plt.close('all')

    def analyze_all_setup_data(self):
        """Run every analysis step, in the order the steps depend on each other."""
        self.analyze_all_tests()
        self.analyze_latest_test_results()
        self.analyze_all_tests_stats()
        self.analyze_all_tests_trend()

    def plot_all(self, save_path=''):
        """Produce the latest-results bar chart and the trend graph."""
        self.plot_latest_test_results_bar_chart(save_path)
        self.plot_trend_graph_all_tests(save_path)
190
191
def latest_runs_comparison_bar_chart(setup_name1, setup_name2, setup1_latest_result, setup2_latest_result,
                                     save_path=''
                                     ):
    """Plot the latest results of two setups side by side as a bar chart.

    :param setup_name1: label used for the first setup's result column.
    :param setup_name2: label used for the second setup's result column.
    :param setup1_latest_result: DataFrame whose first two columns are the
        test name and the MPPS result.
    :param setup2_latest_result: DataFrame whose first three columns are the
        test name, the MPPS result and the date.
    :param save_path: directory to save into; when given, the chart is written
        to ``_comparison.png`` and the rounded table to
        ``_comparison_stats_table.csv``. When empty nothing is written.

    Only tests present in both setups are compared (inner merge on the test
    name).
    """
    # Select by position with .iloc (plain df[[0, 1]] looks the integers up as
    # column labels) and copy, so renaming the columns never touches the
    # caller's frames.
    s1_res = setup1_latest_result.iloc[:, [0, 1]].copy()  # column0 is test name, column1 is MPPS\Core
    s2_res = setup2_latest_result.iloc[:, [0, 1, 2]].copy()  # column0 is test name, column1 is MPPS\Core, column2 is Date
    s1_res.columns = ['Test Name', setup_name1]
    s2_res.columns = ['Test Name', setup_name2, 'Date']
    compare_dframe = pd.merge(s1_res, s2_res, on='Test Name')
    compare_dframe.plot(kind='bar')
    plt.legend(fontsize='small', loc='best')
    plt.xticks(rotation='horizontal')
    plt.xlabel('Index of Tests')
    plt.ylabel('MPPS/Core (Norm)')
    plt.title("Comparison between " + setup_name1 + " and " + setup_name2)
    if save_path:
        plt.savefig(os.path.join(save_path, "_comparison.png"))
        compare_dframe = compare_dframe.round(2)
        compare_dframe.to_csv(os.path.join(save_path, '_comparison_stats_table.csv'))
210
# WARNING: if the file _detailed_table.csv already exists in save_path, create_all_data() deletes it
# before writing a new one, to prevent data from accumulating across runs.
212
213
def create_all_data(ga_data, end_date, save_path='', detailed_test_stats=''):
    """Analyze and plot every setup found in ``ga_data``.

    :param ga_data: dictionary mapping setup name -> {test name: [queries]}.
    :param end_date: end date of the report period, as a string.
    :param save_path: directory the plots and CSV files are written to.
    :param detailed_test_stats: when truthy, the merged per-test tables of all
        setups are written to ``_detailed_table.csv`` in ``save_path``; an
        existing file of that name is deleted first.

    A comparison chart between the setups ``'trex07'`` and ``'trex08'`` is
    produced when both are present in ``ga_data``; otherwise it is skipped.
    """
    all_setups = {}
    all_setups_data = []
    for setup_name in ga_data:
        s = Setup(setup_name, end_date, ga_data[setup_name])
        s.analyze_all_setup_data()
        s.plot_all(save_path)
        all_setups_data.append(s.all_tests_data_table)
        all_setups[setup_name] = s

    if detailed_test_stats:
        detailed_table_path = os.path.join(save_path, '_detailed_table.csv')
        # Remove a stale table even when there is nothing new to write.
        if os.path.exists(detailed_table_path):
            os.remove(detailed_table_path)
        if all_setups_data:
            # pd.concat replaces DataFrame.append, which newer pandas no
            # longer provides.
            all_setups_data_dframe = pd.concat(all_setups_data)
            all_setups_data_dframe.to_csv(detailed_table_path)

    # The comparison is only meaningful when both hard-coded setups were
    # analyzed; skip it instead of failing with a KeyError after all the
    # other output has already been produced.
    if 'trex07' in all_setups and 'trex08' in all_setups:
        trex07setup = all_setups['trex07']
        trex08setup = all_setups['trex08']
        latest_runs_comparison_bar_chart('Mellanox ConnectX-4',
                                         'Intel XL710', trex07setup.latest_test_results,
                                         trex08setup.latest_test_results,
                                         save_path=save_path)
238