2-Variance F-test Text and Graphics Function¶
License¶
F_Variance_Equality_Test(): A basic 2-variance F-Test Utility Function including textual and graphical output by William J Capehart is licensed under CC BY-NC-SA 4.0
Libraries¶
This function uses the following libraries.
- numpy: The Go-To Python Library for Arrays and Basic Math Operation
- scipy: An open-source software for mathematics, science, and engineering
- matplotlib: The standard library for basic plotting in Python
- seaborn: An extention for Matplotlib that leverages Pandas (but you can use it with other datamodels).
2-Variance F-test Function Declaration¶
#########################################
#
# F Variance Equality Text and Graphics Function
#
def F_Variance_Equality_Test(s1 = None,
s2 = None,
df1 = None,
df2 = None,
test = "two-sided",
variable_name = "",
x1_name = "x₁",
x2_name = "x₂",
alpha = 0.05):
"""
Applies an F-test for variances to two samples with tablular and graphical output
Author: WJ Capehart, South Dakota Mines.
Licensing: Creative Commons BY-NC-SA version 4.0
(https://creativecommons.org/licenses/by-nc-sa/4.0/)
Citation:
"F_Variance_Equality_Test(): A basic 2-variance F-Test Utility Function
including textual and graphical output" by William J Capehart
is licensed under Creative Commons Attribution-NonCommercial-
ShareAlike 4.0
Args:
s1 (array): Experimental Sample Standard Deviation
s2 (array): Control Sample Standard Deviation
df1 (array): Experimental Sample Degrees of Freedom
df2 (array): Control Sample Degrees of Freedom
test (string): F-test: ["two-sided","greater","less"]
variable_name (string): User Text Name for Variable
x1_name (string): Experimental Sample Name
x2_name (string): Control Sample Name
alpha (float): Experimental Sample
Returns:
F_stat (float): Calculated t-statistic
p (float): p-value
Verdict (string): Hypothesis Result
"""
#
# Libraries
#
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
#
# Test Language for Output
#
if (test == "greater"):
Ho_sym, Ha_sym = ["≤", ">"]
x1_fill = "magenta"
x2_fill = "cyan"
x1_line = "darkred"
x2_line = "darkblue"
t_color = "blue"
elif (test == "less"):
Ho_sym, Ha_sym = ["≥", "<"]
x1_fill = "cyan"
x2_fill = "magenta"
x1_line = "darkblue"
x2_line = "darkred"
t_color = "blue"
else:
Ho_sym, Ha_sym = ["=", "≠"]
x1_fill = "cyan"
x2_fill = "grey"
x1_line = "darkblue"
x2_line = "black"
t_color = "green"
Ho_Text = x1_name + " " + Ho_sym + " " + x2_name
Ha_Text = x1_name + " " + Ha_sym + " " + x2_name
#
# F statistic
#
F_stat = s1*s1/s2/s2
Effect_Size = np.max([(s1*s1),
(s2*s2)]) / \
np.min([(s1*s1),
(s2*s2)])
#
# Output from T-Test
#
P_f_stat = stats.f.cdf(x = F_stat,
dfn = df1,
dfd = df2)
#
# Get Crit T-Thresholds, and calculated p vals s (<, >, two-tail)
#
if (test == "greater"):
F_crit_hi = stats.f.ppf(q = 1-alpha,
dfn = df1,
dfd = df2)
p_calc = 1-P_f_stat
alpha_tail = "{:02}".format(1-alpha)
elif (test == "less"):
F_crit_lo = stats.f.ppf(q = alpha,
dfn = df1,
dfd = df2)
p_calc = P_f_stat
alpha_tail = "{:02}".format(1-alpha)
else:
F_crit_lo = stats.f.ppf(q = alpha/2,
dfn = df1,
dfd = df2)
F_crit_hi = stats.f.ppf(q = 1-alpha/2,
dfn = df1,
dfd = df2)
p_calc = 2*(1-P_f_stat)
alpha_high = "{:03}".format(1-alpha/2)
alpha_low = "{:03}".format(alpha/2)
#
# Verdict of t-Test given alpha
#
if (p_calc > alpha):
Verdict = "Ho cannot be rejected; " + Ho_Text
else:
Verdict = "Ho is rejected; " + Ha_Text
#
# F-Test Report
#
print("")
print("╔═══════════════════════════════════════════════════")
print("║╭──────────────────────────────────────────────")
print("║│ 2-Variance F-test Report (ɑ="+str(alpha)+")")
if (variable_name != ""):
print("║│ Variable :", variable_name )
print("║│ Ho :", Ho_Text)
print("║│ Ha :", Ha_Text)
print("║│ F :", F_stat)
print("║│ Effect Size :", Effect_Size)
if (test == "two-sided"):
print("║│ F₍₁₋½ₐ₎ :", F_crit_lo, F_crit_hi)
elif (test == "greater"):
print("║│ F₍₁₋ₐ₎ :", F_crit_hi)
else:
print("║│ F₍₁₋ₐ₎ :", F_crit_lo)
print("║│ p :", p_calc )
print("║│ Result :", Verdict )
print("║╰──────────────────────────────────────────────")
#
# Calucate p(t) for Graphics
#
p_F_stat = stats.f.pdf(x = F_stat,\
dfn = df1,
dfd = df2)
#
# X-Axis for T-Values for Plots
#
F_plotedge_hi = stats.f.ppf(q = 0.99999,
dfn = df1,
dfd = df2)
F_plotedge_lo = stats.f.ppf(q = 1-0.99999,
dfn = df1,
dfd = df2)
F_plotedge_max = np.max([F_plotedge_hi,
F_stat])
F_plotedge_min = np.min([F_plotedge_lo,
F_stat])
F_axis_plot_range = np.linspace(start = 0,
stop = F_plotedge_max,
num = 1000)
#
# Plot T-Curve
#
sns.set_theme(style = "ticks",
rc = {"axes.spines.right":False,
"axes.spines.top":False})
fig, ax2 = plt.subplots(nrows = 1,
ncols = 1,
figsize = [8/2,3])
if (variable_name != ""):
fig.suptitle(r"2-$σ_x$ F-test : "+ variable_name,fontsize="small")
else:
fig.suptitle(r"2-$σ_x$ F-test",fontsize="small")
#
# Plot t-Distribution
#
if (p_calc > alpha):
ax2.set_title(r"$H_0$: " + Ho_Text, fontsize="x-small")
else:
ax2.set_title(r"$H_a$: " + Ha_Text, fontsize="x-small")
ax2.plot(F_axis_plot_range,
stats.f.pdf(x = F_axis_plot_range,
dfn = df1,
dfd = df2),
color = "grey",
label = r"$p(F)$")
if (test == "greater"):
F_axis_greater = np.linspace(start = F_plotedge_min,
stop = F_crit_hi,
num = 1000)
ax2.fill_between(F_axis_greater,
stats.f.pdf(x = F_axis_greater,
dfn = df1,
dfd = df2),
color = t_color,
alpha = 0.33333,
label = r"$P(F)$ = "+alpha_tail)
elif (test == "less"):
F_axis_less = np.linspace(start = F_crit_lo,
stop = F_plotedge_max,
num = 1000)
ax2.fill_between(F_axis_less,
stats.f.pdf(x = F_axis_less,
dfn = df1,
dfd = df2),
alpha = 0.33333,
color = t_color,
label = r"1-$P(F)$ = "+alpha_tail)
else:
F_axis_2tail = np.linspace(start = F_crit_lo,
stop = F_crit_hi,
num = 1000)
ax2.fill_between(F_axis_2tail,
stats.f.pdf(x = F_axis_2tail,
dfn = df1,
dfd = df2),
alpha = 0.33333,
color = t_color,
label = alpha_low+" > P(F) > "+alpha_high)
ax2.plot([F_stat, F_stat],
[ -1, p_F_stat],
marker = "o",
color = "red",
label = r"F value",
linestyle = "dotted")
ax2.legend(framealpha = 0, # makes legend transparent
fontsize = "xx-small")
ax2.set_xlim(left = 0)
ax2.set_ylim(bottom = 0)
ax2.set_xlabel(r"$F$", fontsize="x-small")
ax2.set_ylabel(r"$p(F)$", fontsize="x-small")
ax2.tick_params(labelsize = "x-small")
plt.tight_layout
plt.show()
print("╚═══════════════════════════════════════════════════")
print("")
return {"F": F_stat,
"p": p_calc,
"Result": Verdict}
#
#########################################
Testing Example¶
This test leverages a dataset compiled by
De Vito, S., E. Massera, M. Piga, L. Martinotto, G. Di Francia, 2008: On field calibration of an electronic nose for benzene estimation in an urban pollution monitoring scenario, Sensors and Actuators B: Chemical, 129 (2), 750-757, doi: 10.1016/j.snb.2007.09.060.
The dataset is maintained by the Univ of California at Irvine, Machine Learning Repository (doi: 10.24432/C59K5F)
Here, the data is extracted for two Benzine samples (a training sample with 70 samples and a testing sample of 30 samples) at 0700 UTC between 2004-March-11 and 2005-April-04.
Support Libraries for Demonstration¶
#########################################
#
# Demonstration Libraries
#
import pandas as pd
#
#########################################
#########################################
#
# Demonstration Libraries
#
url_xlsx = "http://kyrill.ias.sdsmt.edu:8080/thredds/fileServer/CLASS_Examples/CEE_284_Area/Statistics/Air_Quality_Extraction_H0700.xlsx"
training = pd.read_excel(io = url_xlsx,
sheet_name = "Training"). \
set_index("Time")
testing = pd.read_excel(io = url_xlsx,
sheet_name = "Testing"). \
set_index("Time")
display(training)
display(testing)
#
#########################################
Sample | Month | Hour | C6H6 (μg/m³) | CO (mg/m³) | NOx (ppb) | NO2 (μg/m³) | T (° C) | RH (%) | AH (g/kg) | |
---|---|---|---|---|---|---|---|---|---|---|
Time | ||||||||||
2004-04-07 07:00:00 | Train | Apr | 7 | 12.935403 | 2.8 | 152.0 | 111.0 | 14.300 | 58.175 | 0.943075 |
2004-04-12 07:00:00 | Train | Apr | 7 | 0.945425 | 0.5 | 39.0 | 53.0 | 13.475 | 50.100 | 0.770567 |
2004-04-13 07:00:00 | Train | Apr | 7 | 19.123453 | 3.9 | 328.0 | 130.0 | 10.975 | 64.150 | 0.839760 |
2004-05-02 07:00:00 | Train | May | 7 | 4.755590 | 0.8 | 48.0 | 46.0 | 14.300 | 66.175 | 1.072763 |
2004-05-04 07:00:00 | Train | May | 7 | 23.037221 | 4.5 | 302.0 | 109.0 | 16.550 | 78.000 | 1.457225 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2005-03-03 07:00:00 | Train | Mar | 7 | 6.742318 | 1.4 | 262.8 | 135.5 | 2.850 | 75.575 | 0.574151 |
2005-03-04 07:00:00 | Train | Mar | 7 | 7.486889 | 1.9 | 408.9 | 138.7 | 3.125 | 77.300 | 0.598485 |
2005-03-05 07:00:00 | Train | Mar | 7 | 2.042536 | 0.8 | 167.5 | 132.2 | 4.625 | 54.575 | 0.468217 |
2005-03-21 07:00:00 | Train | Mar | 7 | 6.074366 | 1.3 | 270.2 | 112.8 | 12.800 | 68.050 | 1.002346 |
2005-03-26 07:00:00 | Train | Mar | 7 | 5.691083 | 1.1 | 174.6 | 79.7 | 16.100 | 69.500 | 1.262319 |
70 rows × 10 columns
Sample | Month | Hour | C6H6 (μg/m³) | CO (mg/m³) | NOx (ppb) | NO2 (μg/m³) | T (° C) | RH (%) | AH (g/kg) | |
---|---|---|---|---|---|---|---|---|---|---|
Time | ||||||||||
2004-04-06 07:00:00 | Test | Apr | 7 | 24.134258 | 4.5 | 329.0 | 134.0 | 15.300 | 55.225 | 0.953788 |
2004-04-27 07:00:00 | Test | Apr | 7 | 23.419668 | 4.1 | 314.0 | 122.0 | 15.900 | 48.525 | 0.870351 |
2004-05-01 07:00:00 | Test | May | 7 | 6.451626 | 1.2 | 75.0 | 67.0 | 16.875 | 66.550 | 1.268826 |
2004-05-03 07:00:00 | Test | May | 7 | 21.128692 | 3.4 | 283.0 | 101.0 | 16.825 | 62.450 | 1.186945 |
2004-05-07 07:00:00 | Test | May | 7 | 19.779412 | 3.7 | 214.0 | 106.0 | 14.900 | 56.900 | 0.958184 |
2004-05-14 07:00:00 | Test | May | 7 | 22.780588 | 3.7 | 272.0 | 140.0 | 18.475 | 49.375 | 1.039693 |
2004-05-15 07:00:00 | Test | May | 7 | 9.061727 | 1.4 | 106.0 | 68.0 | 16.025 | 47.375 | 0.856424 |
2004-05-20 07:00:00 | Test | May | 7 | 19.618987 | 2.8 | 188.0 | 97.0 | 16.600 | 55.950 | 1.048554 |
2004-06-08 07:00:00 | Test | Jun | 7 | 19.370545 | 3.3 | 309.0 | 100.0 | 19.525 | 57.725 | 1.296638 |
2004-06-13 07:00:00 | Test | Jun | 7 | 1.279695 | 0.4 | 25.0 | 36.0 | 21.375 | 47.750 | 1.200534 |
2004-06-29 07:00:00 | Test | Jun | 7 | 23.496479 | 3.4 | 228.0 | 102.0 | 24.725 | 49.475 | 1.520097 |
2004-07-08 07:00:00 | Test | Jul | 7 | 16.944141 | 2.2 | 231.0 | 115.0 | 25.875 | 37.375 | 1.228885 |
2004-07-15 07:00:00 | Test | Jul | 7 | 15.564889 | 2.3 | 168.0 | 108.0 | 21.325 | 48.350 | 1.211945 |
2004-08-11 07:00:00 | Test | Aug | 7 | 9.468788 | 1.4 | 89.0 | 42.0 | 26.800 | 60.525 | 2.100800 |
2004-08-12 07:00:00 | Test | Aug | 7 | 10.956517 | 1.7 | 87.0 | 60.0 | 26.400 | 61.725 | 2.092947 |
2004-09-19 07:00:00 | Test | Sep | 7 | 4.809205 | 1.0 | 139.0 | 62.0 | 16.850 | 66.825 | 1.272082 |
2004-09-27 07:00:00 | Test | Sep | 7 | 26.900112 | 5.5 | 799.0 | 132.0 | 15.000 | 57.700 | 0.977823 |
2004-10-28 07:00:00 | Test | Oct | 7 | 19.994179 | 3.4 | 640.0 | 88.0 | 15.725 | 76.425 | 1.355764 |
2004-10-29 07:00:00 | Test | Oct | 7 | 15.929432 | 2.9 | 366.0 | 111.0 | 20.175 | 66.925 | 1.564267 |
2004-11-14 07:00:00 | Test | Nov | 7 | 0.212798 | 0.2 | 25.0 | 21.0 | 10.175 | 41.925 | 0.520914 |
2004-11-18 07:00:00 | Test | Nov | 7 | 9.890093 | 2.1 | 531.0 | 162.0 | 6.825 | 74.500 | 0.741531 |
2004-11-29 07:00:00 | Test | Nov | 7 | 9.817125 | 2.0 | 325.0 | 90.0 | 8.725 | 83.700 | 0.945385 |
2005-01-01 07:00:00 | Test | Jan | 7 | 4.458207 | 1.4 | 168.1 | 96.7 | 3.025 | 60.700 | 0.466738 |
2005-01-20 07:00:00 | Test | Jan | 7 | 2.741115 | 1.0 | 181.9 | 104.4 | 7.450 | 55.250 | 0.573390 |
2005-02-03 07:00:00 | Test | Feb | 7 | 4.247511 | 0.6 | 215.6 | 133.9 | 1.350 | 63.125 | 0.432208 |
2005-02-21 07:00:00 | Test | Feb | 7 | 5.462115 | 1.2 | 183.3 | 123.1 | 4.200 | 80.900 | 0.674249 |
2005-02-24 07:00:00 | Test | Feb | 7 | 5.482765 | 1.5 | 289.7 | 169.5 | 3.750 | 83.775 | 0.677052 |
2005-02-26 07:00:00 | Test | Feb | 7 | 1.912104 | 1.0 | 143.7 | 108.2 | 5.725 | 54.325 | 0.502158 |
2005-02-28 07:00:00 | Test | Feb | 7 | 0.655256 | 0.4 | 88.8 | 65.5 | 2.500 | 52.700 | 0.390810 |
2005-03-10 07:00:00 | Test | Mar | 7 | 6.423963 | 1.3 | 249.1 | 100.0 | 4.325 | 76.075 | 0.639467 |
Executing the Function¶
Here we will demonstrate all three basic statistical tests, challenging the Testing sample against the Training sample.
#########################################
#
# Demonstration of Function
#
s1 = testing["C6H6 (μg/m³)"].std(ddof=1)
df1 = testing["C6H6 (μg/m³)"].count() -1
s2 = training["C6H6 (μg/m³)"].std(ddof=1)
df2 = training["C6H6 (μg/m³)"].count() -1
#
# Two-Tail Function Output
#
F_test_Ha_Testing_ne_Training = \
F_Variance_Equality_Test(s1 = s1,
s2 = s2,
df1 = df1,
df2 = df2,
test = "two-sided",
variable_name = "Benzine Concentration [μg/m³]",
x1_name = "Testing",
x2_name = "Training",
alpha = 0.05)
print("Two-Tail Function Output")
display(F_test_Ha_Testing_ne_Training)
print("")
print("")
#
# Right-Tail Function Output
#
F_test_Ha_Testing_gt_Training = \
F_Variance_Equality_Test(s1 = s1,
s2 = s2,
df1 = df1,
df2 = df2,
test = "greater",
variable_name = "Benzine Concentration [μg/m³]",
x1_name = "Testing",
x2_name = "Training",
alpha = 0.05)
print("Right-Tail Function Output")
display(F_test_Ha_Testing_gt_Training)
print("")
print("")
#
# Left-Tail Function Output
#
F_test_Ha_Testing_lt_Training = \
F_Variance_Equality_Test(s1 = s1,
s2 = s2,
df1 = df1,
df2 = df2,
test = "less",
variable_name = "Benzine Concentration [μg/m³]",
x1_name = "Testing",
x2_name = "Training",
alpha = 0.05)
print("Left-Tail Function Output")
display(F_test_Ha_Testing_lt_Training)
print("")
print("")
#
#########################################
╔═══════════════════════════════════════════════════ ║╭────────────────────────────────────────────── ║│ 2-Variance F-test Report (ɑ=0.05) ║│ Variable : Benzine Concentration [μg/m³] ║│ Ho : Testing = Training ║│ Ha : Testing ≠ Training ║│ F : 1.2946678269778165 ║│ Effect Size : 1.2946678269778165 ║│ F₍₁₋½ₐ₎ : 0.5153166002398324 1.791875001204443 ║│ p : 0.380507713074242 ║│ Result : Ho cannot be rejected; Testing = Training ║╰──────────────────────────────────────────────
╚═══════════════════════════════════════════════════ Two-Tail Function Output
{'F': 1.2946678269778165, 'p': 0.380507713074242, 'Result': 'Ho cannot be rejected; Testing = Training'}
╔═══════════════════════════════════════════════════ ║╭────────────────────────────────────────────── ║│ 2-Variance F-test Report (ɑ=0.05) ║│ Variable : Benzine Concentration [μg/m³] ║│ Ho : Testing ≤ Training ║│ Ha : Testing > Training ║│ F : 1.2946678269778165 ║│ Effect Size : 1.2946678269778165 ║│ F₍₁₋ₐ₎ : 1.6317516032199866 ║│ p : 0.190253856537121 ║│ Result : Ho cannot be rejected; Testing ≤ Training ║╰──────────────────────────────────────────────
╚═══════════════════════════════════════════════════ Right-Tail Function Output
{'F': 1.2946678269778165, 'p': 0.190253856537121, 'Result': 'Ho cannot be rejected; Testing ≤ Training'}
╔═══════════════════════════════════════════════════ ║╭────────────────────────────────────────────── ║│ 2-Variance F-test Report (ɑ=0.05) ║│ Variable : Benzine Concentration [μg/m³] ║│ Ho : Testing ≥ Training ║│ Ha : Testing < Training ║│ F : 1.2946678269778165 ║│ Effect Size : 1.2946678269778165 ║│ F₍₁₋ₐ₎ : 0.5748445005452651 ║│ p : 0.809746143462879 ║│ Result : Ho cannot be rejected; Testing ≥ Training ║╰──────────────────────────────────────────────
╚═══════════════════════════════════════════════════ Left-Tail Function Output
{'F': 1.2946678269778165, 'p': 0.809746143462879, 'Result': 'Ho cannot be rejected; Testing ≥ Training'}
#########################################
#
# Version Information Utility
#
%load_ext version_information
%version_information numpy, scipy, matplotlib, pandas, version_information
#
#########################################
Software | Version |
---|---|
Python | 3.12.8 64bit [Clang 18.1.8 ] |
IPython | 9.0.2 |
OS | macOS 15.3.1 arm64 arm 64bit |
numpy | 1.26.4 |
scipy | 1.15.2 |
matplotlib | 3.9.4 |
pandas | 2.2.3 |
version_information | 1.0.4 |
Thu Mar 13 09:05:56 2025 MDT |