import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2, t as t_dist, norm
from ipywidgets import interact, FloatSlider, IntSlider
from IPython.display import display

# 数据表格处理
import pandas as pd
# 统计模型
import statsmodels.formula.api as smf

plt.rcParams['font.family'] = 'Noto Sans CJK SC'

n = 10

x = np.sort(np.random.randint(-10, 101, size=n))
x

array([-8,  0, 14, 16, 31, 36, 62, 84, 85, 86])

y = 5/9 * (x - 32)
y

array([-22.22222222, -17.77777778, -10.        ,  -8.88888889,
        -0.55555556,   2.22222222,  16.66666667,  28.88888889,
        29.44444444,  30.        ])

plt.figure(figsize=(7, 4))
plt.plot(x, y, '-o', color='red', markerfacecolor='red')
plt.xlabel("°F")
plt.ylabel("°C")
plt.title("华氏度与摄氏度的关系")
plt.show()

np.column_stack([x, y])

array([[ -8.        , -22.22222222],
       [  0.        , -17.77777778],
       [ 14.        , -10.        ],
       [ 16.        ,  -8.88888889],
       [ 31.        ,  -0.55555556],
       [ 36.        ,   2.22222222],
       [ 62.        ,  16.66666667],
       [ 84.        ,  28.88888889],
       [ 85.        ,  29.44444444],
       [ 86.        ,  30.        ]])

data = pd.DataFrame({"°F": x, "°C": y})
data

# 从字典创建 DataFrame
students = pd.DataFrame({
    '姓名': ['张三', '李四', '王五', '赵六', '孙七'],
    '数学': [85, 92, 78, 95, 88],
    '英语': [90, 87, 82, 91, 79],
    '物理': [76, 88, 95, 83, 92],
})
students

# 从二维列表创建
students2 = pd.DataFrame(
    [['张三', 85, 90, 76],
     ['李四', 92, 87, 88],
     ['王五', 78, 82, 95]],
    columns=['姓名', '数学', '英语', '物理']
)
students2

# 查看前几行
students.head(3)

# 查看形状（行数, 列数）
students.shape

(5, 4)

# 查看列名
students.columns.tolist()

['姓名', '数学', '英语', '物理']

# 查看数据类型
students.dtypes

姓名      str
数学    int64
英语    int64
物理    int64
dtype: object

# 快速统计摘要（仅对数值列）
students.describe()

# 选择单列（返回 Series）
students['数学']

0    85
1    92
2    78
3    95
4    88
Name: 数学, dtype: int64

# 选择多列（返回 DataFrame）
students[['姓名', '数学']]

# 按位置选择：iloc[行, 列]
students.iloc[1:3, 0:3]  # 第 2、3 行，前 3 列

# 按标签选择：loc[行标签, 列标签]
students.loc[0:2, '数学':'物理']  # 第 0~2 行，数学到物理列

# 条件筛选：数学成绩大于 85 的学生
students[students['数学'] > 85]

# 多条件筛选：数学 > 85 且英语 > 80
students[(students['数学'] > 85) & (students['英语'] > 80)]

# 添加新列：计算总分
students['总分'] = students['数学'] + students['英语'] + students['物理']
students

# 添加新列：计算平均分
students['平均分'] = students['总分'] / 3
students

# 修改现有值：将张三的数学成绩加 5
students.loc[0, '数学'] += 5
# 重新计算总分和平均分
students['总分'] = students['数学'] + students['英语'] + students['物理']
students['平均分'] = students['总分'] / 3
students

# 按平均分降序排列
students.sort_values('平均分', ascending=False)

# 按数学升序排列（原 DataFrame 不变，返回新对象）
students.sort_values('数学')

# 各科平均分
students[['数学', '英语', '物理']].mean()

数学    88.6
英语    85.8
物理    86.8
dtype: float64

# 各科最高分
students[['数学', '英语', '物理']].max()

数学    95
英语    91
物理    95
dtype: int64

# 各科标准差
students[['数学', '英语', '物理']].std()

数学    6.465292
英语    5.167204
物理    7.529940
dtype: float64

# 按行计算统计量：每个学生的最高分和最低分
scores = students[['数学', '英语', '物理']]
print('每个学生的最高分:')
print(scores.max(axis=1))
print()
print('每个学生的最低分:')
print(scores.min(axis=1))

每个学生的最高分:
0    90
1    92
2    95
3    95
4    92
dtype: int64

每个学生的最低分:
0    76
1    87
2    78
3    83
4    79
dtype: int64

# 用 apply 给每个学生评定等级
def grade(avg):
    if avg >= 90: return 'A'
    elif avg >= 80: return 'B'
    else: return 'C'

students['等级'] = students['平均分'].apply(grade)
students

# 添加班级列
students['班级'] = ['一班', '二班', '一班', '二班', '一班']
students

# 按班级分组，计算各科平均分
students.groupby('班级')[['数学', '英语', '物理']].mean()

# 按班级分组，计算人数和各科总分
students.groupby('班级').agg({
    '姓名': 'count',
    '数学': 'sum',
    '英语': 'sum',
    '物理': 'sum',
})

# 创建含缺失值的数据
data_missing = pd.DataFrame({
    '姓名': ['张三', '李四', '王五', '赵六'],
    '数学': [85, np.nan, 78, 95],
    '英语': [90, 87, np.nan, 91],
})
data_missing

# 检查缺失值
data_missing.isnull()

# 每列缺失值数量
data_missing.isnull().sum()

姓名    0
数学    1
英语    1
dtype: int64

# 删除含缺失值的行
data_missing.dropna()

# 用均值填充缺失值
filled = data_missing.copy()
filled['数学'] = filled['数学'].fillna(filled['数学'].mean())
filled['英语'] = filled['英语'].fillna(filled['英语'].mean())
filled

# DataFrame 转 NumPy 数组
arr = students[['数学', '英语', '物理']].values
print(type(arr))
arr

<class 'numpy.ndarray'>

array([[90, 90, 76],
       [92, 87, 88],
       [78, 82, 95],
       [95, 91, 83],
       [88, 79, 92]])

# NumPy 数组转 DataFrame
pd.DataFrame(arr, columns=['col1', 'col2', 'col3'])

# 写入 CSV 文件
data.to_csv('testCSVwrite.csv', index=False)

# 从 CSV 文件读取
data_again = pd.read_csv('testCSVwrite.csv')
data_again

# 写入 Excel 文件
students.to_excel('students.xlsx', index=False, sheet_name='成绩单')

# 从 Excel 文件读取
students_from_excel = pd.read_excel('students.xlsx', sheet_name='成绩单')
students_from_excel

# 写入多个工作表到同一个 Excel 文件
with pd.ExcelWriter('multi_sheet.xlsx', engine='openpyxl') as writer:
    students.to_excel(writer, sheet_name='全部学生', index=False)
    students[students['班级'] == '一班'].to_excel(writer, sheet_name='一班', index=False)
    students[students['班级'] == '二班'].to_excel(writer, sheet_name='二班', index=False)

# 读取所有工作表名称
xls = pd.ExcelFile('multi_sheet.xlsx')
print('工作表:', xls.sheet_names)

工作表: ['全部学生', '一班', '二班']

# 读取指定工作表
pd.read_excel('multi_sheet.xlsx', sheet_name='一班')

def linear_regression(x, y):
    """直接从数据计算线性回归参数"""
    n_lr = len(x)
    x0 = x - np.mean(x)
    y0 = y - np.mean(y)

    m_est = np.sum(x0 * y0) / np.sum(x0**2)          # 斜率估计
    b_est = np.mean(y) - m_est * np.mean(x)           # 截距估计
    s2_est = np.sum((m_est * x + b_est - y)**2) / (n_lr - 2)  # 噪声估计

    return b_est, m_est, s2_est

def show_noisy_analysis(noise):
    # ---------- 创建含噪声的数据 ----------
    noisy_data = data.copy()
    noisy_data["°C"] = noisy_data["°C"] + noise * np.random.randn(n)
    yy = noisy_data["°C"].values
    display(noisy_data)

    # ---------- OLS 回归 ----------
    ols = smf.ols('Q("°C") ~ Q("°F")', data=noisy_data).fit()
    print("\n普通最小二乘法 (OLS) 回归结果：")
    print(ols.summary())

    # ---------- 最小二乘法的线性代数解 ----------
    A = np.column_stack([np.ones(n), x])
    coeffs = np.linalg.lstsq(A, yy, rcond=None)[0]
    b_hat, m_hat = coeffs
    print(f"\n线性代数最小二乘解: 截距 b = {b_hat:.6f}, 斜率 m = {m_hat:.6f}")

    # ---------- 直接计算 ----------
    b_e, m_e, s2_e = linear_regression(x, yy)
    print(f"直接计算: 截距 = {b_e:.6f}, 斜率 = {m_e:.6f}, σ²ᵉ = {s2_e:.6f}")

    # ---------- 散点图 + 残差 + 最佳拟合线 ----------
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(x, yy, c='red', marker='o', s=50, label='含噪声的数据', zorder=3)
    for i in range(len(x)):
        ax.plot([x[i], x[i]], [m_hat * x[i] + b_hat, yy[i]],
                color='gray', ls='--', linewidth=1)
    ax.plot(x, m_hat * x + b_hat, color='blue', label='最佳拟合线')
    ax.plot(x, y, alpha=0.5, color='red', label='理论值')
    ax.set_xlabel("°F")
    ax.set_ylabel("°C")
    ax.set_ylim(-40, 40)
    ax.legend(loc='upper left')
    plt.tight_layout()
    plt.show()

    # ---------- 验证标准误差列 ----------
    sigma_e = np.sqrt(s2_e)
    se_b = sigma_e * np.linalg.norm(x) / np.linalg.norm(x - np.mean(x)) / np.sqrt(n)
    se_m = sigma_e / np.linalg.norm(x - np.mean(x))
    print(f"\n=== Coef. 列（回归系数） ===")
    print(f"截距 bᵉ = {b_e:.6f}")
    print(f"斜率 mᵉ = {m_e:.6f}")
    print(f"\n=== Std. Error 列（标准误差） ===")
    print(f"截距的标准误差 = {se_b:.6f}")
    print(f"斜率的标准误差 = {se_m:.6f}")
    print(f"\n=== t 列 (Coef. / Std. Error) ===")
    if se_b != 0:
        print(f"截距: {b_e:.6f} / {se_b:.6f} = {b_e / se_b:.6f}")
    if se_m != 0:
        print(f"斜率: {m_e:.6f} / {se_m:.6f} = {m_e / se_m:.6f}")

show_noisy_analysis(5.0)

普通最小二乘法 (OLS) 回归结果：
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                Q("°C")   R-squared:                       0.990
Model:                            OLS   Adj. R-squared:                  0.989
Method:                 Least Squares   F-statistic:                     798.8
Date:                Fri, 17 Apr 2026   Prob (F-statistic):           2.65e-09
Time:                        09:05:10   Log-Likelihood:                -20.832
No. Observations:                  10   AIC:                             45.66
Df Residuals:                       8   BIC:                             46.27
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -18.2101      1.063    -17.126      0.000     -20.662     -15.758
Q("°F")        0.5650      0.020     28.263      0.000       0.519       0.611
==============================================================================
Omnibus:                        0.450   Durbin-Watson:                   1.735
Prob(Omnibus):                  0.799   Jarque-Bera (JB):                0.277
Skew:                          -0.334   Prob(JB):                        0.871
Kurtosis:                       2.533   Cond. No.                         82.3
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

线性代数最小二乘解: 截距 b = -18.210070, 斜率 m = 0.564956
直接计算: 截距 = -18.210070, 斜率 = 0.564956, σ²ᵉ = 4.719215

=== Coef. 列（回归系数） ===
截距 bᵉ = -18.210070
斜率 mᵉ = 0.564956

=== Std. Error 列（标准误差） ===
截距的标准误差 = 1.063285
斜率的标准误差 = 0.019990

=== t 列 (Coef. / Std. Error) ===
截距: -18.210070 / 1.063285 = -17.126234
斜率: 0.564956 / 0.019990 = 28.262610

interact(show_noisy_analysis, noise=FloatSlider(min=0, max=1000, step=0.5, value=0.5, description='噪声:'));

interactive(children=(FloatSlider(value=0.5, description='噪声:', max=1000.0, step=0.5), Output()), _dom_classes…

def simulate(sigma, howmany):
    """向量化的模拟函数：运行 howmany 次含噪声的线性回归"""
    noise_matrix = sigma * np.random.randn(howmany, n)
    y_noisy_all = y[np.newaxis, :] + noise_matrix

    x_mean = np.mean(x)
    y_means = np.mean(y_noisy_all, axis=1)
    x0 = x - x_mean
    y0 = y_noisy_all - y_means[:, np.newaxis]

    m_ests = np.sum(x0 * y0, axis=1) / np.sum(x0**2)
    b_ests = y_means - m_ests * x_mean

    fitted = m_ests[:, np.newaxis] * x[np.newaxis, :] + b_ests[:, np.newaxis]
    s2_ests = np.sum((fitted - y_noisy_all)**2, axis=1) / (n - 2)

    return b_ests, m_ests, s2_ests

howmany = 100_000  # 为了便于看清位数，在数值常量中可以使用下划线来分段

@interact(sigma=FloatSlider(min=0, max=3, step=0.1, value=1, description='σ:'))
def show_simulations(sigma):
    b_ests, m_ests, s2_ests = simulate(sigma, howmany)

    print(f"第一次模拟: 截距={b_ests[0]:.6f}, 斜率={m_ests[0]:.6f}, "
          f"σ²ᵉ={s2_ests[0]:.6f}")

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # ---- 截距直方图 ----
    axes[0].hist(b_ests, bins=100, density=True, alpha=0.6)
    axes[0].axvline(-17.777777, color='white', linewidth=2)
    axes[0].set_title(f"截距（{howmany} 次模拟）")
    axes[0].set_xlim(-17.7777 - 3, -17.7777 + 3)
    axes[0].set_ylim(0, 1)

    # ---- 斜率直方图 ----
    axes[1].hist(m_ests, bins=100, density=True, alpha=0.6)
    axes[1].axvline(5/9, color='white', linewidth=2)
    axes[1].set_title(f"斜率（{howmany} 次模拟）")
    axes[1].set_xlim(5/9 - 0.1, 5/9 + 0.1)
    axes[1].set_ylim(0, 100)

    # ---- 残差直方图 ----
    if sigma > 0:
        scaled = s2_ests / (sigma**2 / (n - 2))
        axes[2].hist(scaled, bins=100, density=True, alpha=0.6)
        axes[2].axvline(n - 2, color='white', linewidth=4)
        xx_chi = np.linspace(0.01, max(scaled.max(), n + 5), 300)
        axes[2].plot(xx_chi, chi2.pdf(xx_chi, n - 2),
                     color='red', linewidth=4, label='χ² 分布')
        axes[2].legend()
    else:
        axes[2].text(0.5, 0.5, "σ=0 时无残差",
                     transform=axes[2].transAxes, ha='center')
    axes[2].set_title(f"残差（{howmany} 次模拟）")

    plt.tight_layout()
    plt.show()

    # ---- 统计量 ----
    print(f"\n--- 截距 ---")
    print(f"实验均值: {np.mean(b_ests):.6f}    理论值: -17.777777")
    print(f"实验标准差: {np.std(b_ests):.6f}")
    sb = sigma * np.linalg.norm(x) / np.linalg.norm(x - np.mean(x)) / np.sqrt(n)
    print(f"理论标准差: {sb:.6f}")

    print(f"\n--- 斜率 ---")
    print(f"实验均值: {np.mean(m_ests):.6f}    理论值: {5/9:.6f}")
    print(f"实验标准差: {np.std(m_ests):.6f}")
    sm_theory = sigma / np.linalg.norm(x - np.mean(x))
    print(f"理论标准差: {sm_theory:.6f}")

    print(f"\n--- 残差 ---")
    print(f"残差均值 (s²ᵉ): {np.mean(s2_ests):.6f}")
    print(f"σ²: {sigma**2:.6f}")
    print(f"残差标准差: {np.std(s2_ests):.6f}")
    if sigma > 0:
        print(f"理论标准差 σ²/√((n-2)/2): {sigma**2 / np.sqrt((n-2)/2):.6f}")

interactive(children=(FloatSlider(value=1.0, description='σ:', max=3.0), Output()), _dom_classes=('widget-inte…

def rand_t(k):
    return np.sqrt(k) * np.random.randn() / np.linalg.norm(np.random.randn(k))

@interact(k=IntSlider(min=3, max=100, value=8, description='k:'))
def show_t_distribution(k):
    # 向量化生成 t 分布样本
    numerator = np.sqrt(k) * np.random.randn(100_000)
    denominator = np.linalg.norm(np.random.randn(100_000, k), axis=1)
    samples = numerator / denominator

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(samples, bins=500, density=True, alpha=0.6, label=None)
    xx = np.linspace(-3, 3, 300)
    ax.plot(xx, t_dist.pdf(xx, k), color='red', linewidth=4, label='t 分布')
    ax.plot(xx, norm.pdf(xx), color='green', linewidth=2, label='正态分布')
    ax.set_xlim(-3, 3)
    ax.set_ylim(0, 0.4)
    ax.legend()
    ax.set_title(f"t 分布 (k={k}) 与正态分布的比较")
    plt.show()

interactive(children=(IntSlider(value=8, description='k:', min=3), Output()), _dom_classes=('widget-interact',…

	数学	英语	物理
count	5.000000	5.000000	5.00000
mean	87.600000	85.800000	86.80000
std	6.580274	5.167204	7.52994
min	78.000000	79.000000	76.00000
25%	85.000000	82.000000	83.00000
50%	88.000000	87.000000	88.00000
75%	92.000000	90.000000	92.00000
max	95.000000	91.000000	95.00000

	姓名	数学	英语
0	张三	85.0	90.0
1	李四	NaN	87.0
2	王五	78.0	NaN
3	赵六	95.0	91.0

	姓名	数学	英语
0	张三	85.0	90.000000
1	李四	86.0	87.000000
2	王五	78.0	89.333333
3	赵六	95.0	91.000000

操作	方法
创建	`pd.DataFrame(dict)`
查看前 N 行	`.head(n)`
形状	`.shape`
统计摘要	`.describe()`
选列	`df['列名']` / `df[['列1','列2']]`
按位置选	`.iloc[行, 列]`
按标签选	`.loc[行, 列]`
条件筛选	`df[条件]`
添加列	`df['新列'] = ...`
排序	`.sort_values('列名')`
分组聚合	`.groupby('列名').agg(...)`
缺失值	`.isnull()` / `.dropna()` / `.fillna()`
转 NumPy	`.values`

程序设计与计算思维¶

第 15 讲：线性模型¶

问题引入¶

准备工作¶

华氏度与摄氏度数据集¶

数据表格¶

Pandas 基础教程¶

创建 DataFrame¶

查看数据¶

选择数据¶

添加与修改列¶

排序¶

统计运算¶

apply：自定义逐行/逐列运算¶

分组（groupby）¶

缺失值处理¶

NumPy 与 Pandas 互相转换¶

Pandas 小结¶

Pandas 的数据文件读写¶

CSV 数据格式¶

将数据写入 CSV 文件¶

从 CSV 文件读取数据¶

Excel 文件的读写¶

处理含噪声的数据¶

交互式探索¶

验证回归结果¶

模型的含义¶

语法理解：`°C ~ 1 + °F`¶

蒙特卡洛模拟：大量运行含噪声模型¶

观察截距、斜率和误差的分布¶

解析线性模型输出表格¶

Coef. (系数) 列¶

Std. Error (标准误差) 列¶

t 列与假设检验¶

Pr(>|t|) 列¶

本讲小结¶

	°F	°C
0	-8	-22.222222
1	0	-17.777778
2	14	-10.000000
3	16	-8.888889
4	31	-0.555556
5	36	2.222222
6	62	16.666667
7	84	28.888889
8	85	29.444444
9	86	30.000000

	姓名	数学	英语	物理	总分	平均分
0	张三	85	90	76	251	83.666667
1	李四	92	87	88	267	89.000000
2	王五	78	82	95	255	85.000000
3	赵六	95	91	83	269	89.666667
4	孙七	88	79	92	259	86.333333

	姓名	数学	英语	物理	总分	平均分
0	张三	90	90	76	256	85.333333
1	李四	92	87	88	267	89.000000
2	王五	78	82	95	255	85.000000
3	赵六	95	91	83	269	89.666667
4	孙七	88	79	92	259	86.333333

	姓名	数学	英语	物理	总分	平均分	等级	班级
0	张三	90	90	76	256	85.333333	B	一班
1	李四	92	87	88	267	89.000000	B	二班
2	王五	78	82	95	255	85.000000	B	一班
3	赵六	95	91	83	269	89.666667	B	二班
4	孙七	88	79	92	259	86.333333	B	一班

	数学	英语	物理
班级
一班	85.333333	83.666667	87.666667
二班	93.500000	89.000000	85.500000

	姓名	数学	英语
0	False	False	False
1	False	True	False
2	False	False	True
3	False	False	False

	°F	°C
0	-8	-23.572800
1	0	-18.286569
2	14	-9.818268
3	16	-8.798184
4	31	0.954215
5	36	2.308073
6	62	13.853470
7	84	25.760709
8	85	33.154993
9	86	31.715910

	col1	col2	col3
0	90	90	76
1	92	87	88
2	78	82	95
3	95	91	83
4	88	79	92

程序设计与计算思维¶

第 15 讲：线性模型¶

问题引入¶

准备工作¶

华氏度与摄氏度数据集¶

数据表格¶

Pandas 基础教程¶

创建 DataFrame¶

查看数据¶

选择数据¶

添加与修改列¶

排序¶

统计运算¶

apply：自定义逐行/逐列运算¶

分组（groupby）¶

缺失值处理¶

NumPy 与 Pandas 互相转换¶

Pandas 小结¶

Pandas 的数据文件读写¶

CSV 数据格式¶

将数据写入 CSV 文件¶

从 CSV 文件读取数据¶

Excel 文件的读写¶

处理含噪声的数据¶

交互式探索¶

验证回归结果¶

模型的含义¶

语法理解：°C ~ 1 + °F¶

蒙特卡洛模拟：大量运行含噪声模型¶

观察截距、斜率和误差的分布¶

解析线性模型输出表格¶

Coef. (系数) 列¶

Std. Error (标准误差) 列¶

t 列与假设检验¶

Pr(>|t|) 列¶

本讲小结¶

语法理解：`°C ~ 1 + °F`¶