深入探索机器学习教程,本文全面覆盖从基础概念到实战应用,包括监督、非监督和强化学习分类,以及Python编程基础、数据预处理、常用算法实现与评估优化,带你从入门到掌握机器学习实战技能。
机器学习是一门关于让计算机从数据中学习和改进的学科。它通过算法使计算机系统能够自动改进,而无需进行显式编程。
机器学习主要分为三大类:监督学习(从带标签的数据中学习)、非监督学习(从无标签的数据中发现结构)和强化学习(通过与环境交互获得的奖励来学习)。
# Load a CSV file, print a quick summary of it, and plot one column.
import pandas as pd
import matplotlib.pyplot as plt

# Load the data. 'data.csv' is a placeholder path; point it at a real file.
df = pd.read_csv('data.csv')
print(df.head())

# Show summary statistics for the loaded frame.
print(df.describe())

# Plot a histogram of one column. 'column_name' is a placeholder and must
# be replaced with a column that exists in the loaded file.
df['column_name'].hist()
plt.show()
# End-to-end walkthrough: build a small DataFrame, clean it, split it, then
# fit and evaluate a regression model and a KNN classifier.

# Getting started in a Jupyter Notebook
from IPython.display import display

try:
    # Legacy import path kept for compatibility with the original tutorial.
    # It may be missing on current IPython installs, so fall back instead
    # of aborting the whole script on an ImportError.
    from IPython.html.widgets import interact
except ImportError:
    interact = None

# Basic data handling and visualization
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    mean_squared_error,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Data cleaning and tidying.
# Twenty rows with two balanced classes, plus one missing value in 'b' so
# that the imputation step below has something to do. The frame is sized so
# that the 5-fold examples later in this file have enough samples per class.
data = pd.DataFrame({
    'a': list(range(1, 21)),
    'b': [None if v == 7 else 10.0 * v for v in range(1, 21)],
    'c': [1] * 10 + [2] * 10,
})

# Fill missing values in 'b' with the column median. This runs BEFORE
# dropna(); otherwise the rows would already be gone and the fill would be
# a no-op. Plain assignment is used instead of a chained inplace fill.
data['b'] = data['b'].fillna(data['b'].median())

# Drop any rows that still contain missing values in other columns.
data = data.dropna()

# Type conversion: treat the label column as categorical.
data['c'] = data['c'].astype('category')
display(data)

# Train/test split
X = data.drop('c', axis=1)
y = data['c']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model evaluation: linear regression.
# The regressor outputs continuous values, so it is scored with mean squared
# error only; accuracy_score rejects continuous predictions. The categorical
# label is cast to int so the regression target is explicitly numeric.
lr = LinearRegression()
lr.fit(X_train, y_train.astype(int))
predictions = lr.predict(X_test)
print(f"Mean Squared Error: {mean_squared_error(y_test.astype(int), predictions)}")

# Model evaluation: K-nearest-neighbours classifier, scored with accuracy.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
# Standardize the numeric feature columns of `data`.
from sklearn.preprocessing import StandardScaler

# Feature selection: the numeric columns of the `data` frame built earlier
# in this file ('c' is the label and is left out).
selected_features = ['a', 'b']

# Fit the scaler on the selected columns and transform them in one step.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data[selected_features])
# Decision tree and random forest examples, both scored on the held-out
# split (X_train / X_test / y_train / y_test come from the earlier split).
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Decision tree. random_state is fixed so repeated runs give the same tree,
# matching the seed used for train_test_split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")

# Random forest with 100 trees, seeded for the same reason.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
# Support vector machine example: fit a linear-kernel SVC on the training
# split and report accuracy on the test split.
from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
# Naive Bayes classifier demo: fit a Gaussian naive Bayes model on the
# training split and report accuracy on the test split.
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, predictions)}")
# K-nearest neighbours in practice: 5-fold cross-validation of the `knn`
# model over the full feature matrix X and labels y.
from sklearn.model_selection import cross_val_score

# NOTE(review): cv=5 needs enough samples per class in y to build five
# folds, and every training fold must hold at least as many samples as
# knn's n_neighbors. Confirm the dataset in scope is large enough.
scores = cross_val_score(knn, X, y, cv=5)
print(f"Cross-Validation Scores: {scores}")
print(f"Mean Score: {scores.mean()}")
# Hyper-parameter tuning: grid-search the number of neighbours for `knn`
# with 5-fold cross-validation on the training split.
from sklearn.model_selection import GridSearchCV

# Candidate values for n_neighbors.
# NOTE(review): the largest candidate (9) must not exceed the number of
# samples in each training fold. Confirm X_train is large enough.
param_grid = {'n_neighbors': [3, 5, 7, 9]}

grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
# Small-dataset walkthrough: train and evaluate a decision tree on iris.
# Note that this rebinds X, y and the train/test split variables.
from sklearn.datasets import load_iris

# Load the small dataset.
iris = load_iris()
X, y = iris.data, iris.target

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model. random_state is fixed so the reported accuracy and
# importances are reproducible, matching the seeded split above.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Predict and evaluate.
predictions = dt.predict(X_test)
accuracy = dt.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

# Feature-importance analysis.
importances = dt.feature_importances_
print(f"Feature Importances: {importances}")
# Case study: a random forest on a synthetic classification problem.
# Note that this rebinds X, y and the train/test split variables.
from sklearn.datasets import make_classification

# Generate 1000 samples with 20 features (15 informative, 5 redundant).
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42,
)

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model. The forest is seeded like the data generation and the
# split, so the whole example is reproducible run to run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict and evaluate.
predictions = rf.predict(X_test)
accuracy = rf.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

# Feature-importance analysis.
importances = rf.feature_importances_
print(f"Feature Importances: {importances}")