"""Load, clean, scale and split the power-quality CSV dataset."""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Number of leading columns (after the first CSV column is dropped) that the
# loaders read from every row.
_N_COLUMNS = 128


def data_format(data_path, is_column=False, rate=0.25, md=0):
    """Load a CSV file and return scaled train/test arrays.

    Args:
        data_path: Path of the CSV file to read.
        is_column (bool, optional): Passed to ``data_clean``. When True,
            columns containing NaN are dropped; otherwise rows containing
            NaN are dropped. Defaults to False.
        rate (float, optional): Fraction of the samples placed in the test
            split. Defaults to 0.25.
        md (int, optional): Mode. 0 = classification (the label is read
            from the column after the first 128), 1 = forecasting (the
            last of the 128 scaled columns becomes the target).
            Defaults to 0.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)`` as ``np.ndarray``.

    Raises:
        ValueError: If ``md`` is neither 0 nor 1.
    """
    if md not in (0, 1):
        raise ValueError(
            f"md must be 0 (classify) or 1 (forecast), got {md!r}")

    # NOTE(review): the scaler is fitted on the full data set before the
    # train/test split, so test-set statistics influence the scaling. Kept
    # as-is so existing callers get the same numbers.
    scaler = MinMaxScaler(feature_range=(-1, 1))

    if md == 0:
        X, Y = data_load_classify(data_path, is_column)
        X = scaler.fit_transform(X)
    else:
        X = scaler.fit_transform(data_load_forecast(data_path, is_column))
        # The last column is the value to predict; the rest are inputs.
        Y = X[:, -1]
        X = X[:, :-1]

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=rate, random_state=7)
    return X_train, X_test, Y_train, Y_test


def data_load_classify(data_path, is_column=False):
    """Load features and integer labels for classification.

    Args:
        data_path: Path of the CSV file to read.
        is_column (bool, optional): Passed to ``data_clean``.
            Defaults to False.

    Returns:
        tuple: ``(X, Y)`` where ``X`` holds the first 128 columns of every
        row (after the first CSV column is dropped) and ``Y`` holds the
        following column converted to int.
    """
    df = pd.read_csv(data_path)
    # dropna returns a new frame, so the cleaned result must be kept.
    df = data_clean(df, is_column)
    # Discard the first CSV column.
    df = df.drop(df.columns[0], axis=1)

    X = df.iloc[:, :_N_COLUMNS].to_numpy()
    Y = df.iloc[:, _N_COLUMNS].astype(int).to_numpy()
    return X, Y


def data_load_forecast(data_path, is_column=False):
    """Load the rows whose ``output`` column equals 1 for forecasting.

    Args:
        data_path: Path of the CSV file to read.
        is_column (bool, optional): Passed to ``data_clean``.
            Defaults to False.

    Returns:
        np.ndarray: The first 128 columns of every selected row (after the
        first CSV column is dropped).
    """
    df = pd.read_csv(data_path)
    # dropna returns a new frame, so the cleaned result must be kept.
    df = data_clean(df, is_column)
    df = df[df['output'] == 1]
    # Discard the first CSV column.
    df = df.drop(df.columns[0], axis=1)

    return df.iloc[:, :_N_COLUMNS].to_numpy()


def data_clean(data, is_column=False):
    """Return a copy of ``data`` with NaN-containing rows or columns removed.

    Args:
        data (pd.DataFrame): Data to clean. It is not modified in place.
        is_column (bool, optional): When True, drop every column that
            contains NaN; when False, drop every row that contains NaN.
            Defaults to False.

    Returns:
        pd.DataFrame: The cleaned data.
    """
    return data.dropna(axis=1 if is_column else 0)


if __name__ == '__main__':
    # Load the data in forecast mode and show the training targets.
    X_train, X_test, Y_train, Y_test = data_format(
        'data/archive/PowerQualityDistributionDataset1.csv', md=1)
    print(Y_train)