84 lines
2.1 KiB
Python
84 lines
2.1 KiB
Python
|
import pandas as pd
|
|||
|
import numpy as np
|
|||
|
from keras.utils import to_categorical
|
|||
|
from sklearn.preprocessing import MinMaxScaler
|
|||
|
from sklearn.model_selection import train_test_split
|
|||
|
|
|||
|
|
|||
|
def data_format(data_path, is_column=False, rate=0.25):
    """Load a dataset, split it into train/test sets and scale the features.

    Args:
        data_path: Path of the CSV data file; passed through to ``data_load``.
        is_column (bool, optional): Passed through to ``data_load``, which
            forwards it to the NaN-cleaning step. Defaults to False.
        rate (float, optional): Fraction of the samples held out as the test
            set (used as ``test_size``). Defaults to 0.25.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)``. The two ``X`` arrays
        hold the scaled features; the two ``Y`` arrays hold the labels
        produced by ``data_load``.
    """
    # Load the raw features and labels.
    X, Y = data_load(data_path, is_column)

    # Split before scaling; `rate` is the held-out share and the fixed seed
    # keeps the partition reproducible between runs.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=rate, random_state=7)

    # Fit the scaler on the training split only, so that no statistics of the
    # test samples leak into preprocessing. The test split is transformed
    # with the training min/max and may therefore fall slightly outside
    # the (-1, 1) range.
    sc = MinMaxScaler(feature_range=(-1, 1))
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    return X_train, X_test, Y_train, Y_test
|
|||
|
|
|||
|
|
|||
|
def data_load(data_path, is_column=False):
    """Read a CSV file and split it into a feature matrix and a label vector.

    The file is read with pandas, NaN entries are removed via ``data_clean``
    and the first column is discarded. Of the remaining columns, the first
    128 become the features and the column at position 128 becomes the label,
    cast to ``int``.

    Args:
        data_path: Path of the CSV file to read.
        is_column (bool, optional): Forwarded to ``data_clean``; when true,
            columns containing NaN are dropped instead of rows.
            Defaults to False.

    Returns:
        tuple: ``(X, Y)`` as numpy arrays. ``X`` has one row per sample and
        up to 128 feature columns; ``Y`` holds one integer label per sample.

    Raises:
        IndexError: If no column exists at position 128 once cleaning and the
            first-column drop have been applied.
    """
    n_features = 128

    # Read the CSV file.
    df = pd.read_csv(data_path)

    # dropna returns a new frame rather than modifying in place, so the
    # cleaned result has to be re-bound; calling data_clean without using
    # its return value would leave every NaN in the data.
    df = data_clean(df, is_column)

    # Discard the first column.
    df = df.drop(df.columns[0], axis=1)

    # Slice the whole frame at once instead of iterating row by row.
    X = df.iloc[:, :n_features].to_numpy()
    Y = df.iloc[:, n_features].astype(int).to_numpy()

    return X, Y
|
|||
|
|
|||
|
|
|||
|
def data_clean(data, is_column=False):
    """Return a copy of ``data`` with NaN-containing rows or columns removed.

    Args:
        data: The tabular data read from the CSV file (a pandas DataFrame).
        is_column (bool, optional): When true, drop every column that
            contains a NaN; otherwise drop every row that contains a NaN.
            Defaults to False.

    Returns:
        The cleaned data. The input object itself is not modified.
    """
    # axis=0 removes rows, axis=1 removes columns.
    drop_axis = 1 if is_column else 0
    return data.dropna(axis=drop_axis)
|
|||
|
|
|||
|
|
|||
|
if __name__ == '__main__':
    # Run the full pipeline on the bundled dataset and report the shape of
    # each returned split.
    csv_path = 'data/archive/PowerQualityDistributionDataset1.csv'
    X_train, X_test, Y_train, Y_test = data_format(csv_path)
    print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))
|