# Liu/attack_nomal/data_load.py
#
# 84 lines
# 2.1 KiB
# Python
# Raw Normal View History

import pandas as pd
import numpy as np
from keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
def data_format(data_path, is_column=False, rate=0.25):
    """Load a dataset, scale its features to [-1, 1] and split it into train/test sets.

    Args:
        data_path: Path of the CSV file; forwarded to ``data_load``.
        is_column (bool, optional): Forwarded to ``data_load`` to select
            column-wise instead of row-wise NaN handling. Defaults to False.
        rate (float, optional): Fraction of the samples held out for the
            test split. Defaults to 0.25.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)`` as produced by
        ``train_test_split``.
    """
    features, labels = data_load(data_path, is_column)

    # NOTE(review): the scaler is fit on the whole dataset before splitting,
    # so the test samples also influence the min/max used for scaling.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled_features = scaler.fit_transform(features)

    # A fixed random_state keeps the split reproducible between runs.
    train_x, test_x, train_y, test_y = train_test_split(
        scaled_features,
        labels,
        test_size=rate,
        random_state=7,
    )
    return train_x, test_x, train_y, test_y
def data_load(data_path, is_column=False, n_features=128):
    """Read a CSV file and return its feature matrix and integer labels.

    After cleaning, the first column is discarded. Of the remaining
    columns, the first ``n_features`` are the features and the column
    right after them is the label.

    Args:
        data_path: Path of the CSV file to read.
        is_column (bool, optional): Passed to ``data_clean``. When True,
            columns containing NaN are dropped; otherwise rows containing
            NaN are dropped. Defaults to False.
        n_features (int, optional): Number of feature columns that precede
            the label column. Defaults to 128.

    Returns:
        tuple: ``(X, Y)`` where ``X`` is a 2-D array with ``n_features``
        columns and ``Y`` is a 1-D array of integer labels.

    Raises:
        IndexError: If fewer than ``n_features + 1`` columns remain after
            the first column is dropped.
    """
    df = pd.read_csv(data_path)

    # dropna() returns a new frame instead of modifying its input, so the
    # result must be kept; otherwise the cleaning step has no effect.
    df = data_clean(df, is_column)

    # Discard the first column.
    df = df.drop(df.columns[0], axis=1)

    # Slice by position instead of iterating row by row.
    X = df.iloc[:, 0:n_features].to_numpy()
    Y = df.iloc[:, n_features].astype(int).to_numpy()
    return X, Y
def data_clean(data, is_column=False):
    """Return a copy of ``data`` with NaN-containing rows or columns removed.

    Args:
        data: DataFrame read from the CSV file. It is not modified.
        is_column (bool, optional): When True, drop every column that
            contains NaN; when False, drop every row that contains NaN.
            Defaults to False.

    Returns:
        The cleaned data as a new object.
    """
    # axis=1 removes columns, axis=0 removes rows.
    drop_axis = 1 if is_column else 0
    return data.dropna(axis=drop_axis)
if __name__ == '__main__':
    # Script entry point: build the train/test splits from the sample
    # dataset and print the shape of each resulting array.
    dataset_path = 'data/archive/PowerQualityDistributionDataset1.csv'
    splits = data_format(dataset_path)
    print(*(part.shape for part in splits))