Liu/defend/data_load.py

90 lines
2.1 KiB
Python
Raw Permalink Normal View History

2024-01-26 20:42:33 +08:00
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
def data_format(data_path, is_column=False, rate=0.25):
    """Load, normalise and split the dataset into train/test arrays.

    Args:
        data_path: Path of the CSV file, forwarded to ``data_load_forecast``.
        is_column (bool, optional): Forwarded to ``data_load_forecast``;
            selects column-wise instead of row-wise NaN handling.
            Defaults to False.
        rate (float, optional): Fraction of the samples held out as the
            test set (passed as ``test_size``). Defaults to 0.25.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)``.
    """
    # Load the raw sample matrix.
    samples = data_load_forecast(data_path, is_column)

    # Rescale every column to the range [-1, 1].
    # NOTE(review): the scaler is fitted on the full matrix (target column
    # included) before the train/test split -- confirm this is intended.
    scaled = MinMaxScaler(feature_range=(-1, 1)).fit_transform(samples)

    # The last column is the target; everything before it is the input.
    features, target = scaled[:, :-1], scaled[:, -1]

    # Fixed random_state keeps the split reproducible between runs.
    return tuple(
        train_test_split(features, target, test_size=rate, random_state=7)
    )
def data_load_forecast(data_path, is_column=False):
    """Load the forecasting samples from a CSV file.

    The file is read with pandas, cleaned of NaN values, filtered to the
    rows whose ``output`` column equals 1, stripped of its first column,
    and the first 128 remaining columns are returned as an array.

    Args:
        data_path: Path of the CSV file to read.
        is_column (bool, optional): When True, columns containing NaN are
            dropped; otherwise rows containing NaN are dropped.
            Defaults to False.

    Returns:
        np.ndarray: 2-D array with one row per selected sample and at most
        128 columns.

    Raises:
        KeyError: If the cleaned data has no ``output`` column.
    """
    # Read the CSV file.
    df = pd.read_csv(data_path)

    # data_clean returns a new DataFrame (dropna is not in-place), so the
    # result has to be rebound; otherwise the cleaning is silently lost.
    df = data_clean(df, is_column)

    # Keep only the rows labelled output == 1.
    df = df[df['output'] == 1]

    # Discard the first column.
    df = df.drop(df.columns[0], axis=1)

    # Take the first 128 columns of every row in one vectorised step.
    return df.iloc[:, :128].to_numpy()
def data_clean(data, is_column=False):
    """Return a copy of ``data`` with NaN-containing rows or columns removed.

    Args:
        data: DataFrame loaded from the CSV file.
        is_column (bool, optional): When True, drop every column that
            contains a NaN; when False, drop every row that contains a
            NaN. Defaults to False.

    Returns:
        The cleaned DataFrame; the input object itself is not modified.
    """
    # axis=0 removes rows, axis=1 removes columns.
    drop_axis = 1 if is_column else 0
    return data.dropna(axis=drop_axis)
if __name__ == '__main__':
    # Smoke run: build the train/test split from the bundled dataset and
    # print the shape of the training inputs.
    dataset_path = 'data/archive/PowerQualityDistributionDataset1.csv'
    X_train, X_test, Y_train, Y_test = data_format(dataset_path)
    print(X_train.shape)