Liu/defend/data_load.py

90 lines
2.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
def data_format(data_path, is_column=False, rate=0.25):
    """Load a CSV dataset, scale it to [-1, 1] and split it into train/test sets.

    The array returned by ``data_load_forecast`` is min-max scaled column by
    column; the last column is then used as the target ``Y`` and all
    preceding columns as the features ``X``.

    Args:
        data_path: Path of the CSV file, forwarded to ``data_load_forecast``.
        is_column (bool, optional): Forwarded to ``data_load_forecast`` to
            select column-wise instead of row-wise NaN removal.
            Defaults to False.
        rate (float, optional): Fraction of the samples assigned to the test
            split. Defaults to 0.25.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)``.
    """
    samples = data_load_forecast(data_path, is_column)

    # NOTE(review): the scaler is fitted on every sample before the split, so
    # the test rows contribute to the scaling range -- confirm this is intended.
    scaled = MinMaxScaler(feature_range=(-1, 1)).fit_transform(samples)

    # Last column is the target, everything before it is a feature.
    features, target = scaled[:, :-1], scaled[:, -1]

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, test_size=rate, random_state=7)
    return X_train, X_test, Y_train, Y_test
def data_load_forecast(data_path, is_column=False):
    """Load a CSV file and return the rows labelled ``output == 1`` as an array.

    Processing steps: read the CSV with pandas, remove NaN entries through
    ``data_clean``, keep only the rows whose ``output`` column equals 1, drop
    the first column, and return the first 128 remaining columns.

    Args:
        data_path: Path of the CSV file to read.
        is_column (bool, optional): If True, columns containing NaN are
            removed; otherwise rows containing NaN are removed.
            Defaults to False.

    Returns:
        np.ndarray: 2-D array with one row per retained CSV row and at most
        128 columns.
    """
    df = pd.read_csv(data_path)

    # dropna() returns a new frame instead of modifying its input, so the
    # cleaned result has to be re-bound or the cleaning is silently lost.
    df = data_clean(df, is_column)

    # Keep only the samples labelled with output == 1.
    df = df[df['output'] == 1]

    # Discard the first column.
    df = df.drop(df.columns[0], axis=1)

    # Slice the first 128 columns in one vectorized step; unlike a per-row
    # append loop this also keeps a 2-D shape when no row survives the filter.
    return df.iloc[:, :128].to_numpy()
def data_clean(data, is_column=False):
    """Return a copy of ``data`` with NaN-containing rows or columns removed.

    Args:
        data: DataFrame loaded from the CSV file. It is not modified.
        is_column (bool, optional): If True, drop every column that contains
            a NaN; if False, drop every row that contains a NaN.
            Defaults to False.

    Returns:
        The cleaned DataFrame produced by ``dropna``.
    """
    # axis=1 removes columns, axis=0 removes rows.
    drop_axis = 1 if is_column else 0
    return data.dropna(axis=drop_axis)
if __name__ == '__main__':
    # Manual smoke run: load the sample dataset and report the shape of the
    # training feature matrix.
    dataset_path = 'data/archive/PowerQualityDistributionDataset1.csv'
    X_train, X_test, Y_train, Y_test = data_format(dataset_path)
    print(X_train.shape)