Appearance
DataFrame数据修改常用操作
1、创建DataFrame
python
df = pd.DataFrame(data)
# 从 CSV 文件读取
DataFrame df = pd.read_csv('file.csv')
# 推荐使用kpl,可以完整保留行索引与列索引
df_loaded = pd.read_pickle(r'E:\py\df\5555.pkl')
2、选择数据
python
# 选择单列
df['姓名']
# 选择多列 索引列用()
df[['姓名', '城市']]
df[('三年级','1班')]
# 选择行
df.iloc[0] # 根据索引选择第 0 行
# 条件选择 多个条件用&来串联
df[(df['年龄'] >) 30 & (df['年龄'] < 600)]
3、添加删除列
python
# 添加新列
df['薪资'] = [5000, 6000, 7000]
# 删除列
df.drop('薪资', axis=1, inplace=True)
4、排序与分组求和
python
# 根据某列排序
df.sort_values(by='年龄', ascending=True, inplace=True)
# 分组聚合 平均值 中位数等
grouped = df.groupby('城市').mean() # 按城市分组并计算平均值
5、数据的横向纵向连接
python
# 合并两个 DataFrame ,同sql来实现 连接
merged = pd.merge(df1, df2, on='姓名')
# 连接 DataFrame
concat_df = pd.concat([df1, df2], axis=0) # 纵向连接
6、数据处理
python
# 检查缺失值
df.isnull().sum()
df.isna()
# 删除缺失值
df.dropna(inplace=True)
# 填充缺失值
df.fillna(0, inplace=True)
# 转置 DataFrame
transposed = df.T
7、索引
python
#设置列索引 以及列索引重命名
df = df.set_index(['line_gl_account_type','line_gl_parent_account_name','line_gl_account_name'])
df.index.names = ['Account Type','Parent Account', 'Account']
#重置索引,索引需要唯一不重复
df_loaded = df_loaded.reset_index()
8、pandas.io.formats.style.Styler对象的一些操作
python
#DataFrame执行过df.style.apply就变成了Styler对象,
#添加框线操作
styled_df = styled_df.set_table_styles(
[{'selector': 'th', 'props': [('border', '1px solid black')]},
{'selector': 'td', 'props': [('border', '1px solid black')]}])
#左对齐 右对齐操作
styled_df.set_properties(**{'text-align': 'right'})
9、一些无分类操作
python
#添加空白行赋值<br> 就不会被隐藏
empty_rows.iloc[1, df.columns.get_loc(('line_gl_account_type', ''))] = '<br> '
#自定义排序
custom_order = ['Income', 'Cost of Goods Sold', 'Expense', 'Other Income', 'Tax Expense']
df_loaded[('line_gl_account_type', '')] = pd.Categorical(df_loaded[('line_gl_account_type', '')],categories=custom_order, ordered=True)
df_sorted = df_loaded.sort_values(by=[('line_gl_account_type', ''), ('line_gl_parent_account_name', ''), ('groupingid', '')])
#输出到本地html
file_path = r"E:\py\output.html"
styled_df.to_html(file_path)
代码demo
python
import time
import os
import json
import re
import pandas as pd
# 45 Gross Profit
line1 = {"location":{("line_gl_account_type",""): "Cost of Goods Sold"},
"typpe": "sum",
"conditions":{("line_gl_account_type",""): ["Income","Cost of Goods Sold"]},
"alias":{"line_gl_account_type": "Gross Profit","line_gl_parent_account_code": "Gross Profit","line_gl_account_name": "Gross Profit"}}
# 457 Net Ordinary Income
line2 = {"location":{("line_gl_account_type",""): "Expense"},
"typpe": "sum",
"conditions":{("line_gl_account_type",""): ["Income","Cost of Goods Sold","Expense"]},
"alias":{"line_gl_account_type": "Net Ordinary Income","line_gl_parent_account_code": "Net Ordinary Income","line_gl_account_name": "Net Ordinary Income"}}
# 4578 Income(Loss) Before Taxes
line3 = {"location":{("line_gl_account_type",""): "Other Income"},
"typpe": "sum",
"conditions":{("line_gl_account_type",""): ["Income","Cost of Goods Sold","Expense","Other Income"]},
"alias":{"line_gl_account_type": "Income(Loss) Before Taxes","line_gl_parent_account_code": "Income(Loss) Before Taxes","line_gl_account_name": "Income(Loss) Before Taxes"}}
# 4578 9 Net Income
line4= {"location":{("line_gl_account_type",""): "Tax Expense"},
"typpe": "sum",
"conditions":{("line_gl_account_type",""): ["Income","Cost of Goods Sold","Expense","Other Income","Tax Expense"]},
"alias":{"line_gl_account_type": "Net Income","line_gl_parent_account_code": "Net Income","line_gl_account_name": "Net Income"}}
lines = [line1,line2,line3,line4]
# col0 = {
# "location":{("clientid","clientid"): "1000"},
# "typpe": "sum",
# # "conditions":{("clientid",""): ["1000","1001","1004","1015"]},
# "alias":{"ns_subsidiary": "test2","ns_doc_type": "test1"}}
col0 = {}
cols=[col0]
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None) #
pd.set_option('display.width', None)
def df_insert_col(df,cols):
# 循环添加自定义列
for col in cols:
float_int_columns = [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
columns_to_select = [col for col in float_int_columns if col != ("groupingid", "")]
df[('Total Amount', 'Total Amount')] = df[columns_to_select].sum(axis=1).round(2)
df[('% of Income', '')] = df[('Total Amount', 'Total Amount')] / df.loc[
df[("groupingid", "")] == 0, ('Total Amount', 'Total Amount')].sum()
# print(df)
# 根据条件添加自定义的列
return df
def df_insert_line(df,lines):
# 循环添加自定义行
# df['line_gl_parent_account_code'] = df['line_gl_parent_account_code'].astype(object)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None) # 显示所有列
df = df.reset_index(drop=True)
for line in lines:
# 根据条件确定添加自定义行的位置
mask = pd.Series([True] * len(df))
for col1, value in line['location'].items():
mask &= (df[col1] == value)
max_index = df[mask].index.max()
# 获取添加自定义行的求和条件
# columns_to_select = list(line['field_define'])
float_int_columns = [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
columns_to_select = [col for col in float_int_columns if col != ("groupingid", "") or col != ("groupingid", "")]
# 获取添加自定义行的求和条件
# 对groupingid为0的所有数据进行过滤
sum_df = df[df[("groupingid", "")] == 0].reset_index(drop=True)
# 对condition条件进行过滤求和
mask = pd.Series([True] * len(sum_df))
# 遍历条件字典,根据每个条件更新mask
for key, values in line['conditions'].items():
mask &= sum_df[key].isin(values)
# 获取满足条件的行的索引
indices = sum_df[mask].index
# 根据条件添加自定义的行
sum_value = pd.DataFrame()
if line["typpe"] == "sum":
sum_value = sum_df.loc[indices, columns_to_select].sum().round(2).to_frame().T
elif line["typpe"] == "avg":
sum_value = df[columns_to_select].mean().to_frame().T
elif line["typpe"] == "max":
sum_value = df[columns_to_select].max().to_frame().T
elif line["typpe"] == "min":
sum_value = df[columns_to_select].min().to_frame().T
elif line["typpe"] == "count":
sum_value = df[columns_to_select].count().to_frame().T
elif line["typpe"] == "std":
sum_value = df[columns_to_select].std().to_frame().T
elif line["typpe"] == "var":
sum_value = df[columns_to_select].var().to_frame().T
elif line["typpe"] == "median":
sum_value = df[columns_to_select].median().to_frame().T
else:
return {"error": "Invalid type"}
#自定义行的命名
for col_name, col_value in line['alias'].items():
sum_value.insert(0, col_name, col_value)
sum_value['groupingid'] =-1
# 第二层
first_list = [x[0] for x in list(df.columns)]
second_list = [x[1] for x in list(df.columns)]
arrays = [first_list, second_list]
# 创建多层索引对象
multi_index = pd.MultiIndex.from_arrays(arrays, names=('', ''))
sum_value.columns = multi_index
sum_value = sum_value.reset_index(drop=True)
# 将自定义行添加到原df中
max_index = max_index + 1
upper_df = df.iloc[:max_index]
lower_df = df.iloc[max_index:]
# 插入到指定位置
df = pd.concat([upper_df, sum_value, lower_df], ignore_index=True)
return df
def convert(df_loaded):
df_loaded = df_loaded.drop(df_loaded.columns[[-1, -3]], axis=1)
# 获取那些列可以进行计算
float_int_columns = [col for col in df_loaded.columns if df_loaded[col].dtype in ['float64', 'int64']]
columns_to_select = [col for col in float_int_columns if col != ("groupingid", "")]
# 重新生成index
df_loaded = df_loaded.reset_index()
# 定义排序的顺序
custom_order = ['Income', 'Cost of Goods Sold', 'Expense', 'Other Income', 'Tax Expense']
df_loaded[('line_gl_account_type', '')] = pd.Categorical(df_loaded[('line_gl_account_type', '')],
categories=custom_order, ordered=True)
df_sorted = df_loaded.sort_values(
by=[('line_gl_account_type', ''), ('line_gl_parent_account_name', ''), ('groupingid', '')])
# 插入行
df = df_insert_line(df_sorted, lines)
# 插入列
df = df_insert_col(df, cols)
# 创建两行空行
empty_rows = pd.DataFrame([[None] * len(df.columns)]*2, columns=df.columns)
empty_rows.iloc[0, df.columns.get_loc(('line_gl_account_type', ''))] = ' '
empty_rows.iloc[1, df.columns.get_loc(('line_gl_account_type', ''))] = ' '
empty_rows.iloc[0, df.columns.get_loc(('line_gl_parent_account_name', ''))] = ' '
empty_rows.iloc[1, df.columns.get_loc(('line_gl_parent_account_name', ''))] = ' '
empty_rows.iloc[0, df.columns.get_loc(('line_gl_account_name', ''))] = ' '
empty_rows.iloc[1, df.columns.get_loc(('line_gl_account_name', ''))] = ' '
# 将空行添加到 DataFrame 的末尾
df = pd.concat([df, empty_rows], ignore_index=True)
# 第二部分自定义单元格line1
df_cell = pd.DataFrame(columns=df.columns)
df_cell[('line_gl_account_type', '')] = 'Gross Profit Ratio'
df_cell[('line_gl_parent_account_name', '')] = 'Gross Profit Ratio'
df_cell[('line_gl_account_name', '')] = 'Gross Profit Ratio'
if columns_to_select:
for column in columns_to_select:
per = df.loc[df[('line_gl_account_type', '')] == 'Gross Profit', column] / df.loc[
(df[('groupingid', '')] == 0) & (df[('line_gl_account_type', '')] == 'Income'), column].sum()
df_cell[column] = per
df_cell[('groupingid', '')] = -2
df_cell[('line_gl_account_type', '')] = 'Gross Profit Ratio'
df_cell[('line_gl_parent_account_name', '')] = 'Gross Profit Ratio'
df_cell[('line_gl_account_name', '')] = 'Gross Profit Ratio'
df = pd.concat([df, df_cell], ignore_index=True)
# 第二部分自定义单元格line2
df_cell = df2 = pd.DataFrame(columns=df.columns)
if columns_to_select:
for column in columns_to_select:
per = df.loc[df[('line_gl_account_type', '')] == 'Net Income', column] / df.loc[
(df[('groupingid', '')] == 0) & (df[('line_gl_account_type', '')] == 'Income'), column].sum()
df_cell[column] = per
df_cell[('groupingid', '')] = -2
df_cell[('line_gl_account_type', '')] = 'Net Profit Ratio'
df_cell[('line_gl_parent_account_name', '')] = 'Net Profit Ratio'
df_cell[('line_gl_account_name', '')] = 'Net Profit Ratio'
df = pd.concat([df, df_cell], ignore_index=True)
# 第三部分自定义单元格line3
df_cell = df2 = pd.DataFrame(columns=df.columns)
if columns_to_select:
for column in columns_to_select:
per = df.loc[df[('line_gl_account_type', '')] == 'Net Ordinary Income', column] + df.loc[
(df[('groupingid', '')] == 0) & (df[('line_gl_account_name', '')].astype(str).str.startswith(
('50070', '70010', '70015'))), column].sum()
df_cell[column] = per
df_cell[('groupingid', '')] = -2
df_cell[('line_gl_account_type', '')] = 'EBITDA'
df_cell[('line_gl_parent_account_name', '')] = 'EBITDA'
df_cell[('line_gl_account_name', '')] = 'EBITDA'
df = pd.concat([df, df_cell], ignore_index=True)
# 删除数据为空的行
df = df[df[('Total Amount', 'Total Amount')] != 0]
# 将('line_gl_account_type', '') 为'Income', 'Other Income' 的类型为float6464 的数据乘以-1
columns_to_modify = df.select_dtypes(include=['float64']).columns.difference([('groupingid', '')])
df.loc[df[('line_gl_account_type', '')].isin(['Income', 'Other Income']), columns_to_modify] *= -1
df.loc[df[('groupingid', '')] == -1, columns_to_modify] *= -1
df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Income'), (
'line_gl_parent_account_name', '')] = 'Total Income'
df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Income'), (
'line_gl_account_name', '')] = 'Total Income'
df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Cost of Goods Sold'), (
'line_gl_parent_account_name', '')] = 'Total Cost of Goods Sold'
df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Cost of Goods Sold'), (
'line_gl_account_name', '')] = 'Total Cost of Goods Sold'
df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Expense'), (
'line_gl_parent_account_name', '')] = 'Total Expense'
df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Expense'), (
'line_gl_account_name', '')] = 'Total Expense'
df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Other Income'), (
'line_gl_parent_account_name', '')] = 'Total Other Income'
df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Other Income'), (
'line_gl_account_name', '')] = 'Total Other Income'
df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Tax Expense'), (
'line_gl_parent_account_name', '')] = 'Total Tax Expense'
df.loc[(df[('groupingid', '')] == 3) & (df[('line_gl_account_type', '')] == 'Tax Expense'), (
'line_gl_account_name', '')] = 'Total Tax Expense'
condition = df[('groupingid', '')].isin([1, 3])
# 赋值
df.loc[condition & df[('line_gl_account_type', '')].isna(), 'line_gl_account_type'] = 'Subtotal'
df.loc[condition & df[('line_gl_parent_account_name', '')].isna(), 'line_gl_parent_account_name'] = 'Subtotal'
df.loc[condition & df[('line_gl_account_name', '')].isna(), 'line_gl_account_name'] = 'Subtotal'
df = df.set_index(['line_gl_account_type',
'line_gl_parent_account_name',
'line_gl_account_name'])
df.index.names = [
'Account Type',
'Parent Account',
'Account'
]
df = df[df[('groupingid', '')] != 7]
def highlight_row_red(row):
# 如果 'groupingid' 列的值为 3,则整行标记为红色背景
if row[('groupingid', '')] == 1:
return ['background-color: #e6f7e6;font-weight: bold' for _ in row] # 中红色
elif row[('groupingid', '')] == 3:
return ['background-color: #cce5ff;font-weight: bold' for _ in row] # 中红色
elif row[('groupingid', '')] == -1:
return ['background-color: #66b3ff;font-weight: bold' for _ in row] # 浅蓝色
elif row[('groupingid', '')] == -2:
return ['background-color: #1a75ff;font-weight: bold' for _ in row] # 中蓝色
else:
return ['' for _ in row] # 默认无色
# 应用样式
# df = df.round(2)
for column in df.select_dtypes(include=['float64']).columns:
df[column] = df[column].apply(lambda x: f"{x:,.2f}" if pd.notna(x) and x > 1 else x)
for col in df.index[-3:]:
df[col] = df[col].apply(lambda x: f"{x:.2%}" if pd.notna(x) else x)
# 应用样式
# float_columns = df.select_dtypes(include='float64').columns
styled_df = df.style.apply(highlight_row_red, axis=1).format(na_rep='', precision=2)
# 加边框线
styled_df = styled_df.set_table_styles(
[{'selector': 'th', 'props': [('border', '1px solid black')]},
{'selector': 'td', 'props': [('border', '1px solid black')]}]
)
# float_cols = [('2024-02', 'line_net_amount')]
# text_cols = [('2024-06', 'line_net_amount')]
# text_cols = df.select_dtypes(include=['object']).columns
# # 查找 float64 类型的列,并对这些列进行右对齐
# float_cols = df.select_dtypes(include=['float64']).columns
styled_df = styled_df.set_properties(**{'text-align': 'right'})
# styled_df = styled_df.hide_columns([('groupingid', '')])
# 获取那些列可以进行计算
return styled_df
# if __name__ == '__main__':
# df_loaded = pd.read_pickle(r'E:\py\df\5555.pkl')
# convert(df_loaded)