Gom nhóm dữ liệu cột giống nhau trong Excel thành những file riêng bằng Python

Lọc dữ liệu Excel theo nhóm giá trị Segment và tạo nhóm Segment thành những file riêng.

Example 1

from pathlib import Path

import pandas as pd  # pip install pandas

# Define & create output directory
output_dir = Path(__file__).parent / "output"
output_dir.mkdir(parents=True, exist_ok=True)

# Define Excel file path
excel_file = Path(__file__).parent / "Data sample 1.xlsx"

df = pd.read_excel(excel_file)
column_name = "Segment"
unique_values = df[column_name].unique()

for unique_value in unique_values:
    df_output = df[df[column_name].str.fullmatch(unique_value)]
    output_path = output_dir / f"{unique_value}.xlsx"
    df_output.to_excel(output_path, sheet_name=unique_value[:31], index=False)

Data sample: https://docs.google.com/spreadsheets/d/1CPpcxYVzAZtlNZnJ-_4sldZku-xB1jpN

Example 2

from pathlib import Path

import pandas as pd  # pip install pandas


# Define & create output directory
output_dir = Path(__file__).parent / "output"
output_dir.mkdir(parents=True, exist_ok=True)

# Define Excel file path
excel_file = Path(__file__).parent / "Data sample 2.xlsx"

df = pd.read_excel(excel_file)
column_name = "Country"
df[column_name] = df[column_name].str.strip().str.title()
unique_values = df[column_name].unique()

for unique_value in unique_values:
    df_output = df[df[column_name].str.fullmatch(unique_value)]
    output_path = output_dir / f"{unique_value}.xlsx"
    df_output.to_excel(output_path, sheet_name=unique_value[:31], index=False)

Data sample: https://docs.google.com/spreadsheets/d/1wC7kDFZ9-umysOnRkl3bDgSbBZUUEg9h

Example 3

from pathlib import Path

import pandas as pd  # pip install pandas

# Define & create output directory
output_dir = Path(__file__).parent / "output"
output_dir.mkdir(parents=True, exist_ok=True)

# Define Excel file path
excel_file = Path(__file__).parent / "Data sample 3.xlsx"

df = pd.read_excel(excel_file)
column_name = "Country"
replace_symbols = ['>', '<', ':', '"', '/', '\\\\', '\|', '\?', '\*']
df[column_name] = (
    df[column_name].replace(replace_symbols, '', regex=True).str.strip().str.title()
)
unique_values = df[column_name].unique()

for unique_value in unique_values:
    df_output = df[df[column_name].str.fullmatch(unique_value)]
    output_path = output_dir / f"{unique_value}.xlsx"
    df_output.to_excel(output_path, sheet_name=unique_value[:31], index=False)

Data sample: https://docs.google.com/spreadsheets/d/14OwiSeX3t2jY5gysOjK3Un0tAEzjpTiL

Example 4: Split data with numeric column

from pathlib import Path

import pandas as pd  

output_dir = Path(__file__).parent / "output"
output_dir.mkdir(parents=True, exist_ok=True)

excel_file = Path(__file__).parent / "Result1_20230519.xlsx"

df = pd.read_excel(excel_file, sheet_name='Result1_20230519')
column_name = "CTKD"
df[column_name] = (
    df[column_name].apply(str).str.replace(',', '')
)

unique_values = df[column_name].unique()

for unique_value in unique_values:
    df_output = df[df[column_name].str.fullmatch(unique_value)]
    output_path = output_dir / f"{unique_value}.xlsx"
    df_output.to_excel(output_path, sheet_name=unique_value[:31], index=False)