aW1wb3J0IHBhbmRhcyBhcyBwZAppbXBvcnQgbnVtcHkgYXMgbnAKCiMgMS4g7YyM7J28IOydveq4sApkZiA9IHBkLnJlYWRfY3N2KCdfZmU5ODdiZmI3MGFjNGJiYWI1MWNiM2M1Yzk5YmQ3MDZfc2FsZXNfdHJhbnNhY3Rpb25zX2pyLmNzdicpCgojIDIuIHF1YW50aXR5IOyXtCDsspjrpqw6IOyIq+yekOuhnCDrs4DtmZgg6rCA64ql7ZWcIOqwkuunjCDrs4DtmZgsIOq3uCDsmbjripQgTmFOCmRmWydxdWFudGl0eSddID0gcGQudG9fbnVtZXJpYyhkZlsncXVhbnRpdHknXSwgZXJyb3JzPSdjb2VyY2UnKQoKIyAzLiB1bml0X3ByaWNlIOyXtCDsspjrpqwKZGVmIGNsZWFuX3VuaXRfcHJpY2UocHJpY2UpOgogICAgaWYgaXNpbnN0YW5jZShwcmljZSwgc3RyKToKICAgICAgICAjICQg6riw7Zi4IOygnOqxsAogICAgICAgIHByaWNlID0gcHJpY2UucmVwbGFjZSgnJCcsICcnKS5zdHJpcCgpCiAgICAgICAgIyAmcXVvdDtUZW4mcXVvdDsg6rCZ7J2AIO2FjeyKpO2KuOuKlCBOYU7snLzroZwg7LKY66asCiAgICAgICAgaWYgcHJpY2UubG93ZXIoKSA9PSAndGVuJzoKICAgICAgICAgICAgcmV0dXJuIG5wLm5hbgogICAgICAgIHRyeToKICAgICAgICAgICAgcmV0dXJuIGZsb2F0KHByaWNlKQogICAgICAgIGV4Y2VwdDoKICAgICAgICAgICAgcmV0dXJuIG5wLm5hbgogICAgZWxzZToKICAgICAgICB0cnk6CiAgICAgICAgICAgIHJldHVybiBmbG9hdChwcmljZSkKICAgICAgICBleGNlcHQ6CiAgICAgICAgICAgIHJldHVybiBucC5uYW4KCmRmWyd1bml0X3ByaWNlJ10gPSBkZlsndW5pdF9wcmljZSddLmFwcGx5KGNsZWFuX3VuaXRfcHJpY2UpCgojIDQuIHF1YW50aXR5IOqysOy4oeqwkuydhCDspJHslZnqsJLsnLzroZwg64yA7LK0CnF1YW50aXR5X21lZGlhbiA9IGRmWydxdWFudGl0eSddLm1lZGlhbigpCmRmWydxdWFudGl0eSddID0gZGZbJ3F1YW50aXR5J10uZmlsbG5hKHF1YW50aXR5X21lZGlhbikKCiMgNS4gdW5pdF9wcmljZSDqsrDsuKHqsJLsnYQg7Y+J6reg7Jy866GcIOuMgOyytAp1bml0X3ByaWNlX21lYW4gPSBkZlsndW5pdF9wcmljZSddLm1lYW4oKQpkZlsndW5pdF9wcmljZSddID0gZGZbJ3VuaXRfcHJpY2UnXS5maWxsbmEodW5pdF9wcmljZV9tZWFuKQoKIyA2LiB0b3RhbF9wcmljZSDsnqzqs4TsgrAgKHF1YW50aXR5ICogdW5pdF9wcmljZSkKZGZbJ3RvdGFsX3ByaWNlJ10gPSBkZlsncXVhbnRpdHknXSAqIGRmWyd1bml0X3ByaWNlJ10KCiMgNy4gb3JkZXJfZGF0ZV9taXNzaW5nIO2UjOuemOq3uCDsg53shLEKZGZbJ29yZGVyX2RhdGVfbWlzc2luZyddID0gZGZbJ29yZGVyX2RhdGUnXS5hcHBseSgKICAgIGxhbWJkYSB4OiAxIGlmIHBkLmlzbmEoeCkgb3IgKGlzaW5zdGFuY2UoeCwgc3RyKSBhbmQgc3RyKHgpLnN0cmlwKCkgPT0gJycpIGVsc2UgMAopCgojIDguIGN1c3RvbWVyX2VtYWlsX21pc3Npbmcg7ZSM656Y6re4IOyDneyEsQpkZWYgaXNfbWlzc2luZ19lbWFpbChlbWFpbCk6CiAgICBpZiBwZC5pc25hKGVtYWls
KToKICAgICAgICByZXR1cm4gMQogICAgZW1haWxfc3RyID0gc3RyKGVtYWlsKS5zdHJpcCgpCiAgICBpZiBlbWFpbF9zdHIgPT0gJyc6CiAgICAgICAgcmV0dXJuIDEKICAgIHJldHVybiAwCgpkZlsnY3VzdG9tZXJfZW1haWxfbWlzc2luZyddID0gZGZbJ2N1c3RvbWVyX2VtYWlsJ10uYXBwbHkoaXNfbWlzc2luZ19lbWFpbCkKCiMgOS4g6rKw6rO8IOyggOyepQpkZi50b19jc3YoJ3NhbGVzX3RyYW5zYWN0aW9uc19qcl9jbGVhbl9taXNzaW5nLmNzdicsIGluZGV4PUZhbHNlKQoKIyAxMC4g7LKY66asIOqysOqzvCDstpzroKUKcHJpbnQoJnF1b3Q7PSZxdW90OyAqIDgwKQpwcmludCgmcXVvdDtzYWxlc190cmFuc2FjdGlvbnNfanJfY2xlYW5fbWlzc2luZy5jc3Yg7YyM7J287J20IOyDneyEseuQmOyXiOyKteuLiOuLpCEmcXVvdDspCnByaW50KCZxdW90Oz0mcXVvdDsgKiA4MCkKCnByaW50KGYmcXVvdDtcbuuNsOydtO2EsCDtmJXtg5w6IHtkZi5zaGFwZVswXX3tlokgJnRpbWVzOyB7ZGYuc2hhcGVbMV197Je0JnF1b3Q7KQpwcmludChmJnF1b3Q7XG7sspjrpqzrkJwg7Je067OEIOygleuztDomcXVvdDspCgojIOyImOy5mO2YlSDsl7Qg7Ya16rOECm51bWVyaWNfY29scyA9IFsncXVhbnRpdHknLCAndW5pdF9wcmljZScsICd0b3RhbF9wcmljZSddCmZvciBjb2wgaW4gbnVtZXJpY19jb2xzOgogICAgaWYgY29sIGluIGRmLmNvbHVtbnM6CiAgICAgICAgcHJpbnQoZiZxdW90O1xue2NvbH06JnF1b3Q7KQogICAgICAgIHByaW50KGYmcXVvdDsgIC0g7LWc7IaM6rCSOiB7ZGZbY29sXS5taW4oKTouMmZ9JnF1b3Q7KQogICAgICAgIHByaW50KGYmcXVvdDsgIC0g7LWc64yA6rCSOiB7ZGZbY29sXS5tYXgoKTouMmZ9JnF1b3Q7KQogICAgICAgIHByaW50KGYmcXVvdDsgIC0g7Y+J6reg6rCSOiB7ZGZbY29sXS5tZWFuKCk6LjJmfSZxdW90OykKICAgICAgICBpZiBjb2wgPT0gJ3F1YW50aXR5JzoKICAgICAgICAgICAgcHJpbnQoZiZxdW90OyAgLSDspJHslZnqsJI6IHtkZltjb2xdLm1lZGlhbigpOi4yZn0mcXVvdDspCiAgICAgICAgaWYgY29sID09ICd1bml0X3ByaWNlJzoKICAgICAgICAgICAgcHJpbnQoZiZxdW90OyAgLSDsgqzsmqnrkJwg7Y+J6reg6rCSOiB7dW5pdF9wcmljZV9tZWFuOi4yZn0mcXVvdDspCgpwcmludChmJnF1b3Q7XG7tlIzrnpjqt7gg7Je0IO2GteqzhDomcXVvdDspCnByaW50KGYmcXVvdDsgIC0gb3JkZXJfZGF0ZV9taXNzaW5nICgx7J24IOqyveyasCk6IHtkZlsnb3JkZXJfZGF0ZV9taXNzaW5nJ10uc3VtKCl96rCcJnF1b3Q7KQpwcmludChmJnF1b3Q7ICAtIGN1c3RvbWVyX2VtYWlsX21pc3NpbmcgKDHsnbgg6rK97JqwKToge2RmWydjdXN0b21lcl9lbWFpbF9taXNzaW5nJ10uc3VtKCl96rCcJnF1b3Q7KQoKIyDqsrDsuKHqsJIg7LKY66asIOyghO2bhCDruYTqtZAKcHJpbnQoZiZxdW90O1xucXVhbnRpdHkg6rKw7Lih6rCSIOyymOumrDomcXVvdDspCnByaW50KGYmcXVvdDsgIC0g7JuQ67O4IOqysOy4oeqw
kiDsiJg6IHtwZC5yZWFkX2NzdignX2ZlOTg3YmZiNzBhYzRiYmFiNTFjYjNjNWM5OWJkNzA2X3NhbGVzX3RyYW5zYWN0aW9uc19qci5jc3YnKVsncXVhbnRpdHknXS5pc25hKCkuc3VtKCl9JnF1b3Q7KQpwcmludChmJnF1b3Q7ICAtIOyymOumrCDtm4Qg6rKw7Lih6rCSIOyImDoge2RmWydxdWFudGl0eSddLmlzbmEoKS5zdW0oKX0mcXVvdDspCnByaW50KGYmcXVvdDsgIC0g7IKs7Jqp65CcIOykkeyVmeqwkjoge3F1YW50aXR5X21lZGlhbjouMmZ9JnF1b3Q7KQoKcHJpbnQoZiZxdW90O1xudW5pdF9wcmljZSDqsrDsuKHqsJIg7LKY66asOiZxdW90OykKb3JpZ2luYWxfdW5pdF9wcmljZSA9IHBkLnJlYWRfY3N2KCdfZmU5ODdiZmI3MGFjNGJiYWI1MWNiM2M1Yzk5YmQ3MDZfc2FsZXNfdHJhbnNhY3Rpb25zX2pyLmNzdicpWyd1bml0X3ByaWNlJ10KIyDsm5Drs7jsl5DshJwg67mE7Iir7J6QIOqwkiDqsJzsiJgg7ZmV7J24Cm5vbl9udW1lcmljX2NvdW50ID0gb3JpZ2luYWxfdW5pdF9wcmljZS5hcHBseShsYW1iZGEgeDogbm90IChpc2luc3RhbmNlKHgsIChpbnQsIGZsb2F0KSkgb3IgKGlzaW5zdGFuY2UoeCwgc3RyKSBhbmQgeC5yZXBsYWNlKCckJywgJycpLnJlcGxhY2UoJy4nLCAnJykuaXNkaWdpdCgpKSkpLnN1bSgpCnByaW50KGYmcXVvdDsgIC0g7JuQ67O4IOu5hOyIq+yekCDqsJIg7IiYOiB7bm9uX251bWVyaWNfY291bnR9JnF1b3Q7KQpwcmludChmJnF1b3Q7ICAtIOyymOumrCDtm4Qg6rKw7Lih6rCSIOyImDoge2RmWyd1bml0X3ByaWNlJ10uaXNuYSgpLnN1bSgpfSZxdW90OykKcHJpbnQoZiZxdW90OyAgLSDsgqzsmqnrkJwg7Y+J6reg6rCSOiB7dW5pdF9wcmljZV9tZWFuOi4yZn0mcXVvdDspCgpwcmludCgmcXVvdDtcbuyymOumrOuQnCDrjbDsnbTthLAg7IOY7ZSMICjsspjsnYwgMTDtlokpOiZxdW90OykKcHJpbnQoZGYuaGVhZCgxMCkudG9fc3RyaW5nKCkpCgpwcmludChmJnF1b3Q7XG7tjIzsnbwg6rK966GcOiBzYWxlc190cmFuc2FjdGlvbnNfanJfY2xlYW5fbWlzc2luZy5jc3YmcXVvdDspCnByaW50KCZxdW90Oz0mcXVvdDsgKiA4MCk=
import pandas as pd
import numpy as np
# Step 1: load the raw sales-transactions export into a DataFrame.
df = pd.read_csv(
    '_fe987bfb70ac4bbab51cb3c5c99bd706_sales_transactions_jr.csv'
)

# Step 2: convert `quantity` to numbers. errors='coerce' turns every cell
# that cannot be parsed into NaN instead of raising.
df['quantity'] = df['quantity'].pipe(pd.to_numeric, errors='coerce')
# 3. Clean the unit_price column
def clean_unit_price(price):
    """Convert one raw ``unit_price`` cell to a float.

    String input has every ``$`` removed and surrounding whitespace stripped
    before it is parsed. Anything ``float()`` rejects -- words such as
    "Ten", an empty string, ``None`` -- yields NaN rather than raising.

    Args:
        price: Raw cell value; a string or any object ``float()`` accepts.

    Returns:
        The parsed float, or ``np.nan`` when the value is not numeric.

    Note:
        Only ``$`` is removed, so values with other decoration (for example
        a thousands separator as in "1,200") still come back as NaN.
    """
    if isinstance(price, str):
        # Drop the currency symbol and padding so "$ 10.50 " parses as 10.5.
        price = price.replace('$', '').strip()
    try:
        return float(price)
    except (TypeError, ValueError, OverflowError):
        # Catch only conversion failures; a bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit raised mid-run.
        return np.nan
# Normalize every unit_price cell to a float (NaN where it cannot be parsed).
df['unit_price'] = df['unit_price'].apply(clean_unit_price)

# 4-5. Impute the gaps: quantity with its median, unit_price with its mean.
# Both statistics are taken from the observed (non-NaN) values before any
# filling happens, and both names are printed again by the report below.
quantity_median = df['quantity'].median()
unit_price_mean = df['unit_price'].mean()
df['quantity'] = df['quantity'].fillna(quantity_median)
df['unit_price'] = df['unit_price'].fillna(unit_price_mean)

# 6. Recompute total_price from the cleaned columns (quantity * unit_price).
df['total_price'] = df['quantity'] * df['unit_price']

# 7. Flag rows whose order_date is NaN or a whitespace-only string:
# 1 = missing, 0 = present.
df['order_date_missing'] = df['order_date'].apply(
    lambda value: 0
    if (not pd.isna(value) and not (isinstance(value, str) and value.strip() == ''))
    else 1
)
# 8. Helper for the customer_email_missing flag
def is_missing_email(email):
    """Tell whether a ``customer_email`` cell should count as missing.

    Args:
        email: A single cell value of any type.

    Returns:
        1 if ``pd.isna(email)`` is true or the value's string form is empty
        after stripping whitespace; otherwise 0.
    """
    if pd.isna(email):
        return 1
    # Non-string values are compared through their str() form, so only a
    # value that renders as blank text is treated as missing.
    if str(email).strip() == '':
        return 1
    return 0
# Flag each row whose customer_email is missing or blank (1 = missing).
df['customer_email_missing'] = df['customer_email'].map(is_missing_email)

# 9. Save the cleaned table; index=False keeps the row index out of the file.
df.to_csv(
    'sales_transactions_jr_clean_missing.csv',
    index=False,
)
# 10. Print a summary of the processing results
print("=" * 80)
print("sales_transactions_jr_clean_missing.csv 파일이 생성되었습니다!")
print("=" * 80)
print(f"\n데이터 형태: {df.shape[0]}행 × {df.shape[1]}열")
print("\n처리된 열별 정보:")

# Min / max / mean of each numeric column after cleaning. quantity also
# shows its current median, and unit_price the mean used for imputation.
numeric_cols = ['quantity', 'unit_price', 'total_price']
for col in numeric_cols:
    if col not in df.columns:
        continue
    print(f"\n{col}:")
    print(f" - 최소값: {df[col].min():.2f}")
    print(f" - 최대값: {df[col].max():.2f}")
    print(f" - 평균값: {df[col].mean():.2f}")
    if col == 'quantity':
        print(f" - 중앙값: {df[col].median():.2f}")
    if col == 'unit_price':
        print(f" - 사용된 평균값: {unit_price_mean:.2f}")

# The flag columns hold 1 for "missing", so their sum is the flagged-row count.
print("\n플래그 열 통계:")
print(f" - order_date_missing (1인 경우): {df['order_date_missing'].sum()}개")
print(f" - customer_email_missing (1인 경우): {df['customer_email_missing'].sum()}개")
# Before/after comparison of the missing-value handling.
# Re-read the untouched source file once and reuse it for both columns
# (df itself has already been overwritten with the cleaned values).
original_df = pd.read_csv('_fe987bfb70ac4bbab51cb3c5c99bd706_sales_transactions_jr.csv')

print("\nquantity 결측값 처리:")
print(f" - 원본 결측값 수: {original_df['quantity'].isna().sum()}")
print(f" - 처리 후 결측값 수: {df['quantity'].isna().sum()}")
print(f" - 사용된 중앙값: {quantity_median:.2f}")

print("\nunit_price 결측값 처리:")
original_unit_price = original_df['unit_price']
# Count the cells that hold a value which the cleaner could not turn into a
# number. Using clean_unit_price itself keeps this figure in step with what
# was actually imputed; a digit-only text check disagrees on inputs such as
# "10.5.5" (unparseable, yet all digits once dots are removed) or "-5".
non_numeric_count = int(
    (original_unit_price.notna()
     & original_unit_price.apply(clean_unit_price).isna()).sum()
)
print(f" - 원본 비숫자 값 수: {non_numeric_count}")
print(f" - 처리 후 결측값 수: {df['unit_price'].isna().sum()}")
print(f" - 사용된 평균값: {unit_price_mean:.2f}")

# Show the first ten cleaned rows and where the output file was written.
print("\n처리된 데이터 샘플 (처음 10행):")
print(df.head(10).to_string())
print("\n파일 경로: sales_transactions_jr_clean_missing.csv")
print("=" * 80)