# 0. setting
import pandas as pd
import geopandas as gpd
import numpy as np
import datetime as dttm
import matplotlib.pyplot as plt
import random
import math
from cartoframes.viz import Layer, color_category_style, popup_element
from sklearn.cluster import DBSCAN
air_origin = pd.read_csv('../dataset/airbnb/South Korea-seoul_Monthly_Match_2022-08-17.csv')
extair_origin = pd.read_csv('../dataset/airbnb/South Korea-seoul_Property_Extended_Match_2022-08-17.csv')
/tmp/ipykernel_1510968/1429626531.py:12: DtypeWarning: Columns (24,27,28) have mixed types. Specify dtype option on import or set low_memory=False. air_origin = pd.read_csv('../dataset/airbnb/South Korea-seoul_Monthly_Match_2022-08-17.csv') /tmp/ipykernel_1510968/1429626531.py:13: DtypeWarning: Columns (26,51,59,60,62) have mixed types. Specify dtype option on import or set low_memory=False. extair_origin = pd.read_csv('../dataset/airbnb/South Korea-seoul_Property_Extended_Match_2022-08-17.csv')
# 1. handling
air = air_origin[('2015-01-01' <= air_origin['Reporting Month']) &
('2020-01-01' > air_origin['Reporting Month'])].copy() # 15~19년, copy 안 붙이면 경고나온다~
temp1 = sorted(air['Reporting Month'].unique().tolist()) # 60개월
temp2 = list(range(60)) # 0~59
months = dict(zip(temp1, temp2)) # (15.01 = 0) ~ (19.12 = 59)
air['timestamp'] = [months[a] for a in air['Reporting Month']]
air['timestamp_interval'] = ((air['timestamp'] / 12).apply(math.floor)).astype(str)
air.columns
Index(['Property ID', 'Property Type', 'Listing Type', 'Bedrooms', 'Reporting Month', 'Occupancy Rate', 'Revenue (USD)', 'Revenue (Native)', 'ADR (USD)', 'ADR (Native)', 'Number of Reservations', 'Reservation Days', 'Available Days', 'Blocked Days', 'Country', 'State', 'City', 'Zipcode', 'Neighborhood', 'Metropolitan Statistical Area', 'Latitude', 'Longitude', 'Active', 'Scraped During Month', 'Currency Native', 'Airbnb Property ID', 'Airbnb Host ID', 'HomeAway Property ID', 'HomeAway Property Manager', 'timestamp', 'timestamp_interval'], dtype='object')
# 1-1. bedrooms
bed_outliers = air[air['Bedrooms'] > 10]['Property ID'].unique() # Bed > 10인 ID
extair_origin[np.isin(extair_origin['Property ID'], bed_outliers)][['Listing Title', 'Property Type', 'Bedrooms', 'Listing URL']].head(15) # 대체로 게스트하우스, 전체 방 개수 센듯
# bedrooms 숫자는 크게 중요하지 않을 듯. 값어치를 말하는 건 아니네.
# 근데 hostel 이런 류는 예약 건수가 무지 많을 수도.
Listing Title | Property Type | Bedrooms | Listing URL | |
---|---|---|---|---|
18924 | 중랑구 아름다운 집 | Rental unit | 18.0 | https://www.airbnb.com/rooms/12191689 |
28589 | 비전 게스트하우스 | Serviced apartment | 15.0 | https://www.airbnb.com/rooms/17594321 |
33392 | 명동 남산을 구경하는 도심 속의 숙소 | Other | 20.0 | https://www.airbnb.com/rooms/20398602 |
33545 | Top Hotel N Residence | Room in boutique hotel | 36.0 | https://www.airbnb.com/rooms/20528918 |
48087 | High residence 201A in sinchon | Hostel | 32.0 | https://www.airbnb.com/rooms/22866486 |
48458 | 호텔일성(ILSUNG) 게스트하우스 | Guesthouse | 13.0 | https://www.airbnb.com/rooms/23149545 |
49001 | #03 고려대-안암-동대문-신설동-제기동 Dongdeamun Family Gositel | Private room in bed and breakfast | 15.0 | https://www.airbnb.com/rooms/23583191 |
50878 | Cosy Place In (Yeongdeungpo, 永登浦驛), businesstr... | Tiny house | 14.0 | https://www.airbnb.com/rooms/24720342 |
50884 | Cosy Place In (Yeongdeungpo, 永登浦驛), businesstr... | Tiny house | 14.0 | https://www.airbnb.com/rooms/24724097 |
51687 | Nice stay in sinchon | Hostel | 32.0 | https://www.airbnb.com/rooms/25357742 |
52152 | High residence 107 in sinchon | Hostel | 32.0 | https://www.airbnb.com/rooms/25921847 |
52214 | 호스텔베네 | Room in hotel | 30.0 | https://www.airbnb.com/rooms/25980574 |
52608 | "N House" recommended by Airbnb,교대 근처 원룸텔 | Serviced apartment | 50.0 | https://www.airbnb.com/rooms/26276555 |
52611 | We Provide kimchi, Ready to eat-rice, Side dishes | Serviced apartment | 50.0 | https://www.airbnb.com/rooms/26280360 |
52668 | High residence 209 in sinchon | Hostel | 32.0 | https://www.airbnb.com/rooms/26338792 |
air[air['Bedrooms'] > 5]['Number of Reservations'].mean() # 그렇지도 않네?...
2.117564484997368
# 1-2. revenue
air[['Revenue (USD)', 'ADR (USD)', 'Reservation Days']] # ADR = Revenue / Reservation Days
Revenue (USD) | ADR (USD) | Reservation Days | |
---|---|---|---|
94394 | 645.0 | 43.00 | 15.0 |
94395 | 731.0 | 43.00 | 17.0 |
94396 | 301.0 | 43.00 | 7.0 |
94397 | 506.0 | 42.17 | 12.0 |
94398 | 1050.0 | 42.00 | 25.0 |
... | ... | ... | ... |
2038341 | 733.5 | 48.90 | 15.0 |
2038342 | 724.5 | 48.30 | 15.0 |
2038343 | 573.3 | 47.77 | 12.0 |
2038344 | 931.2 | 49.01 | 19.0 |
2038345 | 471.0 | 52.33 | 9.0 |
1147756 rows × 3 columns
air[['Number of Reservations', 'Reservation Days', 'Available Days', 'Blocked Days']]
# 한 달 = Reservation(예약) + Available(빈 날) + Blocked(쉬는 날) 이었네.
# Number of Reservations 이건 예약 건수. 3-4일도 1건으로 보니까 값이 작지. -> 평균 숙박 일수를 구할 순 있네(밑에서 해본다)
Number of Reservations | Reservation Days | Available Days | Blocked Days | |
---|---|---|---|---|
94394 | 4.0 | 15.0 | 16.0 | 0.0 |
94395 | 4.0 | 17.0 | 13.0 | 0.0 |
94396 | 2.0 | 7.0 | 18.0 | 6.0 |
94397 | 1.0 | 12.0 | 16.0 | 2.0 |
94398 | 4.0 | 25.0 | 6.0 | 0.0 |
... | ... | ... | ... | ... |
2038341 | 7.0 | 15.0 | 1.0 | 15.0 |
2038342 | 6.0 | 15.0 | 3.0 | 12.0 |
2038343 | 5.0 | 12.0 | 3.0 | 16.0 |
2038344 | 9.0 | 19.0 | 9.0 | 2.0 |
2038345 | 8.0 | 9.0 | 19.0 | 3.0 |
1147756 rows × 4 columns
air['Number of Reservations'].describe()[1:8].round(2) # 한 달에 두 건이 평균. 마저도 장사가 잘 되는 곳, 아닌 곳 편차가 큼
mean 2.19 std 3.31 min 0.00 25% 0.00 50% 0.00 75% 4.00 max 31.00 Name: Number of Reservations, dtype: float64
air[air['Number of Reservations'] > 0] # 1/3 정도만 손님 받네
Property ID | Property Type | Listing Type | Bedrooms | Reporting Month | Occupancy Rate | Revenue (USD) | Revenue (Native) | ADR (USD) | ADR (Native) | ... | Longitude | Active | Scraped During Month | Currency Native | Airbnb Property ID | Airbnb Host ID | HomeAway Property ID | HomeAway Property Manager | timestamp | timestamp_interval | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
94394 | ab-5511718 | Apartment | Entire home/apt | 0.0 | 2015-03-01 | 0.484 | 645.0 | 721846.0 | 43.00 | 48123.07 | ... | 127.004239 | True | False | KRW | 5511718.0 | 3466949.0 | NaN | NaN | 2 | 0 |
94395 | ab-5511718 | Apartment | Entire home/apt | 0.0 | 2015-04-01 | 0.567 | 731.0 | 792733.0 | 43.00 | 46631.35 | ... | 127.004239 | True | False | KRW | 5511718.0 | 3466949.0 | NaN | NaN | 3 | 0 |
94396 | ab-5511718 | Apartment | Entire home/apt | 0.0 | 2015-05-01 | 0.280 | 301.0 | 326436.0 | 43.00 | 46633.71 | ... | 127.004239 | True | True | KRW | 5511718.0 | 3466949.0 | NaN | NaN | 4 | 0 |
94397 | ab-5511718 | Apartment | Entire home/apt | 0.0 | 2015-06-01 | 0.429 | 506.0 | 563312.0 | 42.17 | 46942.67 | ... | 127.004239 | True | True | KRW | 5511718.0 | 3466949.0 | NaN | NaN | 5 | 0 |
94398 | ab-5511718 | Apartment | Entire home/apt | 0.0 | 2015-07-01 | 0.806 | 1050.0 | 1196920.0 | 42.00 | 47876.80 | ... | 127.004239 | True | True | KRW | 5511718.0 | 3466949.0 | NaN | NaN | 6 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2038341 | ha-987448 | Apartment | Entire home/apt | 1.0 | 2019-08-01 | 0.938 | 733.5 | NaN | 48.90 | NaN | ... | 126.914691 | True | True | NaN | NaN | NaN | 987448 | NaN | 55 | 4 |
2038342 | ha-987448 | Apartment | Entire home/apt | 1.0 | 2019-09-01 | 0.833 | 724.5 | NaN | 48.30 | NaN | ... | 126.914691 | True | True | NaN | NaN | NaN | 987448 | NaN | 56 | 4 |
2038343 | ha-987448 | Apartment | Entire home/apt | 1.0 | 2019-10-01 | 0.800 | 573.3 | NaN | 47.77 | NaN | ... | 126.914691 | True | True | NaN | NaN | NaN | 987448 | NaN | 57 | 4 |
2038344 | ha-987448 | Apartment | Entire home/apt | 1.0 | 2019-11-01 | 0.679 | 931.2 | NaN | 49.01 | NaN | ... | 126.914691 | True | True | NaN | NaN | NaN | 987448 | NaN | 58 | 4 |
2038345 | ha-987448 | Apartment | Entire home/apt | 1.0 | 2019-12-01 | 0.321 | 471.0 | NaN | 52.33 | NaN | ... | 126.914691 | True | True | NaN | NaN | NaN | 987448 | NaN | 59 | 4 |
530742 rows × 31 columns
reserve_outliers = air[(air['Number of Reservations'] > 25)]['Property ID'].unique() # 25 넘긴 달 있다면 모두 체크
extair_origin[np.isin(extair_origin['Property ID'], reserve_outliers)][['Listing Title', 'Property Type', 'Bedrooms', 'Listing URL']].head(15) # 열리는 게 없네..
Listing Title | Property Type | Bedrooms | Listing URL | |
---|---|---|---|---|
3375 | 나만의 조용한휴식처(영 하우스) 친구,가족에좋다.2인/4인실. | Residential home | 1.0 | https://www.airbnb.com/rooms/4305228 |
16283 | Gaesuzak Guesthouse | Home | 5.0 | https://www.airbnb.com/rooms/9752842 |
18055 | Seoul Namsan Tower View Luxury 2rooms with NET... | Apartment | 2.0 | https://www.airbnb.com/rooms/11445411 |
18677 | Unique & Antic PartyHouse in Hongdae area-4INN... | House | 2.0 | https://www.airbnb.com/rooms/12057136 |
20176 | Another day in Hongdae,Seoul | Condominium | 2.0 | https://www.airbnb.com/rooms/13175035 |
24620 | Roomy 2R1B APT@ near line2 Konkuk Univ station | Rental unit | 2.0 | https://www.airbnb.com/rooms/15640283 |
26123 | MangoMangoChaChaCha_01 | House | 1.0 | https://www.airbnb.com/rooms/16275763 |
28388 | boutique hotel!낡은여관에 현대적 감각을 덧입혀 탄생시킨 부띠끄 호텔(NEW) | Room in boutique hotel | 1.0 | https://www.airbnb.com/rooms/17480823 |
28750 | 콜마이홈 이태원동 | House | 1.0 | https://www.airbnb.com/rooms/17689612 |
29267 | Whiskey Whiskey Whiskey 03 | Entire house | 2.0 | https://www.airbnb.com/rooms/17958182 |
29397 | 콜마이홈 한남동 | House | 1.0 | https://www.airbnb.com/rooms/18034123 |
30665 | 2-room Cozy studio only for you guys @Sinsa | Apartment | 2.0 | https://www.airbnb.com/rooms/18724287 |
31914 | Boan Stay Room 31 | Hostel | 1.0 | https://www.airbnb.com/rooms/19455588 |
32200 | Joinhouse# sale# Fashion street center location | Apartment | 1.0 | https://www.airbnb.com/rooms/19619402 |
33184 | [12.C] Garosugil | Apartment | 0.0 | https://www.airbnb.com/rooms/20225045 |
# air_origin[np.isin(air_origin['Property ID'], reserve_outliers)][['Property ID', 'Number of Reservations', 'Revenue (USD)']]
air_origin[air_origin['Property ID'] == 'ab-35321379'][['Reporting Month', 'Reservation Days', 'Number of Reservations', 'Revenue (USD)', 'ADR (USD)']] # 반짝하고 사라졌네?
# 오히려 1) 여러 달에 걸쳐서 사업, 2) reservation > 0, ... 이런 애들을 추려야 할듯
Reporting Month | Reservation Days | Number of Reservations | Revenue (USD) | ADR (USD) | |
---|---|---|---|---|---|
1955128 | 2019-06-01 | 0.0 | 0.0 | 0.00 | NaN |
1955129 | 2019-07-01 | 1.0 | 1.0 | 152.00 | 152.00 |
1955130 | 2019-08-01 | 31.0 | 18.0 | 4874.00 | 157.23 |
1955131 | 2019-09-01 | 29.0 | 23.0 | 4619.00 | 159.28 |
1955132 | 2019-10-01 | 31.0 | 10.0 | 5076.00 | 163.74 |
1955133 | 2019-11-01 | 29.0 | 26.0 | 5592.50 | 192.84 |
1955134 | 2019-12-01 | 31.0 | 15.0 | 5826.50 | 187.95 |
1955135 | 2020-01-01 | 31.0 | 17.0 | 5528.00 | 178.32 |
1955136 | 2020-02-01 | 24.0 | 19.0 | 4277.60 | 178.23 |
1955137 | 2020-03-01 | 25.0 | 10.0 | 3974.00 | 158.96 |
1955138 | 2020-04-01 | 16.0 | 9.0 | 2624.00 | 164.00 |
1955139 | 2020-05-01 | 31.0 | 27.0 | 5049.00 | 162.87 |
1955140 | 2020-06-01 | 23.0 | 22.0 | 4115.00 | 178.91 |
1955141 | 2020-07-01 | 29.0 | 22.0 | 4594.00 | 158.41 |
1955142 | 2020-08-01 | 30.0 | 26.0 | 5720.00 | 190.67 |
1955143 | 2020-09-01 | 21.0 | 17.0 | 3948.00 | 188.00 |
1955144 | 2020-10-01 | 23.0 | 20.0 | 4568.00 | 198.61 |
1955145 | 2020-11-01 | 25.0 | 24.0 | 5099.00 | 203.96 |
1955146 | 2020-12-01 | 23.0 | 17.0 | 8300.00 | 360.87 |
1955147 | 2021-01-01 | 23.0 | 19.0 | 5388.00 | 234.26 |
1955148 | 2021-02-01 | 25.0 | 23.0 | 5312.00 | 212.48 |
1955149 | 2021-03-01 | 23.0 | 16.0 | 5625.40 | 244.58 |
1955150 | 2021-04-01 | 26.0 | 7.0 | 6434.53 | 247.48 |
1955151 | 2021-05-01 | 28.0 | 20.0 | 6851.60 | 244.70 |
1955152 | 2021-06-01 | 20.0 | 13.0 | 4853.08 | 242.65 |
1955153 | 2021-07-01 | 12.0 | 3.0 | 2340.23 | 195.02 |
1955154 | 2021-08-01 | 2.0 | 1.0 | 525.00 | 262.50 |
1955155 | 2021-09-01 | 2.0 | 1.0 | 525.00 | 262.50 |
1955156 | 2021-10-01 | 1.0 | 1.0 | 395.00 | 395.00 |
1955157 | 2021-11-01 | 0.0 | 0.0 | 0.00 | NaN |
1955158 | 2021-12-01 | 0.0 | 0.0 | 0.00 | NaN |
1955159 | 2022-01-01 | 0.0 | 0.0 | 0.00 | NaN |
1955160 | 2022-02-01 | 0.0 | 0.0 | 0.00 | NaN |
1955161 | 2022-03-01 | 0.0 | 0.0 | 0.00 | NaN |
1955162 | 2022-04-01 | 0.0 | 0.0 | 0.00 | NaN |
1955163 | 2022-05-01 | 0.0 | 0.0 | 0.00 | NaN |
1955164 | 2022-06-01 | 0.0 | 0.0 | 0.00 | NaN |
# 2. select reasonable observations
air['Property ID'].value_counts() > 5 # 한 달 반짝은 빼야되겠지?
ab-4740031 True ab-971508 True ab-960532 True ab-960581 True ab-1571427 True ... ab-40978458 False ab-40978094 False ab-40978056 False ab-40978048 False ab-39870870 False Name: Property ID, Length: 58745, dtype: bool
air['counts'] = air.groupby('Property ID')['Property ID'].transform('count') # 중복횟수 체크
temp3 = air[(air['counts'] > 11) & # 1) 12개월 이상 reporting이면서,
(air['Number of Reservations'] > 0)].copy() # 2) reservation 1회 이상인 observation만 남김
air[['counts', 'Revenue (USD)', 'Reservation Days', 'Number of Reservations']].describe().round(2) # 오리지널 5년치
counts | Revenue (USD) | Reservation Days | Number of Reservations | |
---|---|---|---|---|
count | 1147756.00 | 956890.00 | 1147756.00 | 1147756.00 |
mean | 28.77 | 611.38 | 7.13 | 2.19 |
std | 14.96 | 2875.04 | 9.88 | 3.31 |
min | 1.00 | 0.00 | 0.00 | 0.00 |
25% | 16.00 | 0.00 | 0.00 | 0.00 |
50% | 26.00 | 132.00 | 0.00 | 0.00 |
75% | 39.00 | 925.50 | 14.00 | 4.00 |
max | 60.00 | 1896514.00 | 31.00 | 31.00 |
temp3[['counts', 'Revenue (USD)', 'Reservation Days', 'Number of Reservations']].describe().round(2) # 1), 2) 추린 데이터
counts | Revenue (USD) | Reservation Days | Number of Reservations | |
---|---|---|---|---|
count | 482137.00 | 482137.00 | 482137.00 | 482137.00 |
mean | 32.70 | 1097.60 | 15.25 | 4.66 |
std | 13.77 | 3908.74 | 9.21 | 3.33 |
min | 12.00 | 0.00 | 0.00 | 1.00 |
25% | 21.00 | 330.00 | 7.00 | 2.00 |
50% | 31.00 | 807.73 | 15.00 | 4.00 |
75% | 42.00 | 1514.11 | 23.00 | 7.00 |
max | 60.00 | 1896514.00 | 31.00 | 31.00 |
revenue_low, revenue_high = np.percentile(temp3['Revenue (USD)'], [5, 95])
temp4 = temp3[(revenue_low < temp3['Revenue (USD)']) & (temp3['Revenue (USD)'] < revenue_high)].copy()
temp4[['counts', 'Revenue (USD)', 'Reservation Days', 'Number of Reservations']].describe().round(2) # 3) revenue 상하위 5%씩 쳐냄
counts | Revenue (USD) | Reservation Days | Number of Reservations | |
---|---|---|---|---|
count | 433547.00 | 433547.00 | 433547.00 | 433547.00 |
mean | 32.65 | 954.40 | 15.59 | 4.72 |
std | 13.74 | 681.53 | 8.86 | 3.22 |
min | 12.00 | 69.05 | 1.00 | 1.00 |
25% | 21.00 | 370.00 | 8.00 | 2.00 |
50% | 31.00 | 808.59 | 15.00 | 4.00 |
75% | 42.00 | 1431.00 | 23.00 | 7.00 |
max | 60.00 | 2859.80 | 31.00 | 30.00 |
# 3. visualize
air_sample = temp4.sample(frac = 0.05) # sample 뽑아서 해보자
temp5 = gpd.GeoDataFrame(air_sample, geometry = gpd.points_from_xy(x = air_sample.Longitude, y = air_sample.Latitude))
Layer(temp5, color_category_style('timestamp_interval')) # 연차마다 다른 건 없는데?... 관광지/교통 위주로 더 생길 뿐
temp5.crs = 'EPSG:4326' # GPS
temp5 = temp5.to_crs('EPSG:5181') # 중부원점 좌표계
gdf = temp5.copy()
gdf['x'] = temp5.geometry.centroid.x / 1000 # 1km
gdf['y'] = temp5.geometry.centroid.y / 1000
gdf['counts'] = temp5['counts'] / 10
gdf['Number of Reservations'] = temp5['Number of Reservations'] * 10
dbscan = DBSCAN(eps = 1.0, min_samples = 50)
clusters = dbscan.fit_predict(gdf[['x', 'y']].values) # 단위를 다 맞춰줘야겠는데. 더불어 time은 별 의미가 안 보임.
# 그걸 찾아야 되겠구나...
gdf['cluster_label'] = clusters.astype(str)
Layer(gdf, color_category_style('cluster_label')) # 여기까진 다 아는 내용인데. 새로운 feature 발굴하는 방향으로 가야?
# 공항철도, 용도지구, 관광특구, ... 그런 애들이랑 붙여봐야 할듯 -> 자료를 구해보자
# LTE CELL은 어떻게 구분이 돼있을까. 그 위에 던져봐야겠는데.
# 시간 순서에 따라 변하는... 그런 toggle은 없나? 검색이 쉽지 않네.
import pandas as pd
import folium
from folium.plugins import TimestampedGeoJson
# Create a list of features for each time step
features = []
for timestamp, group in gdf.groupby('Reporting Month'):
for _, item in group.iterrows():
feature = {
'type': 'Feature',
'geometry': {
'type': 'Point',
'coordinates': [item.Longitude, item.Latitude]
},
'properties': {
'time': timestamp,
'icon': 'circle',
'iconstyle': {
'color': 'red',
'radius': 1,
'fill_color': 'red'
}
#'popup': f"{timestamp}: {group['value'].values[0]}"
}
}
features.append(feature)
# Create a GeoJSON dictionary with the list of features
geojson = {
'type': 'FeatureCollection',
'features': features
}
# Create a Folium map centered on the median of the data's coordinates
map_center = [gdf['Latitude'].median(), gdf['Longitude'].median()]
m = folium.Map(location=map_center, zoom_start=12)
# Add the TimestampedGeoJson plugin to the map
TimestampedGeoJson(
geojson,
period='P1M', # 이 자체가 monthly구나.
duration='P1M',
auto_play=False,
loop=True,
max_speed=1,
add_last_point=True,
date_options='YYYY-MM',
time_slider_drag_update=True,
).add_to(m)
# Display the map
m