Data Preparation¶
%matplotlib inline
import pandas
from sqlalchemy import create_engine
Download the data¶
url = ("http://data.insideairbnb.com/france/ile-de-france/paris"
"/2019-09-16/data/listings.csv.gz")
%time db = pandas.read_csv(url)
<string>:2: DtypeWarning: Columns (43,61,62) have mixed types. Specify dtype option on import or set low_memory=False.
CPU times: user 9.02 s, sys: 4.61 s, total: 13.6 s
Wall time: 2min 22s
tokeep = [
"id",
"neighbourhood_cleansed",
"property_type",
"room_type",
"accommodates",
"bathrooms",
"bedrooms",
"beds",
"bed_type"
]
review = [i for i in db.columns if "review_scores" in i]
Build numeric price¶
cleaner = lambda p: float(p.replace(",", "").strip("$"))
db["Price"] = db["price"].apply(cleaner)
Keep relevant variables¶
final = db[tokeep + review + ["Price"]].dropna()
Write out¶
As
.csv
! rm -f /paris_abb.csv.zip
final[tokeep + ["Price"]].to_csv("paris_abb.csv.zip",
index=False)
As
.sqlite
! rm -f /paris_abb_mini.db
engine = create_engine("sqlite:///paris_abb_mini.db")
final[tokeep + ["Price"]].to_sql("db", engine, index=False)
Index on neighbourhood_cleansed
qry = """
CREATE INDEX idx_neighbourhood_cleansed
ON db (neighbourhood_cleansed);
"""
conn = engine.connect()
conn.execute(qry)
<sqlalchemy.engine.result.ResultProxy at 0x7fb1c2f75fd0>
out = conn.execute("select * from SQLite_master")
for table in out.fetchall():
print(table[4])
CREATE TABLE db (
id BIGINT,
neighbourhood_cleansed TEXT,
property_type TEXT,
room_type TEXT,
accommodates BIGINT,
bathrooms FLOAT,
bedrooms FLOAT,
beds FLOAT,
bed_type TEXT,
"Price" FLOAT
)
CREATE INDEX idx_neighbourhood_cleansed
ON db (neighbourhood_cleansed)
conn.close()
engine.dispose()
Full table dump on sqlite (optional, final file was eventually removed from the pack as 270MB)
! rm -f paris_abb.db
engine = create_engine("sqlite:///paris_abb.db")
db.rename(columns={"Price": "price_val"})\
.to_sql("db", engine, index=False)
Index on neighbourhood_cleansed
qry = """
CREATE INDEX idx_neighbourhood_cleansed
ON db (neighbourhood_cleansed);
"""
conn = engine.connect()
conn.execute(qry)
<sqlalchemy.engine.result.ResultProxy at 0x7fb1aa334860>
out = conn.execute("select * from SQLite_master")
for table in out.fetchall():
print(table[4])
CREATE TABLE db (
id BIGINT,
listing_url TEXT,
scrape_id BIGINT,
last_scraped TEXT,
name TEXT,
summary TEXT,
space TEXT,
description TEXT,
experiences_offered TEXT,
neighborhood_overview TEXT,
notes TEXT,
transit TEXT,
access TEXT,
interaction TEXT,
house_rules TEXT,
thumbnail_url FLOAT,
medium_url FLOAT,
picture_url TEXT,
xl_picture_url FLOAT,
host_id BIGINT,
host_url TEXT,
host_name TEXT,
host_since TEXT,
host_location TEXT,
host_about TEXT,
host_response_time TEXT,
host_response_rate TEXT,
host_acceptance_rate FLOAT,
host_is_superhost TEXT,
host_thumbnail_url TEXT,
host_picture_url TEXT,
host_neighbourhood TEXT,
host_listings_count FLOAT,
host_total_listings_count FLOAT,
host_verifications TEXT,
host_has_profile_pic TEXT,
host_identity_verified TEXT,
street TEXT,
neighbourhood TEXT,
neighbourhood_cleansed TEXT,
neighbourhood_group_cleansed FLOAT,
city TEXT,
state TEXT,
zipcode TEXT,
market TEXT,
smart_location TEXT,
country_code TEXT,
country TEXT,
latitude FLOAT,
longitude FLOAT,
is_location_exact TEXT,
property_type TEXT,
room_type TEXT,
accommodates BIGINT,
bathrooms FLOAT,
bedrooms FLOAT,
beds FLOAT,
bed_type TEXT,
amenities TEXT,
square_feet FLOAT,
price TEXT,
weekly_price TEXT,
monthly_price TEXT,
security_deposit TEXT,
cleaning_fee TEXT,
guests_included BIGINT,
extra_people TEXT,
minimum_nights BIGINT,
maximum_nights BIGINT,
minimum_minimum_nights BIGINT,
maximum_minimum_nights BIGINT,
minimum_maximum_nights BIGINT,
maximum_maximum_nights BIGINT,
minimum_nights_avg_ntm FLOAT,
maximum_nights_avg_ntm FLOAT,
calendar_updated TEXT,
has_availability TEXT,
availability_30 BIGINT,
availability_60 BIGINT,
availability_90 BIGINT,
availability_365 BIGINT,
calendar_last_scraped TEXT,
number_of_reviews BIGINT,
number_of_reviews_ltm BIGINT,
first_review TEXT,
last_review TEXT,
review_scores_rating FLOAT,
review_scores_accuracy FLOAT,
review_scores_cleanliness FLOAT,
review_scores_checkin FLOAT,
review_scores_communication FLOAT,
review_scores_location FLOAT,
review_scores_value FLOAT,
requires_license TEXT,
license TEXT,
jurisdiction_names TEXT,
instant_bookable TEXT,
is_business_travel_ready TEXT,
cancellation_policy TEXT,
require_guest_profile_picture TEXT,
require_guest_phone_verification TEXT,
calculated_host_listings_count BIGINT,
calculated_host_listings_count_entire_homes BIGINT,
calculated_host_listings_count_private_rooms BIGINT,
calculated_host_listings_count_shared_rooms BIGINT,
reviews_per_month FLOAT,
price_val FLOAT
)
CREATE INDEX idx_neighbourhood_cleansed
ON db (neighbourhood_cleansed)
conn.close()
engine.dispose()
Review scores
! rm -f paris_abb_review.csv.zip
final[["id"] + review].to_csv("paris_abb_review.csv.zip",
index=False)