
Web Scraping rumah.com

Published by Gaudhiwaa Hendrasto

Web scraping is performed on the website rumah.com to retrieve data such as the title, location, rooms, toilets, area, and price of houses.

| Title | Location | Bedroom | Toilet | Area | Price |
|---|---|---|---|---|---|
| The Awani Residence 3 | 55 Jalan Cangkorah, Gunung Batu, Bandung, Jawa Barat | 2 | 1 | 40 m² | 577,7 jt - Rp 1,001564 M |
| G-Land Padalarang Valley | Komp. Kopo Plaza, Jl. Peta, Suka Asih, Bojongloa Kaler, Bandung, Jawa Barat | 2 | 1 | 30 m² | 350 jt - Rp 500 jt |
| Valle Verde | 33 Jl. Pasirhalang, Andir, Bandung, Jawa Barat | 2 | 2 | 98 m² | 1,8461075 M |
| Rumah Lux Furnished Pasir Luyu Sayap BKR | Pasir Luyu, Sayap BKR, Regol, Bandung, Jawa Barat | 4 | 3 | 550 m² | 7,5 M |
| Rumah Baru Minimalis 2 Lantai di Punawangi Kota Baru Parahyangan | Tarubhawana, Kota Baru Parahyangan, Bandung, Jawa Barat | 3 | 2 | 95 m² | 2,0392476 M |
| Dijual Rumah Baru dekat Tol Gedebage Bandung Timur | Derwati, Bandung Timur, Bandung, Jawa Barat | 2 | 1 | 41 m² | 645,465 jt |
| Rumah luas Setra Duta dgn fasilitas kolam renang | Jl. Setra Duta, Setra Duta, Bandung, Jawa Barat | 7 | 6 | 581 m² | 15 M |
| Rumah Singgasana Pradana Selantai Terawat Posisi Bagus | Singgasana, Singgasana Pradana, Bandung, Jawa Barat | 4 | 3 | 300 m² | 5,2 M |
| Jual Rumah Baru sayap Setraduta Bandung | Setra Duta, Bandung, Jawa Barat | 3 | 2 | 99 m² | 1,527 M |
| Almaas 3 | Jl Ketapang andir, Bandung Selatan, Bandung, Jawa Barat | 2 | 1 | 60 m² | 453 |
| Cherry Field | Jl. Ciganitri, Buah Batu, Bandung, Buahbatu, Bandung, Jawa Barat | 4 | 3 | 140 m² | 1,75 M |
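
The Area and Price columns come back as raw strings (e.g. "140 m²", "1,75 M", "645,465 jt"), so some cleaning is needed before any numeric analysis. Below is a minimal sketch of one way to convert them, not part of the original script: the helper names are my own, and it assumes "jt" means juta (millions of rupiah), "M" means miliar (billions of rupiah), and the comma is a decimal separator; for a price range it keeps only the first value.

import re

def parse_area(text):
    # "140 m²" -> 140.0; returns None when the field is empty
    match = re.search(r'([\d.,]+)\s*m', text or "")
    if not match:
        return None
    return float(match.group(1).replace('.', '').replace(',', '.'))

def parse_price(text):
    # "1,75 M" -> 1750000000.0, "645,465 jt" -> 645465000.0
    # assumes "jt" = juta (million rupiah) and "M" = miliar (billion rupiah)
    match = re.search(r'([\d.,]+)\s*(jt|M)', text or "")
    if not match:
        return None
    value = float(match.group(1).replace('.', '').replace(',', '.'))
    return value * (1_000_000 if match.group(2) == 'jt' else 1_000_000_000)

print(parse_area("140 m²"))        # 140.0
print(parse_price("1,75 M"))       # 1750000000.0
print(parse_price("645,465 jt"))   # 645465000.0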

The code used is as follows:

from bs4 import BeautifulSoup
import pandas as pd
import time
from undetected_chromedriver import Chrome # bypass cloudflare
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from plyer import notification

# house data
data = []

# URL -> www.rumah.com
def openBrowser(page):
    url = f'https://www.rumah.com/properti-dijual/{page+1}?district_code=IDJB01&freetext=Jawa+Barat%2C+Bandung&property_type=B&property_type_code%5B0%5D=BUNG&region_code=IDJB&search=true'
    driver = Chrome()
    driver.get(url)
    # wait until the listing layout is present before parsing the page source
    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "layout-web")))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    contents = soup.find_all('div', class_="listing-description")
    driver.quit()
    return contents

# get data that we want
def getData(contents):
    for item in contents:
        nama_rumah = item.find('a', class_='nav-link').text
        lokasi_rumah = item.find('p', class_='listing-location').text
        harga_rumah = item.find('span', class_='price').text
        luas_rumah = item.find('li', class_='listing-floorarea')
        jmlh_kamar_mandi = item.find('span', class_='bed')
        jmlh_toilet = item.find('span', class_='bath')
        tipe_properti = item.find('ul', class_='listing-property-type')
        jmlh_km = ""
        jmlh_t = ""
        luas_r = ""
        # these variables are NoneType
        if(jmlh_kamar_mandi): jmlh_km = jmlh_kamar_mandi.text
        if(jmlh_toilet): jmlh_t = jmlh_toilet.text
        if(luas_rumah): luas_r = luas_rumah.text
        if(tipe_properti): tipe_p = tipe_properti.text
        data.append((nama_rumah, lokasi_rumah, jmlh_km, jmlh_t, luas_r, harga_rumah))


# range of result pages to scrape (last_page is exclusive); adjust as needed
first_page = 117
last_page = 132

for page in range(first_page,last_page):
    try:
        contents = openBrowser(page)
        getData(contents)
        print("Success at page: " + str(page))
        if(page == last_page-1):
            # desktop notification once the final page has been scraped
            notification.notify(
                title = "Success",
                message = "Scraping finished at page " + str(page),
                app_icon = None,
                timeout = 20,
            )
    except Exception:
        print("Failed at page: " + str(page))
        notification.notify(
            title = "Failed",
            message = "Scraping failed at page " + str(page),
            app_icon = None,
            timeout = 20,
        )
        break

df = pd.DataFrame(data, columns=['Judul', 'Lokasi', 'Kamar', 'Toilet', 'Luas', 'Harga'])
FOLDER_PATH = "rumah_bandung.csv"  # output CSV path (placeholder; adjust as needed)
df.to_csv(FOLDER_PATH, index=False)
print(df)


The code is a web scraping script written in Python using BeautifulSoup, Selenium, pandas, undetected_chromedriver, and plyer. The undetected_chromedriver library is used to avoid bot detection, which could otherwise lead to IP blocking. The script retrieves property listings from www.rumah.com for houses for sale in Bandung, West Java: it iterates through a range of result pages, collecting the title, location, number of bedrooms, number of bathrooms, floor area, and price of each listing. The starting and ending page numbers (first_page and last_page) can be adjusted as needed. The collected data is then stored in a pandas DataFrame and saved to a CSV file, and a desktop notification (via plyer) reports whether the run succeeded or failed.
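
Once the CSV has been written, it can be reloaded with pandas for a quick sanity check before any further analysis. This is a minimal sketch; the file name below is the placeholder used for FOLDER_PATH above and should be changed to wherever the file was actually saved.

import pandas as pd

# reload the scraped dataset (path assumed; adjust to your own FOLDER_PATH)
df = pd.read_csv("rumah_bandung.csv")

print(df.shape)        # number of listings and columns
print(df.head())       # first few rows
print(df.isna().sum()) # missing values per column (e.g. listings without bedroom/toilet/area)
print(df['Lokasi'].str.contains('Bandung').mean())  # share of listings whose location mentions Bandung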

An example of the data obtained can be viewed on GitHub.

Written by Gaudhiwaa Hendrasto