Python Tutorial

Sunday, March 24, 2019

Beautiful Soup IMDB movie page parser

Scrap all information of a movie from imdb page

Source code available on github
from bs4 import BeautifulSoup
import requests


class IMDB(object):
    """Get all information of a imdb movie, pass html content of imdb page"""

    def __init__(self, html=None):
        if not html:
            self.soup = None
        else:
            self.soup = BeautifulSoup(html, "html.parser")

    def _get_year(self):

        d = self.soup.select(".title_bar_wrapper .titleBar .subtext a")
        release_date = self._get_data_prity(d[len(d) - 1])

        return release_date.split()[2]

    def _get_rating(self):
        d = self.soup.select(".title_bar_wrapper .ratingValue span")[0]

        return self._get_data_prity(d)

    def _get_name(self):
        d = self.soup.select(".title_bar_wrapper .titleBar .title_wrapper h1")[0]

        return self._get_data_prity(d)

    def get_info(self):
        m = dict()
        m["name"] = self._get_name()
        m["year"] = self._get_year()
        m["rate"] = self._get_rating()
        m["review_count"] = self._get_review_count()
        m["category"] = self._get_category()

        return m

    def _get_category(self):
        w = []
        a = self.soup.select(".title_bar_wrapper .subtext a")
        for i in range(0, len(a) - 1):
            n = a[i]
            w.append(self._get_data_prity(n))
        return w

    def _get_review_count(self):
        d = self.soup.select(".title_bar_wrapper .imdbRating .small")[0]
        return self._get_data_prity(d)

    def _get_data_prity(self, d):
        r = d.find(text=True, recursive=False)
        return str(r.strip())


def get_url_content(url):
    r = requests.get(url)
    content = r.text.encode('utf-8', 'ignore')
    return content


if __name__ == '__main__':
    url = "https://www.imdb.com/title/tt0034583/"
    imdbHtml = get_url_content(url)
    imdb = IMDB(imdbHtml)
    print (imdb.get_info())


Output:

{'category': ['Drama', 'Romance', 'War'], 'rate': '8.5', 'review_count': '469,931', 'name': 'Casablanca', 'year': '1943'}