Python Tutorial

Showing posts with label Beautiful Soup. Show all posts
Showing posts with label Beautiful Soup. Show all posts

Sunday, March 24, 2019

Beautiful Soup IMDB movie page parser

Scrape all the information about a movie from its IMDb page

Source code available on github
from bs4 import BeautifulSoup
import requests


class IMDB(object):
    """Scrape the details of a movie from the HTML of its IMDb title page.

    Pass the raw page HTML to the constructor, then call ``get_info()``.
    """

    def __init__(self, html=None):
        # Without HTML there is nothing to parse; keep soup as None so the
        # unusable state is detectable instead of parsing an empty document.
        self.soup = BeautifulSoup(html, "html.parser") if html else None

    def get_info(self):
        """Return a dict with the movie's name, year, rate, review_count
        and category (list of genres)."""
        return {
            "name": self._get_name(),
            "year": self._get_year(),
            "rate": self._get_rating(),
            "review_count": self._get_review_count(),
            "category": self._get_category(),
        }

    def _get_name(self):
        """Movie title from the title-bar header."""
        d = self.soup.select(".title_bar_wrapper .titleBar .title_wrapper h1")[0]
        return self._get_data_prity(d)

    def _get_year(self):
        """Release year, e.g. '1943'."""
        # The release date is the last <a> in the subtext line.
        d = self.soup.select(".title_bar_wrapper .titleBar .subtext a")
        release_date = self._get_data_prity(d[-1])
        # assumes the date string looks like 'DD Month YYYY (Country)' —
        # the third whitespace-separated token is the year. TODO confirm.
        return release_date.split()[2]

    def _get_rating(self):
        """Aggregate user rating, e.g. '8.5'."""
        d = self.soup.select(".title_bar_wrapper .ratingValue span")[0]
        return self._get_data_prity(d)

    def _get_review_count(self):
        """Number of user ratings, e.g. '469,931'."""
        d = self.soup.select(".title_bar_wrapper .imdbRating .small")[0]
        return self._get_data_prity(d)

    def _get_category(self):
        """Genres: every subtext link except the last one (the release date)."""
        links = self.soup.select(".title_bar_wrapper .subtext a")
        return [self._get_data_prity(a) for a in links[:-1]]

    def _get_data_prity(self, d):
        # NOTE: name kept for compatibility ('prity' is a historical typo).
        # Take only the tag's own direct text node, not its children's text.
        r = d.find(text=True, recursive=False)
        return str(r.strip())


def get_url_content(url, timeout=30):
    """Fetch *url* and return the response body as raw bytes.

    Returning ``r.content`` (instead of re-encoding ``r.text`` by hand)
    lets BeautifulSoup detect the document's encoding itself.

    :param url: page to fetch
    :param timeout: seconds to wait before giving up (prevents a dead
        host from hanging the script forever)
    :raises requests.HTTPError: on a non-2xx response, instead of
        silently handing an error page to the parser
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.content


if __name__ == '__main__':
    # Demo: scrape the Casablanca title page and dump the parsed fields.
    movie_url = "https://www.imdb.com/title/tt0034583/"
    page_html = get_url_content(movie_url)
    movie = IMDB(page_html)
    print(movie.get_info())


Output:

{'category': ['Drama', 'Romance', 'War'], 'rate': '8.5', 'review_count': '469,931', 'name': 'Casablanca', 'year': '1943'}

Saturday, November 3, 2012

Beautiful Soup CSS selector


Beautiful Soup supports a subset of the CSS selector standard. Just construct the selector as a string and pass it into the .select() method of a Tag or the BeautifulSoup object itself.
I used this html file for practice. All source code available on github

 
from pprint import pprint
from bs4 import BeautifulSoup

# Sample document:
# http://dl.dropbox.com/u/49962071/blog/python/resource/bs_sample3.html
with open('bs_sample3.html') as html_content:  # close the file when parsing is done
    # Explicit parser avoids bs4's "no parser specified" warning.
    soup = BeautifulSoup(html_content, "html.parser")  # making soup

pprint(soup.select("title")) # get title tag
pprint(soup.select("body a")) # all a tag inside body
pprint(soup.select("html head title")) # html->head->title
pprint(soup.select("head > title")) # head->title
pprint(soup.select("p > a")) # all a tag that inside p
pprint(soup.select("body > a")) # all a tag directly under body
pprint(soup.select(".sister")) # select by class
pprint(soup.select("#link1")) # select by id
pprint(soup.select('a[href="http://example.com/elsie"]'))
# find tags by exact attribute value
pprint(soup.select('a[href^="http://example.com/"]'))
# find tags whose attribute value starts with 'http://example.com/'
pprint(soup.select('p[lang|=en]')) # Match language codes

Friday, November 2, 2012

Beautiful Soup find_all() search API


find_all() is the most popular method in the Beautiful Soup search API. It reduces your code size massively. We can pass regular expressions and custom functions into it. I used this html file for practice.
All source code available on github

 
from pprint import pprint
import re
from bs4 import BeautifulSoup

# Sample document:
# http://dl.dropbox.com/u/49962071/blog/python/resource/bs_sample.html
with open('bs_sample.html') as html_content:  # close the file when parsing is done
    soup = BeautifulSoup(html_content, "html.parser")  # making soup

for tag in soup.find_all(re.compile("^p")):  # every tag whose name starts with 'p'
    print(tag.name)

for tag in soup.find_all(re.compile("t")):  # every tag whose name contains 't'
    print(tag.name)

for tag in soup.find_all(True):  # every tag in the document
    print(tag.name)

pprint(soup.find_all('a'))  # find all a tag
print(20*"++")
pprint(soup.find_all(["a", "b"]))  # find multiple tag


def has_class_but_no_id(tag):
    """True for tags that carry a class attribute but no id."""
    # has_attr() is the bs4 API; the old has_key() is gone on Python 3.
    return tag.has_attr('class') and not tag.has_attr('id')

pprint(soup.find_all(has_class_but_no_id))
# pass a function to find_all

pprint(soup.find_all(text=re.compile("sisters")))
# find all tag content contains key 'sisters'
print(20*"++")
pprint(soup.find_all(href=re.compile("my_url")))  # all links contains key "my_url"
pprint(soup.find_all(id=True))  # all tags that have an id
pprint(soup.find_all(class_=True))  # all tags that have a class


def has_seven_characters(css_class):
    """True for class names exactly seven characters long."""
    return css_class is not None and len(css_class) == 7

pprint(soup.find_all(class_=has_seven_characters))
# find all class name contains 7 characters

pprint(soup.find_all("a", "sister"))  # find all a tag have class named 'sister'
pprint(soup.find_all("a", re.compile("sister")))
# find all a tag have class named contains 'sister'
print(20*"++")

pprint(soup.find_all(href=re.compile("elsie"), id='link1'))
# url name contains elsie and have id = link1
pprint(soup.find_all(attrs={'href': re.compile("elsie"), 'id': 'link1'}))
# same query expressed through an attrs dict

pprint(soup.find_all("a", limit=2))  # use limit on find_all

pprint(soup.html.find_all("title", recursive=True))  # use recursive on find_all


Beautiful Soup 4 exploring


Quick explore to Beautiful Soup 4. I used this document for practicing.
All source codes of this blog available on github.

 
from pprint import pprint
from bs4 import BeautifulSoup

# Sample document:
# http://dl.dropbox.com/u/49962071/blog/python/resource/bs_sample.html
with open('bs_sample.html') as html_content:  # close the file when parsing is done
    soup = BeautifulSoup(html_content, "html.parser")  # making soup

print(soup.prettify())  # prettify html_content, even closing unclosed tags

print(soup.title)  # page title tag
print(soup.title.name)  # page title name
print(soup.title.parent.name)  # page title parent
print(soup.p)  # first p tag
print(soup.p.string)  # string content of first p tag
print(soup.p['class'])  # first p tag class name
print(soup.a)  # first a tag
pprint(soup.find_all('a'))  # all a tag
pprint(soup.find_all('p'))  # all p tag
print(soup.find(id='link3'))  # find tag with id = link3
print('All links:')
for link in soup.find_all('a'):
    print(link.get('href'))  # get url

print(soup.get_text())  # return text part of html_document


Saturday, January 28, 2012

Python Beautiful Soup Url extract from web page

Python Beautiful Soup Url extract from web page


# Modernised from Python 2 / BeautifulSoup 3 to Python 3 / bs4.
from bs4 import BeautifulSoup, SoupStrainer
import urllib.request
import urllib.error


def get_url_content(site_url):
    """Return the body of *site_url*.

    On an HTTP error, return the error response body instead of raising,
    matching the original best-effort behaviour.
    """
    try:
        with urllib.request.urlopen(site_url) as f:  # close the connection when done
            return f.read()
    except urllib.error.HTTPError as error:
        return str(error.read())


response = get_url_content('http://www.sust.edu/')

# SoupStrainer('a') tells the parser to keep only <a> tags
# (bs4 spells the old parseOnlyThese argument as parse_only).
for link in BeautifulSoup(response, "html.parser",
                          parse_only=SoupStrainer('a')).find_all('a'):
    if link.has_attr('href'):
        print(link['href'])




Output:

All urls under this link

Beautiful Soup Python : Install

Beautiful Soup is an HTML/XML parser for Python that can turn even invalid markup into a parse tree. It provides simple, idiomatic ways of navigating, searching, and modifying the parse tree. It commonly saves programmers hours or days of work.

pip command


pip install beautifulsoup4

Install Steps:


- Download library from here
- Then extract the file.
- cd to this file directory from command prompt.
- run command python setup.py install