Sunday, March 24, 2019

Beautiful Soup IMDB movie page parser

Scrape all the information about a movie from its IMDb page

Source code available on github

from bs4 import BeautifulSoup
import requests


class IMDB(object):
    """Extract basic facts about a movie from the HTML of its IMDb title page.

    Pass the page's HTML to the constructor and call ``get_info()``.

    Every lookup uses a fixed CSS selector under ``.title_bar_wrapper``.
    When a selector matches nothing, the lookup raises ``IndexError``; when
    no HTML was supplied, ``soup`` is ``None`` and the lookups raise
    ``AttributeError``.
    """

    def __init__(self, html=None):
        """Parse ``html`` with BeautifulSoup, or leave ``soup`` as None if it is empty."""
        if not html:
            self.soup = None
        else:
            self.soup = BeautifulSoup(html, "html.parser")

    def _get_year(self):
        """Return the release year (a string) found in the last subtext link."""
        links = self.soup.select(".title_bar_wrapper .titleBar .subtext a")
        release_date = self._get_data_prity(links[-1])
        tokens = release_date.split()

        # Identify the year by its shape (four digits) instead of assuming it
        # is always the third word: a date without a day part, such as
        # "January 1943 (USA)", would otherwise yield "(USA)".
        for token in tokens:
            candidate = token.strip("()")
            if len(candidate) == 4 and candidate.isdigit():
                return candidate

        # No year-shaped token: fall back to the original positional lookup.
        return tokens[2]

    def _get_rating(self):
        """Return the text of the first rating-value span."""
        node = self.soup.select(".title_bar_wrapper .ratingValue span")[0]
        return self._get_data_prity(node)

    def _get_name(self):
        """Return the text of the title heading."""
        node = self.soup.select(".title_bar_wrapper .titleBar .title_wrapper h1")[0]
        return self._get_data_prity(node)

    def get_info(self):
        """Return a dict with keys name, year, rate, review_count and category."""
        return {
            "name": self._get_name(),
            "year": self._get_year(),
            "rate": self._get_rating(),
            "review_count": self._get_review_count(),
            "category": self._get_category(),
        }

    def _get_category(self):
        """Return the text of every subtext link except the last one, as a list."""
        links = self.soup.select(".title_bar_wrapper .subtext a")
        # The last link is skipped; presumably it is the release date that
        # _get_year reads rather than a genre.
        return [self._get_data_prity(link) for link in links[:-1]]

    def _get_review_count(self):
        """Return the text of the first ``.small`` element inside the rating box."""
        node = self.soup.select(".title_bar_wrapper .imdbRating .small")[0]
        return self._get_data_prity(node)

    def _get_data_prity(self, d):
        """Return the stripped first direct text node of ``d``.

        Text inside child elements is ignored (``recursive=False``). An
        element with no direct text yields an empty string instead of
        failing on ``None.strip()``.
        """
        text = d.find(text=True, recursive=False)
        if text is None:
            return ""
        return str(text.strip())


def get_url_content(url, timeout=30):
    """Download ``url`` and return the response body encoded as UTF-8.

    ``timeout`` is the number of seconds passed to ``requests.get``. Without
    it the call could block indefinitely on a server that never answers.
    """
    response = requests.get(url, timeout=timeout)
    # The decoded text is re-encoded; characters that cannot be encoded are dropped.
    return response.text.encode('utf-8', 'ignore')


if __name__ == '__main__':
    # Demo: download one IMDb title page and print the extracted fields.
    page_html = get_url_content("https://www.imdb.com/title/tt0034583/")
    print(IMDB(page_html).get_info())

Output:


{'category': ['Drama', 'Romance', 'War'], 'rate': '8.5', 'review_count': '469,931', 'name': 'Casablanca', 'year': '1943'}

Friday, May 13, 2016

Selenium unittest example

import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

class JPythonSearch(unittest.TestCase):
    """Browser test: search the blog for "BFS" and expect at least one result."""

    def setUp(self):
        """Start a new Firefox session before each test."""
        self.driver = webdriver.Firefox()

    def test_search_in_python_org(self):
        """Open the blog, check its title, then submit a search for "BFS"."""
        URL = "http://jpython.blogspot.com"
        driver = self.driver
        driver.get(URL)
        self.assertIn("Life is very easy with Python", driver.title)
        # find_element(by, value) replaces find_element_by_name, which was
        # removed in Selenium 4; "name" is the locator strategy (By.NAME).
        elem = driver.find_element("name", "q")
        elem.send_keys("BFS")
        elem.send_keys(Keys.RETURN)
        # assertNotIn still runs under "python -O", unlike a bare assert.
        self.assertNotIn("No results found.", driver.page_source)

    def tearDown(self):
        """End the browser session after each test."""
        # quit() ends the whole WebDriver session; close() only closes the
        # current window.
        self.driver.quit()


if __name__ == "__main__":
    unittest.main()

Output:


.
----------------------------------------------------------------------
Ran 1 test in 25.592s

OK
[Finished in 25.7s]

Selenium example

Here is an example of browsing and searching a site programmatically.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

URL = "http://jpython.blogspot.com"
driver = webdriver.Firefox()
try:
    # implicitly_wait does not pause the script: it sets how long element
    # lookups keep retrying, so it must be configured before the lookups.
    driver.implicitly_wait(10)
    driver.get(URL)
    print(driver.title)

    # find_element(by, value) replaces find_element_by_name, which was
    # removed in Selenium 4; "name" is the locator strategy (By.NAME).
    search = driver.find_element("name", "q")  # find search field
    search.send_keys("BFS")  # set search key
    search.send_keys(Keys.RETURN)  # hit enter
finally:
    # Always end the browser session, even if a step above fails.
    driver.quit()

Output:


Life is very easy with Python

The Selenium Python bindings provide a simple API for writing functional/acceptance tests using Selenium WebDriver. Through the Selenium Python API you can access all the functionality of Selenium WebDriver in an intuitive way.

Installation

pip install selenium

If you don't have pip installed try this

Wednesday, November 11, 2015

map keyword

Apply function to every item of iterable and return a list of the results. If additional iterable arguments are passed, function must take that many arguments and is applied to the items from all iterables in parallel. If one iterable is shorter than another it is assumed to be extended with None items.

All source code available on github

intArray = [2, 3, 4, 6, 5, 7]
strArray = ['2', '3', '4', '6', '5', '7']

# On Python 3 map() returns a lazy iterator, so each result is wrapped in
# list() before printing; on Python 2 map() already returns a list and the
# extra list() is harmless. print(...) with one argument works on both.

# square all elements of array
squares = list(map(lambda x: x ** 2, intArray))
print(squares)

# convert string array to int array
ints = list(map(int, strArray))
print(ints)

a = [1, 2, 3]
b = [4, 5, 6]

# sum two array elements by index
sums = list(map(lambda x, y: x + y, a, b))
print(sums)

Output:

[4, 9, 16, 36, 25, 49]
[2, 3, 4, 6, 5, 7]
[5, 7, 9]

Monday, October 19, 2015

Dijkstra algorithm

Dijkstra's algorithm is a single-source shortest-path algorithm. For a given source node in the graph, the algorithm finds the shortest path between that node and every other node.

Here is a Python implementation of Dijkstra's algorithm:

from heapq import heappush, heappop

# Dijkstra's algorithm for nodes numbered from 0
def Dijkstra(graph, source):
    """Return the shortest path length from ``source`` to every node.

    ``graph`` maps each node index to a dict of ``{neighbour: edge_length}``.
    The result is a list indexed by node; unreachable nodes get -1.
    """
    dist = [None] * len(graph)
    frontier = [(0, source)]

    while frontier:
        cost, node = heappop(frontier)
        if dist[node] is not None:
            # Already settled with a length that was popped earlier.
            continue

        dist[node] = cost
        for neighbour, weight in graph[node].items():
            if dist[neighbour] is None:
                heappush(frontier, (cost + weight, neighbour))

    # Report nodes that were never reached as -1.
    return [d if d is not None else -1 for d in dist]

# Adjacency map: graph[u][v] is the length of the edge from u to v.
graph = {
    0: {1: 2, 2: 4, 3: 1},
    1: {2: 1, 3: 3},
    2: {4: 7},
    3: {2: 2},
    4: {0: 2, 3: 3},
    5: {},
}
source = 0

# print(...) with a single argument gives the same output on Python 2 and 3.
print(Dijkstra(graph, source))

Output:

[0, 2, 3, 1, 10, -1]

Saturday, October 17, 2015

python heapq/priority queue/Min heap

Python's heapq module provides an implementation of the min-heap queue algorithm, also known as the priority queue algorithm. Here is some example code.

from heapq import heappush, heappop

# priority queue normal
h = []
heappush(h, 5)
heappush(h, 1)
heappush(h, 3)

# The line is formatted explicitly so the output is the same on Python 2 and
# Python 3. heappop runs before len(h), so the size shown is the size after
# the pop.
print("%s size %s" % (heappop(h), len(h)))
print("%s size %s" % (heappop(h), len(h)))
print("%s size %s" % (heappop(h), len(h)))

# priority queue with tuple number and string
h = []
heappush(h, (5, "sample text"))
heappush(h, (1, "important text"))
heappush(h, (1, "a important text"))
heappush(h, (9, "un-important text"))

# Ties on the first field are broken by comparing the second field.
print(heappop(h))
print(heappop(h))

# priority queue with tuple number only
h = []
heappush(h, (5, 3))
heappush(h, (7, 3))
heappush(h, (1, 3))
heappush(h, (1, 1))
heappush(h, (1, 2))
heappush(h, (3, 2))
heappush(h, (3, 1))

print(heappop(h))
print(heappop(h))
print(heappop(h))

Output:

1 size 2
3 size 1
5 size 0
(1, 'a important text')
(1, 'important text')
(1, 1)
(1, 2)
(1, 3)

Python Tutorial

Life is very easy with Python

Sunday, March 24, 2019

Beautiful Soup IMDB movie page parser

Friday, May 13, 2016

Selenium unittest example

Selenium example

Selenium installation

Installation

Wednesday, November 11, 2015

map keyword

Monday, October 19, 2015

Dijkstra algorithm

Saturday, October 17, 2015

python heapq/priority queue/Min heap

Search This Blog

Followers

About Me

Subjects

Archive