However, the URL does not change when I navigate to different dates or areas (‘Område’). I have read a couple of similar questions suggesting to inspect the webpage, look under ‘Network’ and then ‘XHR’ or ‘JS’ to find the data source of the table, and get the information from there.
# load libraries
library(RSelenium)

# build the command that starts the Selenium server via wdman, then launch it in a separate shell
selCommand <- wdman::selenium(jvmargs = c("-Dwebdriver.chrome.verboseLogging=true"), retcommand = TRUE)
Sys.sleep(2)
shell(selCommand, wait = FALSE, minimized = TRUE)
Sys.sleep(2)

# connect to the running server (wdman's default port is 4567) and open a Firefox session
remdr <- remoteDriver(port = 4567L, browserName = "firefox")
Sys.sleep(10)
remdr$open()

# navigate to the booking page; note that the URL stays the same regardless of the selected date or area
remdr$navigate(url = 'https://matchpadel.halbooking.dk/newlook/proc_baner.asp')
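If the Network tab does reveal the request that actually fills the table (typically an XHR request to an .asp endpoint), you can often skip the browser entirely and call that endpoint directly. Below is a minimal sketch of that idea in Python with requests; the endpoint and form fields are hypothetical placeholders and must be replaced with whatever the browser actually sends.

import requests

# Hypothetical endpoint and form fields -- copy the real ones from the request
# shown in the browser's Network > XHR tab when you change the date or 'Område'.
url = "https://matchpadel.halbooking.dk/newlook/proc_baner.asp"  # assumed endpoint
payload = {
    "dato": "2021-01-15",  # hypothetical date parameter
    "omraade": "1",        # hypothetical area ('Område') parameter
}

response = requests.post(url, data=payload)
response.raise_for_status()
print(response.text[:500])  # inspect the returned fragment that holds the table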
Therefore, I'm not able to write the code that will scrape those pages, because I don't know how to specify the URL for each hospital. I apologize if this is a very basic question, but I wasn't able to find anything useful on driving buttons and drop-down lists from Access VBA before scraping data on .aspx web pages.
Public Function btnGetWebData_Click()
    Dim strURL
    Dim HTML_Content As HTMLDocument
    Dim dados As Object

    'Create HTMLFile Object
    Set HTML_Content = New HTMLDocument

    'Get the WebPage Content to HTMLFile Object
    With CreateObject("msxml2.xmlhttp")
        .Open "GET", "http://healthapps.state.nj.us/facilities/acFacilityList.aspx", False
        .Send
        HTML_Content.Body.innerHTML = .responseText
        Debug.Print .responseText
        Debug.Print HTML_Content.Body.innerHTML
    End With
End Function
Option Explicit

Public Sub VisitPages()
    Dim IE As New InternetExplorer
    With IE
        .Visible = True
        .navigate "http://healthapps.state.nj.us/facilities/acSetSearch.aspx?by=county"
        While .Busy Or .readyState < 4: DoEvents: Wend

        With .document
            .querySelector("#middleContent_cbType_5").Click
            .querySelector("#middleContent_cbType_12").Click
            .querySelector("#middleContent_btnGetList").Click
        End With
        While .Busy Or .readyState < 4: DoEvents: Wend

        Dim list As Object, i As Long
        Set list = .document.querySelectorAll("#main_table [href*=doPostBack]")

        For i = 0 To list.Length - 1
            list.item(i).Click
            While .Busy Or .readyState < 4: DoEvents: Wend
            Application.Wait Now + TimeSerial(0, 0, 3) '<== Delete me later. This is just to demo page changes

            'do stuff with new page

            .Navigate2 .document.URL '<== back to homepage
            While .Busy Or .readyState < 4: DoEvents: Wend
            Set list = .document.querySelectorAll("#main_table [href*=doPostBack]") 'reset list (often required in these scenarios)
        Next

        Stop '<== Delete me later
        '.Quit '<== Remember to quit application
    End With
End Sub
Option Explicit

Public Sub VisitPages()
    Dim IE As New InternetExplorer
    With IE
        .Visible = True
        .navigate "http://healthapps.state.nj.us/facilities/acSetSearch.aspx?by=county"
        While .Busy Or .readyState < 4: DoEvents: Wend

        With .document
            .querySelector("#middleContent_cbType_5").Click
            .querySelector("#middleContent_cbType_12").Click
            .querySelector("#middleContent_btnGetList").Click
        End With
        While .Busy Or .readyState < 4: DoEvents: Wend

        Dim list As Object, i As Long, col As Collection
        Set col = New Collection
        Set list = .document.querySelectorAll("#main_table [href*=doPostBack]")
        For i = 0 To list.Length - 1
            col.Add CStr(list.item(i))
        Next

        For i = 1 To col.Count
            .document.parentWindow.execScript col.item(i)
            While .Busy Or .readyState < 4: DoEvents: Wend
            'Do stuff with page
            .Navigate2 .document.URL
            While .Busy Or .readyState < 4: DoEvents: Wend
        Next

        Stop '<== Delete me later
        '.Quit '<== Remember to quit application
    End With
End Sub
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

def startWebDriver():
    global driver
    options = Options()
    options.add_argument("--disable-extensions")
    driver = webdriver.Chrome(executable_path='/home/Downloads/chromedriver_linux64/chromedriver', options=options)

startWebDriver()
count = 0
s = set()
driver.get('https://www.nytimes.com/search? endDate=20181231&query=trump&sort=best&startDate=20180101')
time.sleep(4)
element = driver.find_element_by_xpath('//*[@id="site-content"]/div/div/div[2]/div[2]/div/button')
while count < 10:
    element.click()
    time.sleep(4)
    count = count + 1
url = driver.current_url
import requests

url = 'https://samizdat-graphql.nytimes.com/graphql/v2'
headers = {
    'nyt-app-type': 'project-vi',
    'nyt-app-version': '0.0.3',
    'nyt-token': 'MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAlYOpRoYg5X01qAqNyBDM32EI/E77nkFzd2rrVjhdi/VAZfBIrPayyYykIIN+d5GMImm3wg6CmTTkBo7ixmwd7Xv24QSDpjuX0gQ1eqxOEWZ0FHWZWkh4jfLcwqkgKmfHJuvOctEiE/Wic5Qrle323SMDKF8sAqClv8VKA8hyrXHbPDAlAaxq3EPOGjJqpHEdWNVg2S0pN62NSmSudT/ap/BqZf7FqsI2cUxv2mUKzmyy+rYwbhd8TRgj1kFprNOaldrluO4dXjubJIY4qEyJY5Dc/F03sGED4AiGBPVYtPh8zscG64yJJ9Njs1ReyUCSX4jYmxoZOnO+6GfXE0s2xQIDAQAB'
}
data = '''
{"operationName":"SearchRootQuery","variables":{"first":10,"sort":"best","beginDate":"20180101","text":"trump","cursor":"YXJyYXljb25uZWN0aW9uOjk="},"extensions":{"persistedQuery":{"version":1,"sha256Hash":"d2895d5a5d686528b9b548f018d7d0c64351ad644fa838384d94c35c585db813"}}}
'''
with requests.Session() as r:
    re = r.post(url, headers=headers, data=data)
    print(re.json())
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get("https://www.nytimes.com/search?%20endDate=20181231&query=trump&sort=best&startDate=20180101")
myLength = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//main[@id='site-content']//figure[@class='css-rninck toneNews']//following::a[1]"))))
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Show More']"))).click()
        WebDriverWait(driver, 20).until(lambda driver: len(driver.find_elements_by_xpath("//main[@id='site-content']//figure[@class='css-rninck toneNews']//following::a[1]")) > myLength)
        titles = driver.find_elements_by_xpath("//main[@id='site-content']//figure[@class='css-rninck toneNews']//following::a[1]")
        myLength = len(titles)
    except TimeoutException:
        break

for title in titles:
    print(title.get_attribute("href"))

driver.quit()
{
    "operationName": "SearchRootQuery",
    "variables": {
        "first": 10,
        "sort": "best",
        "beginDate": "20180101",
        "endDate": "20181231",
        "text": "trump" ...
    }
}
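The cursor value sent in the POST body above is just a base64-encoded offset, which is what drives the pagination. A quick illustrative check (not part of the original answer):

import base64

# the GraphQL cursor is a base64-encoded offset into the result set
cursor = "YXJyYXljb25uZWN0aW9uOjk="
print(base64.b64decode(cursor).decode())  # -> arrayconnection:9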
import requests
from bs4 import BeautifulSoup

prefix = 'https://www.timeanddate.com'
weather_request = requests.get(prefix + '/weather/belgium/antwerp/historic?month=4&year=2017')
weather = BeautifulSoup(weather_request.content, 'html.parser')

# collect the value and label of every <option> in the page's dropdowns
mylist = []
for option in weather.select('select > option'):
    mylist.append((option.get('value'), option.text))
import re
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup

for day in range(1, 31):
    print('Getting info for day {}..'.format(day))
    url = 'https://www.timeanddate.com/scripts/cityajax.php?n=belgium/antwerp&mode=historic&hd=201704{:02d}&month=4&year=2017&json=1'.format(day)
    data = requests.get(url).text

    # the endpoint returns JavaScript-style objects with unquoted keys (c, h, s),
    # so quote them before handing the string to json.loads
    data = json.loads(re.sub(r'(c|h|s):', r'"\1":', data))

    # uncomment this to print raw data:
    # print(json.dumps(data, indent=4))

    # construct the table from json:
    table = '<table>'
    for row in data:
        table += '<tr>'
        for cell in row['c']:
            table += '<td>' + BeautifulSoup(cell['h'], 'html.parser').get_text(strip=True, separator=' ') + '</td>'
        table += '</tr>'
    table += '</table>'

    # now `table` holds an HTML table; you can parse it with BeautifulSoup, or pass it to Pandas:
    df = pd.read_html(table)[0]
    print(df)
    print('-' * 120)
Getting info for day 1..
0 1 2 3 4 5 6 7 8
0 12:20 am Sat, Apr 1 NaN 50 °F Clear. 2 mph ↑ 94% 29.92 "Hg 2 mi
1 12:50 am NaN 46 °F Fog. 2 mph ↑ 100% 29.92 "Hg 2 mi
2 1:20 am NaN 48 °F Light fog. 3 mph ↑ 87% 29.89 "Hg 0 mi
3 1:50 am NaN 48 °F Clear. 3 mph ↑ 94% 29.89 "Hg 1 mi
4 2:20 am NaN 46 °F Fog. 5 mph ↑ 100% 29.89 "Hg 1 mi
5 3:20 am NaN 46 °F Clear. 3 mph ↑ 93% 29.89 "Hg 1 mi
6 3:50 am NaN 46 °F Fog. 6 mph ↑ 93% 29.86 "Hg 1 mi
7 4:20 am NaN 46 °F Fog. 3 mph ↑ 100% 29.86 "Hg 1 mi
8 4:50 am NaN 46 °F Fog. 3 mph ↑ 100% 29.86 "Hg 1 mi
9 5:20 am NaN 46 °F Fog. 2 mph ↑ 93% 29.86 "Hg 2 mi
10 5:50 am NaN 48 °F Clear. 3 mph ↑ 87% 29.86 "Hg 4 mi
11 6:20 am NaN 48 °F Clear. 5 mph ↑ 87% 29.83 "Hg 4 mi
12 6:50 am NaN 48 °F Clear. 5 mph ↑ 94% 29.86 "Hg 4 mi
13 7:20 am NaN 50 °F Sprinkles. Clear. 6 mph ↑ 94% 29.86 "Hg 4 mi
14 7:50 am NaN 52 °F Sprinkles. Broken clouds. 9 mph ↑ 88% 29.86 "Hg 3 mi
15 8:20 am NaN 52 °F Light rain. Partly sunny. 8 mph ↑ 88% 29.86 "Hg 5 mi
16 8:50 am NaN 52 °F Light rain. Passing clouds. 6 mph ↑ 94% 29.86 "Hg 5 mi
17 9:20 am NaN 52 °F Drizzle. Partly sunny. 5 mph ↑ 94% 29.86 "Hg 5 mi
18 9:50 am NaN 52 °F Broken clouds. 5 mph ↑ 94% 29.86 "Hg 5 mi
19 10:20 am NaN 52 °F Broken clouds. 6 mph ↑ 94% 29.89 "Hg NaN
20 10:50 am NaN 52 °F Sprinkles. Broken clouds. 8 mph ↑ 94% 29.89 "Hg 5 mi
21 11:20 am NaN 52 °F Partly sunny. 5 mph ↑ 94% 29.89 "Hg NaN
22 11:50 am NaN 54 °F Scattered clouds. 2 mph ↑ 88% 29.89 "Hg NaN
23 12:20 pm NaN 55 °F Scattered clouds. 5 mph ↑ 82% 29.89 "Hg NaN
24 12:50 pm NaN 55 °F Scattered clouds. 3 mph ↑ 77% 29.89 "Hg NaN
25 1:20 pm NaN 57 °F Passing clouds. 5 mph ↑ 72% 29.89 "Hg NaN
26 1:50 pm NaN 57 °F Passing clouds. 3 mph ↑ 67% 29.89 "Hg NaN
27 2:20 pm NaN 57 °F Passing clouds. 7 mph ↑ 72% 29.89 "Hg NaN
28 2:50 pm NaN 57 °F Scattered clouds. 3 mph ↑ 72% 29.89 "Hg NaN
29 3:20 pm NaN 55 °F Sprinkles. Broken clouds. 9 mph ↑ 77% 29.89 "Hg 4 mi
30 3:50 pm NaN 55 °F Sprinkles. Broken clouds. 3 mph ↑ 77% 29.86 "Hg 5 mi
31 4:20 pm NaN 55 °F Sprinkles. Broken clouds. 2 mph ↑ 82% 29.89 "Hg NaN
32 4:50 pm NaN 57 °F Scattered clouds. 2 mph ↑ 77% 29.86 "Hg NaN
33 5:20 pm NaN 57 °F Scattered clouds. 7 mph ↑ 72% 29.89 "Hg NaN
34 5:50 pm NaN 55 °F Scattered clouds. 6 mph ↑ 88% 29.89 "Hg NaN
35 6:20 pm NaN 55 °F Passing clouds. 6 mph ↑ 82% 29.89 "Hg NaN
36 6:50 pm NaN 55 °F Passing clouds. 3 mph ↑ 82% 29.89 "Hg NaN
37 7:20 pm NaN 54 °F Passing clouds. 5 mph ↑ 94% 29.89 "Hg NaN
38 7:50 pm NaN 54 °F Passing clouds. 5 mph ↑ 88% 29.89 "Hg NaN
39 8:20 pm NaN 54 °F Passing clouds. 7 mph ↑ 88% 29.92 "Hg NaN
40 8:50 pm NaN 54 °F Clear. 7 mph ↑ 88% 29.92 "Hg 10 mi
41 9:20 pm NaN 54 °F Clear. 2 mph ↑ 88% 29.92 "Hg 10 mi
42 9:50 pm NaN 52 °F Clear. 5 mph ↑ 94% 29.92 "Hg 10 mi
43 10:20 pm NaN 48 °F Clear. 2 mph ↑ 100% 29.95 "Hg 10 mi
44 10:50 pm NaN 52 °F Clear. 3 mph ↑ 88% 29.95 "Hg 4 mi
45 11:20 pm NaN 46 °F Fog. 2 mph ↑ 93% 29.95 "Hg 1 mi
46 11:50 pm NaN 46 °F Clear. 3 mph ↑ 93% 29.95 "Hg 0 mi
------------------------------------------------------------------------------------------------------------------------
Getting info for day 2..
0 1 2 3 4 5 6 7 8
0 12:20 am Sun, Apr 2 NaN 45 °F Fog. 2 mph ↑ 100% 29.95 "Hg 0 mi
1 12:50 am NaN 45 °F Fog. 2 mph ↑ 93% 29.98 "Hg 1 mi
2 1:20 am NaN 45 °F Fog. 2 mph ↑ 100% 29.95 "Hg 0 mi
3 1:50 am NaN 45 °F Clear. 3 mph ↑ 87% 29.98 "Hg 4 mi
4 2:20 am NaN 48 °F Clear. 6 mph ↑ 87% 29.98 "Hg 10 mi
5 2:50 am NaN 48 °F Clear. 2 mph ↑ 87% 29.98 "Hg 10 mi
6 3:20 am NaN 48 °F Clear. 5 mph ↑ 87% 29.98 "Hg 10 mi
7 3:50 am NaN 48 °F Clear. 2 mph ↑ 87% 29.98 "Hg 6 mi
8 4:50 am NaN 46 °F Clear. 2 mph ↑ 87% 30.01 "Hg 10 mi
9 5:20 am NaN 46 °F Passing clouds. 3 mph ↑ 87% 30.01 "Hg NaN
10 5:50 am NaN 46 °F Clear. 2 mph ↑ 87% 30.01 "Hg 10 mi
11 6:20 am NaN 46 °F Clear. 1 mph ↑ 87% 30.04 "Hg 4 mi
12 6:50 am NaN 45 °F Light fog. 2 mph ↑ 93% 30.04 "Hg 5 mi
... and so on.
This was also a simple lab where we had to change the URL and print the page title; the code below would pass it. Part 3: Soup-ed body and head. In the last lab, you saw how you can extract the title from the page. It is equally easy to extract certain sections too.
import requests
res = requests.get('https://codedamn.com')
print(res.text)
print(res.status_code)
import requests
# Make a request to https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/
# Store the result in 'res' variable
res = requests.get(
'https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/')
txt = res.text
status = res.status_code
print(txt, status)
# print the result
import requests
from bs4 import BeautifulSoup

page = requests.get("https://codedamn.com")
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.title.text  # gets you the text of the <title>(...)</title>
import requests
from bs4 import BeautifulSoup
# Make a request to https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/
page = requests.get(
"https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/")
soup = BeautifulSoup(page.content, 'html.parser')
# Extract title of page
page_title = soup.title.text
# print the result
print(page_title)
import requests
from bs4 import BeautifulSoup
# Make a request
page = requests.get(
"https://codedamn.com")
soup = BeautifulSoup(page.content, 'html.parser')
# Extract title of page
page_title = soup.title.text
# Extract body of page
page_body = soup.body
# Extract head of page
page_head = soup.head
# print the result
print(page_body, page_head)
import requests
from bs4 import BeautifulSoup
# Make a request
page = requests.get(
"https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/")
soup = BeautifulSoup(page.content, 'html.parser')
# Extract title of page
page_title = soup.title
# Extract body of page
page_body = soup.body
# Extract head of page
page_head = soup.head
# print the result
print(page_title, page_head)
import requests
from bs4 import BeautifulSoup
# Make a request
page = requests.get(
"https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/")
soup = BeautifulSoup(page.content, 'html.parser')
# Extract first <h1>(...)</h1> text
first_h1 = soup.select('h1')[0].text
import requests
from bs4 import BeautifulSoup
# Make a request
page = requests.get(
"https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/")
soup = BeautifulSoup(page.content, 'html.parser')
# Create all_h1_tags as empty list
all_h1_tags = []
# Set all_h1_tags to all h1 tags of the soup
for element in soup.select('h1'):
    all_h1_tags.append(element.text)
# Create seventh_p_text and set it to 7th p element text of the page
seventh_p_text = soup.select('p')[6].text
print(all_h1_tags, seventh_p_text)
info = {
    "title": 'Asus AsusPro Adv... '.strip(),
    "review": '2 reviews\n\n\n'.strip()
}
import requests
from bs4 import BeautifulSoup
# Make a request
page = requests.get(
"https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/")
soup = BeautifulSoup(page.content, 'html.parser')
# Create top_items as empty list
top_items = []
# Extract and store in top_items according to instructions on the left
products = soup.select('div.thumbnail')
for elem in products:
    title = elem.select('h4 > a.title')[0].text
    review_label = elem.select('div.ratings')[0].text
    info = {
        "title": title.strip(),
        "review": review_label.strip()
    }
    top_items.append(info)
print(top_items)
import requests
from bs4 import BeautifulSoup
# Make a request
page = requests.get(
"https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/")
soup = BeautifulSoup(page.content, 'html.parser')
# Create top_items as empty list
image_data = []
# Extract and store in top_items according to instructions on the left
images = soup.select('img')
for image in images:
    src = image.get('src')
    alt = image.get('alt')
    image_data.append({"src": src, "alt": alt})
print(image_data)
info = {
    "href": "<link here>",
    "text": "<link text here>"
}
import requests
from bs4 import BeautifulSoup
# Make a request
page = requests.get(
"https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/")
soup = BeautifulSoup(page.content, 'html.parser')
# Create top_items as empty list
all_links = []
# Extract and store in top_items according to instructions on the left
links = soup.select('a')
for ahref in links:
    text = ahref.text
    text = text.strip() if text is not None else ''
    href = ahref.get('href')
    href = href.strip() if href is not None else ''
    all_links.append({"href": href, "text": text})
print(all_links)
import requests
from bs4 import BeautifulSoup
import csv
# Make a request
page = requests.get(
"https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/")
soup = BeautifulSoup(page.content, 'html.parser')
all_products = []
products = soup.select('div.thumbnail')
for product in products:
    # TODO: Work
    print("Work on product here")

keys = all_products[0].keys()

with open('products.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(all_products)
import requests
from bs4 import BeautifulSoup
import csv
# Make a request
page = requests.get(
"https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/")
soup = BeautifulSoup(page.content, 'html.parser')
# Create top_items as empty list
all_products = []
# Extract and store in top_items according to instructions on the left
products = soup.select('div.thumbnail')
for product in products:
    name = product.select('h4 > a')[0].text.strip()
    description = product.select('p.description')[0].text.strip()
    price = product.select('h4.price')[0].text.strip()
    reviews = product.select('div.ratings')[0].text.strip()
    image = product.select('img')[0].get('src')
    all_products.append({
        "name": name,
        "description": description,
        "price": price,
        "reviews": reviews,
        "image": image
    })

keys = all_products[0].keys()

with open('products.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(all_products)
Each book item will have five fields: title, price, upc, image_url, and url. In code, this is how you create a new Item class in Scrapy (see the BookItem snippet further below); as you can see there, you need to import two Scrapy objects: Item and Field.
virtualenv env
source env/bin/activate
pip install scrapy
scrapy startproject bookscraper
📦bookscraper
┣ 📂bookscraper
┃ ┣ 📂spiders
┃ ┃ ┗ 📜bookscraper.py
┃ ┣ 📜items.py
┃ ┣ 📜middlewares.py
┃ ┣ 📜pipelines.py
┃ ┗ 📜settings.py
┗ 📜scrapy.cfg
from scrapy import Item, Field

class BookItem(Item):
    title = Field()
    price = Field()
    upc = Field()
    image_url = Field()
    url = Field()
touch bookscraper.py
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bookscraper.items import BookItem

class BookScraper(CrawlSpider):
    name = "bookscraper"
    start_urls = ["http://books.toscrape.com/"]
    rules = (
        Rule(LinkExtractor(restrict_css=".nav-list > li > ul > li > a"), follow=True),
        Rule(LinkExtractor(restrict_css=".product_pod > h3 > a"), callback="parse_book")
    )

    def parse_book(self, response):
        book_item = BookItem()
        book_item["image_url"] = response.urljoin(response.css(".item.active > img::attr(src)").get())
        book_item["title"] = response.css(".col-sm-6.product_main > h1::text").get()
        book_item["price"] = response.css(".price_color::text").get()
        book_item["upc"] = response.css(".table.table-striped > tr:nth-child(1) > td::text").get()
        book_item["url"] = response.url
        return book_item
scrapy crawl bookscraper
{'image_url': 'http://books.toscrape.com/media/cache/0f/76/0f76b00ea914ced1822d8ac3480c485f.jpg',
'price': '£12.61',
'title': 'The Third Wave: An Entrepreneur’s Vision of the Future',
'upc': '3bebf34ee9330cbd',
'url': 'http://books.toscrape.com/catalogue/the-third-wave-an-entrepreneurs-vision-of-the-future_862/index.html'}
2022-05-01 18:46:18 [scrapy.core.scraper] DEBUG: Scraped from <200 http://books.toscrape.com/catalogue/shoe-dog-a-memoir-by-the-creator-of-nike_831/index.html>
{'image_url': 'http://books.toscrape.com/media/cache/fc/21/fc21d144c7289e5b1cb133e01a925126.jpg',
'price': '£23.99',
'title': 'Shoe Dog: A Memoir by the Creator of NIKE',
'upc': '0e0dcc3339602b28',
'url': 'http://books.toscrape.com/catalogue/shoe-dog-a-memoir-by-the-creator-of-nike_831/index.html'}
2022-05-01 18:46:18 [scrapy.core.scraper] DEBUG: Scraped from <200 http://books.toscrape.com/catalogue/the-10-entrepreneur-live-your-startup-dream-without-quitting-your-day-job_836/index.html>
{'image_url': 'http://books.toscrape.com/media/cache/50/4b/504b1891508614ff9393563f69d66c95.jpg',
'price': '£27.55',
'title': 'The 10% Entrepreneur: Live Your Startup Dream Without Quitting Your '
'Day Job',
'upc': '56e4f9eab2e8e674',
'url': 'http://books.toscrape.com/catalogue/the-10-entrepreneur-live-your-startup-dream-without-quitting-your-day-job_836/index.html'}
2022-05-01 18:46:18 [scrapy.core.scraper] DEBUG: Scraped from <200 http://books.toscrape.com/catalogue/far-from-true-promise-falls-trilogy-2_320/index.html>
{'image_url': 'http://books.toscrape.com/media/cache/9c/aa/9caacda3ff43984447ee22712e7e9ca9.jpg',
'price': '£34.93',
'title': 'Far From True (Promise Falls Trilogy #2)',
'upc': 'ad15a9a139919918',
'url': 'http://books.toscrape.com/catalogue/far-from-true-promise-falls-trilogy-2_320/index.html'}
2022-05-01 18:46:18 [scrapy.core.scraper] DEBUG: Scraped from <200 http://books.toscrape.com/catalogue/the-travelers_285/index.html>
{'image_url': 'http://books.toscrape.com/media/cache/42/a3/42a345bdcb3e13d5922ff79cd1c07d0e.jpg',
'price': '£15.77',
'title': 'The Travelers',
'upc': '2b685187f55c5d31',
'url': 'http://books.toscrape.com/catalogue/the-travelers_285/index.html'}
2022-05-01 18:46:18 [scrapy.core.scraper] DEBUG: Scraped from <200 http://books.toscrape.com/catalogue/the-bone-hunters-lexy-vaughan-steven-macaulay-2_343/index.html>
{'image_url': 'http://books.toscrape.com/media/cache/8d/1f/8d1f11673fbe46f47f27b9a4c8efbf8a.jpg',
'price': '£59.71',
'title': 'The Bone Hunters (Lexy Vaughan & Steven Macaulay #2)',
'upc': '9c4d061c1e2fe6bf',
'url': 'http://books.toscrape.com/catalogue/the-bone-hunters-lexy-vaughan-steven-macaulay-2_343/index.html'}
You can also wait for a URL change using EC.url_changes(), for a newly opened window using EC.new_window_is_opened(), and for a change in the title using EC.title_is(). If you have any page redirections, you can check for a change in title or URL to detect them. There are many conditions to check for; we just take an example to show you how much power you have.
$ pip install beautifulsoup4
from bs4 import BeautifulSoup
$ python myfile.py
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("https://www.python.org/")
res = BeautifulSoup(html.read(), "html5lib")
print(res.title)
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

try:
    html = urlopen("https://www.python.org/")
except HTTPError as e:
    print(e)
else:
    res = BeautifulSoup(html.read(), "html5lib")
    print(res.title)
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup

try:
    html = urlopen("https://www.python.org/")
except HTTPError as e:
    print(e)
except URLError:
    print("Server down or incorrect domain")
else:
    res = BeautifulSoup(html.read(), "html5lib")
    print(res.titles)
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup

try:
    html = urlopen("https://www.python.org/")
except HTTPError as e:
    print(e)
except URLError:
    print("Server down or incorrect domain")
else:
    res = BeautifulSoup(html.read(), "html5lib")
    if res.title is None:
        print("Tag not found")
    else:
        print(res.title)
tags = res.findAll("h2", {"class": "widget-title"})
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup

try:
    html = urlopen("https://www.python.org/")
except HTTPError as e:
    print(e)
except URLError:
    print("Server down or incorrect domain")
else:
    res = BeautifulSoup(html.read(), "html5lib")
    tags = res.findAll("h2", {"class": "widget-title"})
    for tag in tags:
        print(tag.getText())
tags = res.findAll("span", "a" "img")
tags = res.findAll("a", {"class": ["url", "readmorebtn"]})
tags = res.findAll(text="Python Programming Basics with Examples")
tags = res.span.findAll("a")
tag = res.find("nav", {"id": "site-navigation"}).select("a")[3]
import re
tags = res.findAll("img", {"src": re.compile(r"\.\./uploads/photo_.*\.png")})
$ pip install selenium
from selenium import webdriver
browser = webdriver.Chrome()
browser.get("https://www.python.org/")
nav = browser.find_element_by_id("mainnav")
print(nav.text)
from selenium import webdriver
browser = webdriver.PhantomJS()
browser.get("https://www.python.org/")
print(browser.find_element_by_class_name("introduction").text)
browser.close()
browser.find_element_by_id("id")
browser.find_element_by_css_selector("#id")
browser.find_element_by_link_text("Click Here")
browser.find_element_by_name("Home")
browser.find_elements_by_id("id")
browser.find_elements_by_css_selector("#id")
browser.find_elements_by_link_text("Click Here")
browser.find_elements_by_name("Home")
from selenium import webdriver
from bs4 import BeautifulSoup

browser = webdriver.PhantomJS()
browser.get("https://www.python.org/")
page = BeautifulSoup(browser.page_source, "html5lib")
links = page.findAll("a")
for link in links:
    print(link)
browser.close()
from selenium import webdriver
browser = webdriver.PhantomJS()
browser.get("https://developer.mozilla.org/en-US/docs/Web/HTML/Element/iframe")
iframe = browser.find_element_by_tag_name("iframe")
browser.switch_to.default_content()
browser.switch_to.frame(iframe)
iframe_source = browser.page_source
print(iframe_source) #returns iframe source
print(browser.current_url) #returns iframe URL
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup

try:
    html = urlopen("https://developer.mozilla.org/en-US/docs/Web/HTML/Element/iframe")
except HTTPError as e:
    print(e)
except URLError:
    print("Server down or incorrect domain")
else:
    res = BeautifulSoup(html.read(), "html5lib")
    tag = res.find("iframe")
    print(tag['src'])  # URL of iframe ready for scraping
from selenium import webdriver
import time
browser = webdriver.PhantomJS()
browser.get("https://www.w3schools.com/xml/ajax_intro.asp")
browser.find_element_by_tag_name("button").click()
time.sleep(2) #Explicit wait
browser.get_screenshot_as_file("image.png")
browser.close()
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.PhantomJS()
browser.get("https://resttesttest.com/")
browser.find_element_by_id("submitajax").click()
try:
    element = WebDriverWait(browser, 10).until(EC.text_to_be_present_in_element((By.ID, "statuspre"), "HTTP 200 OK"))
finally:
    browser.get_screenshot_as_file("image.png")
    browser.close()
EC.url_changes()
EC.new_window_is_opened()
EC.title_is()
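These conditions plug into WebDriverWait just like text_to_be_present_in_element above. Here is a minimal sketch (not from the original tutorial) of waiting for a redirect by watching the URL; the page and driver used are only placeholders:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()  # any WebDriver works; Chrome is assumed here
browser.get("https://example.com/login")  # hypothetical page that redirects
old_url = browser.current_url

# ... trigger the action that causes the redirect here ...

# block for up to 10 seconds until the current URL differs from old_url
WebDriverWait(browser, 10).until(EC.url_changes(old_url))
print(browser.current_url)
browser.quit()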
from selenium import webdriver
browser = webdriver.PhantomJS()
browser.get("https://likegeeks.com/")
print(browser.get_cookies())
from selenium import webdriver
browser = webdriver.PhantomJS()
browser.get("https://likegeeks.com/")
browser.delete_all_cookies()
What the browser sends (example request headers):

Host: https://www.google.com/
Connection: keep-alive
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36
Referrer: https://www.google.com/
Accept-Encoding: gzip, deflate, sdch
Accept-Language: en-US,en;q=0.8

What a default Python urllib request sends:

Accept-Encoding: identity
User-Agent: Python-urllib/3.4
import time
time.sleep(3)
from urllib.request import Request
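That Request import is what lets you replace the telltale Python-urllib User-Agent with a browser-like one before calling urlopen. A minimal sketch under that assumption (the User-Agent string is just an example):

from urllib.request import Request, urlopen

# send a browser-like User-Agent instead of the default Python-urllib one
req = Request(
    "https://www.python.org/",
    headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}
)
html = urlopen(req).read()
print(html[:200])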