| 9 | |
| 10 | |
| 11 | class Phonearena: |
| 12 | def __init__(self): |
| 13 | self.phones = [] |
| 14 | self.features = ["Brand", "Model Name", "Model Image"] |
| 15 | self.temp1 = [] |
| 16 | self.phones_brands = [] |
| 17 | self.url = "https://www.phonearena.com/phones/" # GSMArena website url |
| 18 | # Folder name on which files going to save. |
| 19 | self.new_folder_name = "GSMArenaDataset" |
| 20 | # It create the absolute path of the GSMArenaDataset folder. |
| 21 | self.absolute_path = os.getcwd().strip() + "/" + self.new_folder_name |
| 22 | |
def crawl_html_page(self, sub_url):
    """Download ``sub_url`` and return its HTML parsed with BeautifulSoup.

    Args:
        sub_url: URL of the page to fetch.

    Returns:
        The ``BeautifulSoup`` tree built from the response text using the
        ``html.parser`` backend.

    Raises:
        SystemExit: if fetching or parsing fails for any reason. A hint
            is printed before the script terminates.
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled
        # server, which would freeze the whole crawl.
        page = requests.get(sub_url, timeout=30)
        # Parse the HTML returned for the requested url.
        return BeautifulSoup(page.text, "html.parser")
    except Exception:
        # One handler is enough: the previous ConnectionError branch did
        # exactly the same thing as the generic one, and any
        # ConnectionError is an Exception anyway.
        print("Please check your network connection and re-run the script.")
        # exit() is a helper the site module installs for interactive
        # sessions; raising SystemExit directly works everywhere, and the
        # non-zero status marks the run as failed.
        raise SystemExit(1)
| 40 | |
def crawl_phone_urls(self, last_page=237):
    """Collect the phone links found on the paginated listing.

    Pages ``self.url + "page/1"`` through ``self.url + "page/<last_page>"``
    are fetched in order with ``crawl_html_page``; each page URL is printed
    before it is requested.

    Args:
        last_page: Number of the final listing page to visit. Defaults to
            237, the page count that used to be hard-coded here.

    Returns:
        A list with the ``href`` of the first ``<a>`` inside every
        ``div`` of class ``stream-item``, in page order. Items that have
        no link, or whose link has no ``href``, are skipped.
    """
    phones_urls = []
    for page_number in range(1, last_page + 1):
        page_url = self.url + "page/" + str(page_number)
        print(page_url)
        soup = self.crawl_html_page(page_url)
        for item in soup.find_all("div", {"class": "stream-item"}):
            anchor = item.find("a")
            # find() yields None when the item contains no <a>, and an
            # anchor may lack href; indexing either one used to abort the
            # whole crawl, so such items are skipped instead.
            href = anchor.get("href") if anchor is not None else None
            if href is not None:
                phones_urls.append(href)
    return phones_urls
| 52 | |
| 53 | def crawl_phones_models_specification(self, li): |
| 54 | phone_data = {} |
| 55 | for link in li: |
| 56 | print(link) |
| 57 | try: |
| 58 | soup = self.crawl_html_page(link) |
| 59 | model = soup.find(class_="page__section page__section_quickSpecs") |
| 60 | model_name = model.find("header").h1.text |
| 61 | model_img_html = model.find(class_="head-image") |
| 62 | model_img = model_img_html.find("img")["data-src"] |
| 63 | specs_html = model.find( |
| 64 | class_="phone__section phone__section_widget_quickSpecs" |
| 65 | ) |
| 66 | release_date = specs_html.find(class_="calendar") |
| 67 | release_date = release_date.find(class_="title").p.text |
| 68 | display = specs_html.find(class_="display") |
no outgoing calls
no test coverage detected