MCPcopy
hub / github.com/jaakkopasanen/AutoEq / crawl

Method crawl

dbtools/oratory1990_crawler.py:104–142  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

102 return item.name is None or item.form is None or item.rig is None
103
def crawl(self):
    """Crawl the oratory1990 preset list on Reddit and build a new crawl index.

    Loads the known name index via ``self.read_name_index()``, fetches the
    preset list wiki page and walks the rows of the table found next to the
    element with id ``wiki_full_list_of_eq_settings.3A``. Every row that is
    not skipped becomes a ``NameItem`` carrying the download URL, the source
    name and the form. When ``self.name_index`` already has an item with the
    same URL, its non-None ``name``, ``form`` and ``rig`` are copied over.

    Returns:
        The freshly built ``self.crawl_index``.

    Raises:
        TypeError: If ``self.driver`` is None.
        RedditCrawlFailed: If the table header, the table, its body, or the
            expected cells/link of a row cannot be found in the page.
    """
    if self.driver is None:
        raise TypeError('self.driver cannot be None')
    self.name_index = self.read_name_index()
    document = self.get_beautiful_soup('https://www.reddit.com/r/oratory1990/wiki/index/list_of_presets')
    table_header = document.find(id='wiki_full_list_of_eq_settings.3A')
    if table_header is None:
        raise RedditCrawlFailed('Failed to parse Reddit page. Maybe try again?')
    self.crawl_index = NameIndex()

    # A partially rendered page can have the header without the table; report
    # that the same way as a missing header instead of an AttributeError.
    table = table_header.parent.find('table')
    table_body = table.find('tbody') if table is not None else None
    if table_body is None:
        raise RedditCrawlFailed('Failed to parse Reddit page. Maybe try again?')

    manufacturer, model = None, None
    for row_number, row in enumerate(table_body.find_all('tr'), start=1):
        cells = row.find_all('td')
        if len(cells) < 4:
            raise RedditCrawlFailed(
                f'Failed to parse Reddit page. Row {row_number} has {len(cells)} cells, expected at least 4.')

        # First cell is the manufacturer; the '-' placeholder means "same as the previous row"
        manufacturer_text = cells[0].text.strip()
        if manufacturer_text != '-':
            manufacturer = manufacturer_text
        # Second cell is the model; the '-' placeholder means "same as the previous row"
        model_text = cells[1].text.strip()
        if model_text != '-':
            model = model_text
        source_name = f'{manufacturer} {model}'

        # Third cell contains hyperlink, where the anchor is the PDF and text is target name
        link = cells[2].find('a')
        href = link.get('href') if link is not None else None
        if href is None:
            raise RedditCrawlFailed(
                f'Failed to parse Reddit page. Row {row_number} ("{source_name}") has no download link.')
        # Switch the "?dl=0" query to "?dl=1"
        url = href.replace('?dl=0', '?dl=1')
        form = 'over-ear' if 'over-ear' in cells[2].text.strip().lower() else 'in-ear'

        # Fourth cell is notes
        notes = cells[3].text.strip()
        if 'preliminary' in notes.lower() or ' EQ' in notes:
            continue  # Skip various EQ settings and preliminary measurements
        if notes and notes.lower() != 'standard':
            source_name += f' ({notes})'

        item = NameItem(url=url, source_name=source_name, form=form)
        # Carry over whatever is already known about this URL
        known_item = self.name_index.find_one(url=url)
        if known_item is not None:
            if known_item.name is not None:
                item.name = known_item.name
            if known_item.form is not None:
                item.form = known_item.form
            if known_item.rig is not None:
                item.rig = known_item.rig
        # Only the first row for a given source name is kept
        if not self.crawl_index.find(source_name=source_name):
            self.crawl_index.add(item)
    return self.crawl_index
143
144 @staticmethod
145 def parse_image(path, px_top=800, px_bottom=4400, px_left=0, px_right=2500):

Callers

nothing calls this directly

Calls 9

NameIndex · Class · 0.90
NameItem · Class · 0.90
RedditCrawlFailed · Class · 0.85
get_beautiful_soup · Method · 0.80
replace · Method · 0.80
find_one · Method · 0.80
add · Method · 0.80
read_name_index · Method · 0.45
find · Method · 0.45

Tested by

no test coverage detected