hub / github.com/jaakkopasanen/AutoEq / crawl

Method crawl

dbtools/oratory1990_crawler.py:104–142 · view source on GitHub ↗

(self)

Source from the content-addressed store, hash-verified

102	return item.name is None or item.form is None or item.rig is None
103
def crawl(self):
    """Crawl the oratory1990 preset list on Reddit and build a fresh crawl index.

    Loads the existing name index with ``self.read_name_index()``, fetches the wiki page that lists
    the EQ presets and turns each usable table row into a ``NameItem``. Rows whose notes mark them
    as preliminary or as EQ variants are skipped, as are rows that lack the expected four cells or
    a link in the target cell. Items whose URL is already present in the existing name index inherit
    its known ``name``, ``form`` and ``rig``.

    Returns:
        The ``NameIndex`` stored in ``self.crawl_index``; only the first item seen for each source
        name is added.

    Raises:
        TypeError: If ``self.driver`` is None.
        RedditCrawlFailed: If the preset list header or the table below it cannot be found.
    """
    if self.driver is None:
        raise TypeError('self.driver cannot be None')
    self.name_index = self.read_name_index()
    document = self.get_beautiful_soup('https://www.reddit.com/r/oratory1990/wiki/index/list_of_presets')
    table_header = document.find(id='wiki_full_list_of_eq_settings.3A')
    if table_header is None:
        raise RedditCrawlFailed('Failed to parse Reddit page. Maybe try again?')
    self.crawl_index = NameIndex()
    # The table is searched for inside the header's parent element. A missing table or table body
    # is reported the same way as a missing header instead of surfacing as an AttributeError.
    table = table_header.parent.find('table')
    table_body = table.find('tbody') if table is not None else None
    if table_body is None:
        raise RedditCrawlFailed('Failed to parse Reddit page. Maybe try again?')
    manufacturer, model = None, None
    for row in table_body.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 4:
            continue  # Not a complete preset row, nothing to parse
        # Parse cells
        # First cell is the manufacturer; a '-' means the manufacturer of the previous row applies
        manufacturer_text = cells[0].text.strip()
        if manufacturer_text != '-':
            manufacturer = manufacturer_text
        # Second cell is the model; a '-' means the model of the previous row applies
        model_text = cells[1].text.strip()
        if model_text != '-':
            model = model_text
        source_name = f'{manufacturer} {model}'
        # Third cell contains hyperlink, where the anchor is the PDF and text is target name
        link = cells[2].find('a')
        href = link.get('href') if link is not None else None
        if not href:
            continue  # Without a link there is no URL to index
        url = href.replace('?dl=0', '?dl=1')
        form = 'over-ear' if 'over-ear' in cells[2].text.strip().lower() else 'in-ear'
        # Fourth cell is notes
        notes = cells[3].text.strip()
        if 'preliminary' in notes.lower() or ' EQ' in notes:
            continue  # Skip various EQ settings and preliminary measurements
        if notes and notes.lower() != 'standard':
            source_name += f' ({notes})'
        item = NameItem(url=url, source_name=source_name, form=form)
        known_item = self.name_index.find_one(url=url)
        if known_item is not None:
            # Values already recorded in the existing index take precedence over the crawled ones
            if known_item.name is not None:
                item.name = known_item.name
            if known_item.form is not None:
                item.form = known_item.form
            if known_item.rig is not None:
                item.rig = known_item.rig
        # Only the first item for each source name is kept
        if not self.crawl_index.find(source_name=source_name):
            self.crawl_index.add(item)
    return self.crawl_index
143
144	@staticmethod
145	def parse_image(path, px_top=800, px_bottom=4400, px_left=0, px_right=2500):

Callers

nothing calls this directly

Calls 9

NameIndexClass · 0.90

NameItemClass · 0.90

RedditCrawlFailedClass · 0.85

get_beautiful_soupMethod · 0.80

replaceMethod · 0.80

find_oneMethod · 0.80

addMethod · 0.80

read_name_indexMethod · 0.45

findMethod · 0.45

Tested by

no test coverage detected