parse bioavailability into separate ROAs from tripsit and minor improvements
This commit is contained in:
parent
db7e100095
commit
aaff2bd826
49
scrape.py
49
scrape.py
@ -2,7 +2,6 @@
|
|||||||
|
|
||||||
# downloads and exports data on all substances from psychonautwiki and tripsit factsheets, combining to form master list with standardized format
|
# downloads and exports data on all substances from psychonautwiki and tripsit factsheets, combining to form master list with standardized format
|
||||||
# prioritizes psychonautwiki ROA info (dose/duration) over tripsit factsheets
|
# prioritizes psychonautwiki ROA info (dose/duration) over tripsit factsheets
|
||||||
|
|
||||||
# pip3 install beautifulsoup4 requests python-graphql-client
|
# pip3 install beautifulsoup4 requests python-graphql-client
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -51,6 +50,9 @@ roa_name_aliases = {
|
|||||||
|
|
||||||
'insufflated': ['snorted'],
|
'insufflated': ['snorted'],
|
||||||
'snorted': ['insufflated'],
|
'snorted': ['insufflated'],
|
||||||
|
|
||||||
|
'vaporized': ['vapourized'],
|
||||||
|
'vapourized': ['vaporized'],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -196,7 +198,7 @@ if not len(pw_substance_data):
|
|||||||
'td').text.split(', '))) if len(common_names_str) > 0 else set()
|
'td').text.split(', '))) if len(common_names_str) > 0 else set()
|
||||||
cleaned_common_names.add(substance['name'])
|
cleaned_common_names.add(substance['name'])
|
||||||
# don't include name in list of other common names
|
# don't include name in list of other common names
|
||||||
common_names = list(
|
common_names = sorted(
|
||||||
filter(lambda n: n != name, cleaned_common_names))
|
filter(lambda n: n != name, cleaned_common_names))
|
||||||
|
|
||||||
# scrape ROAs from page
|
# scrape ROAs from page
|
||||||
@ -210,10 +212,10 @@ if not len(pw_substance_data):
|
|||||||
|
|
||||||
row_values = curr_row.find('td', {'class': 'RowValues'})
|
row_values = curr_row.find('td', {'class': 'RowValues'})
|
||||||
|
|
||||||
row_value_text = row_values.find(
|
row_value_text = row_values.find_all(
|
||||||
text=True, recursive=False)
|
text=True, recursive=False)
|
||||||
if row_value_text:
|
if len(row_value_text):
|
||||||
row['value'] = row_value_text.strip()
|
row['value'] = "".join(row_value_text).strip()
|
||||||
else:
|
else:
|
||||||
row['value'] = None
|
row['value'] = None
|
||||||
|
|
||||||
@ -373,9 +375,14 @@ for name in all_substance_names:
|
|||||||
if not len(summary):
|
if not len(summary):
|
||||||
summary = None
|
summary = None
|
||||||
|
|
||||||
bioavailability = ts_properties.get('bioavailability', '').strip()
|
ts_bioavailability_str = ts_properties.get('bioavailability', '').strip()
|
||||||
if not len(bioavailability):
|
ts_bioavailability = {}
|
||||||
bioavailability = None
|
# Break the TripSit bioavailability text into per-ROA entries.
# Every regex hit is a (ROA name, value) pair: the name is a run of letters
# and slashes, the value is a run of digits, dots, percent signs, whitespace,
# plus, slash and minus characters.
if ts_bioavailability_str:
    for roa_name, value in re.findall(
            r'([a-zA-Z\/]+)[.:\s]+([0-9\.%\s\+/\-]+)', ts_bioavailability_str):
        # key by lower-cased ROA name; trim dots, spaces and tabs around the value
        ts_bioavailability[roa_name.lower()] = value.strip('. \t')
|
||||||
|
|
||||||
pw_data = pw_substance.get('data', {})
|
pw_data = pw_substance.get('data', {})
|
||||||
|
|
||||||
@ -437,15 +444,28 @@ for name in all_substance_names:
|
|||||||
(roa for roa in roas if roa_matches_name(roa, ts_roa['name'])), None)
|
(roa for roa in roas if roa_matches_name(roa, ts_roa['name'])), None)
|
||||||
# if ROA does not exist, add
|
# if ROA does not exist, add
|
||||||
if not existing_roa:
|
if not existing_roa:
|
||||||
roas.append(ts_roa)
|
existing_roa = ts_roa
|
||||||
continue
|
roas.append(existing_roa)
|
||||||
|
# we want bioavailability from below, so don't skip
|
||||||
|
|
||||||
# if TS has dosage but existing ROA is missing dosage, add
|
# if ROA does not already have bioavailability, try to get from TS
|
||||||
if ('dosage' in ts_roa and ts_roa['dosage'] and len(ts_roa['dosage'])) and ('dosage' not in existing_roa or not existing_roa['dosage'] or not len(existing_roa['dosage'])):
|
# Only fill in bioavailability when the ROA does not already carry a value.
if not existing_roa.get('bioavailability'):
    name_lower = ts_roa['name'].lower()
    name_aliases = roa_name_aliases.get(name_lower, [])

    # First alias of this ROA name that TS has a bioavailability entry for.
    # The generator must yield the alias string itself (not the result of the
    # membership test), because alias_found is used as a lookup key below.
    alias_found = next(
        (name_alias for name_alias in name_aliases
         if name_alias in ts_bioavailability), None)
    # TS has bioavailability if name or any name alias is found
    if name_lower in ts_bioavailability or alias_found:
        # prefer the entry under the ROA's own name, fall back to the alias
        existing_roa['bioavailability'] = ts_bioavailability.get(
            name_lower) or ts_bioavailability.get(alias_found)
|
||||||
|
|
||||||
|
# if existing ROA is missing dosage and TS has dosage, add
|
||||||
|
if (not existing_roa.get('dosage') or not len(existing_roa['dosage'])) and ('dosage' in ts_roa and ts_roa['dosage'] and len(ts_roa['dosage'])):
|
||||||
existing_roa['dosage'] = ts_roa['dosage']
|
existing_roa['dosage'] = ts_roa['dosage']
|
||||||
|
|
||||||
# if TS has duration but existing ROA is missing duration, add
|
# if existing ROA is missing duration and TS has duration, add
|
||||||
if ('duration' in ts_roa and ts_roa['duration'] and len(ts_roa['duration'])) and ('duration' not in existing_roa or not existing_roa['duration'] or not len(existing_roa['duration'])):
|
if (not existing_roa.get('duration') or not len(existing_roa['duration'])) and ('duration' in ts_roa and ts_roa['duration'] and len(ts_roa['duration'])):
|
||||||
existing_roa['duration'] = ts_roa['duration']
|
existing_roa['duration'] = ts_roa['duration']
|
||||||
|
|
||||||
interactions = None
|
interactions = None
|
||||||
@ -471,7 +491,6 @@ for name in all_substance_names:
|
|||||||
'addictionPotential': addiction_potential,
|
'addictionPotential': addiction_potential,
|
||||||
'tolerance': tolerance,
|
'tolerance': tolerance,
|
||||||
'crossTolerances': cross_tolerances,
|
'crossTolerances': cross_tolerances,
|
||||||
'bioavailability': bioavailability,
|
|
||||||
'roas': roas,
|
'roas': roas,
|
||||||
'interactions': interactions,
|
'interactions': interactions,
|
||||||
})
|
})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user