pablo-bot/ts_pn_data/getData.py
Kian Channon 8c7d91d097 Changes
2021-10-14 12:34:07 +02:00

636 lines
21 KiB
Python

"""MIT License
Copyright (c) 2021 Noah Saso
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE."""
#!/usr/bin/env python3
# downloads and exports data on all substances from psychonautwiki and tripsit factsheets, combining to form master list with standardized format
# prioritizes psychonautwiki ROA info (dose/duration) over tripsit factsheets
# pip3 install beautifulsoup4 requests python-graphql-client
import requests
from bs4 import BeautifulSoup
from python_graphql_client import GraphqlClient
import json
import os
import re
import traceback
from TrainingDataGen import DataGen
# Header set handed to the PsychonautWiki GraphQL client below and to the
# page requests made by the scraper further down.
# NOTE(review): the Access-Control-* entries are CORS *response* headers;
# presumably they have no effect when sent by a client -- confirm before
# relying on (or removing) them.
headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}
# TripSit endpoint; fetched once below and parsed as JSON.
ts_api_url = "https://tripbot.tripsit.me/api/tripsit/getAllDrugs"
# PsychonautWiki API root, used as the GraphQL endpoint.
ps_api_url = "https://api.psychonautwiki.org"
# Shared GraphQL client for every PsychonautWiki query in this script.
ps_client = GraphqlClient(endpoint=ps_api_url, headers=headers)
def substance_name_match(name, substance):
    """Return True when ``name`` equals, ignoring case, one of the substance's names.

    The values compared are ``substance["name"]`` and
    ``substance["pretty_name"]`` (each only when the key is present) plus every
    entry of ``substance["aliases"]``.

    Values that are not strings (for example ``None``) are skipped instead of
    raising, and a missing or ``None`` alias list is treated as empty.
    """
    lower_name = name.lower()
    candidates = [substance[key] for key in ("name", "pretty_name") if key in substance]
    candidates.extend(substance.get("aliases") or [])
    # generator form lets any() stop at the first hit
    return any(
        isinstance(candidate, str) and candidate.lower() == lower_name
        for candidate in candidates
    )
def find_substance_in_data(data, name):
    """Return the first entry of ``data`` that ``substance_name_match`` accepts for ``name``.

    Returns ``None`` when no entry matches.
    """
    for candidate in data:
        if substance_name_match(name, candidate):
            return candidate
    return None
# Alternative spellings for route-of-administration names, keyed by lowercase
# name. Every pair is registered in both directions so a lookup works no matter
# which spelling a data source uses.
roa_name_aliases = {
    label: [synonym]
    for first, second in (
        ("iv", "intravenous"),
        ("im", "intramuscular"),
        ("insufflated", "snorted"),
        ("vaporized", "vapourized"),
    )
    for label, synonym in ((first, second), (second, first))
}
def roa_matches_name(roa, name):
    """Tell whether the ROA dict ``roa`` is the route called ``name``.

    The comparison ignores case and also succeeds when ``roa["name"]`` is one
    of the spellings listed for ``name`` in ``roa_name_aliases``.
    """
    wanted = name.lower()
    candidate = roa["name"].lower()
    if candidate == wanted:
        return True
    return candidate in roa_name_aliases.get(wanted, [])
# get tripsit data
# Dose level names, in the order they are copied out of a TripSit
# "formatted_dose" entry when the ROA dosage lists are built below.
ts_dose_order = ["Threshold", "Light", "Common", "Strong", "Heavy"]
# Combo keys left out of the interaction list.
ts_combo_ignore = ["benzos"] # duplicate
# prettify names in interaction list
# Maps a TripSit combo key to the display name written into each interaction.
ts_combo_transformations = {
    "lsd": "LSD",
    "mushrooms": "Mushrooms",
    "dmt": "DMT",
    "mescaline": "Mescaline",
    "dox": "DOx",
    "nbomes": "NBOMes",
    "2c-x": "2C-x",
    "2c-t-x": "2C-T-x",
    "amt": "aMT",
    "5-meo-xxt": "5-MeO-xxT",
    "cannabis": "Cannabis",
    "ketamine": "Ketamine",
    "mxe": "MXE",
    "dxm": "DXM",
    "pcp": "PCP",
    "nitrous": "Nitrous",
    "amphetamines": "Amphetamines",
    "mdma": "MDMA",
    "cocaine": "Cocaine",
    "caffeine": "Caffeine",
    "alcohol": "Alcohol",
    "ghb/gbl": "GHB/GBL",
    "opioids": "Opioids",
    "tramadol": "Tramadol",
    "benzodiazepines": "Benzodiazepines",
    "maois": "MAOIs",
    "ssris": "SSRIs",
}
# Download all TripSit factsheets in one request.
# The timeout stops a stalled connection from hanging the script forever, and
# raise_for_status() reports an HTTP error directly instead of letting it show
# up later as a confusing JSON/KeyError failure.
ts_response = requests.get(ts_api_url, timeout=60)
ts_response.raise_for_status()
# The payload keeps the substances in a dict under data[0]; only the values
# (one dict per substance) are needed from here on.
ts_data = ts_response.json()["data"][0]
ts_substances_data = list(ts_data.values())
# TS has durations split over a few keys, so this finds or creates the duration for the associated ROA
# and adds a new line item
def ts_add_formatted_duration(ts_roas, formatted_duration, duration_name):
    """Append one duration line item to the matching entries of ``ts_roas``.

    Args:
        ts_roas: list of ROA dicts (each with a ``"name"``); modified in place.
            ROAs named in ``formatted_duration`` that are not in the list yet
            are appended.
        formatted_duration: TripSit mapping of ROA name -> value. The optional
            ``"_unit"`` entry is appended to every value, and the special key
            ``"value"`` means "one value for every ROA already in ``ts_roas``".
            This mapping is only read, never modified.
        duration_name: label stored as the ``"name"`` of each added line item.
    """
    units = formatted_duration.get("_unit", "") or ""

    def add_to_roa(roa, value):
        roa.setdefault("duration", []).append({"name": duration_name, "value": value})

    for roa_name, value in formatted_duration.items():
        # "_unit" is metadata, not an ROA; skip it rather than popping it out
        # of the caller's dict
        if roa_name == "_unit":
            continue
        value_string = f"{value} {units}".strip()
        # if value present (i.e. just one value for all ROA doses provided above), apply to all ROAs
        if roa_name == "value":
            # if TS did not add any doses, do nothing with this value
            # we could theoretically apply this to all PW doses with missing durations, but we can't be sure
            # if it applies to all ROAs, so just ignore
            if not ts_roas:
                break
            for ts_roa in ts_roas:
                add_to_roa(ts_roa, value_string)
        # add to matching ROA or create new ROA if doesn't exist
        else:
            ts_roa = next(
                (known for known in ts_roas if roa_matches_name(known, roa_name)),
                None,
            )
            # if ROA doesn't exist, make new
            if ts_roa is None:
                ts_roa = {"name": roa_name}
                ts_roas.append(ts_roa)
            add_to_roa(ts_roa, value_string)
# get psychonautwiki data
def pw_clean_common_name(name):
    """Normalize one fragment of a PsychonautWiki "Common names" cell.

    Removes, in order: a leading and a trailing double quote, trailing
    citation markers such as ``[1]`` (including several in a row and an
    optional quote just before them), a trailing "More names." link text, a
    trailing period, and any parenthesised text. The result is stripped of
    surrounding whitespace.
    """
    # strip first: the patterns below are anchored with ^ and $, so stray
    # whitespace around the fragment would stop them from matching
    name = name.strip()
    name = re.sub(r'^"', "", name)
    name = re.sub(r'"$', "", name)
    name = re.sub(r'"?(?:\[\d*\])+$', "", name)
    name = re.sub(r"\s*More names\.$", "", name)
    name = re.sub(r"\.$", "", name)
    name = re.sub(r"\(.*\)", "", name)
    return name.strip()
def pw_should_skip(name, soup):
    """Decide whether a scraped page should be skipped.

    A page is skipped when its title starts with "Experience:" or when the
    parsed page contains no "Common names" text node.
    """
    if name.startswith("Experience:"):
        return True
    return not soup.find_all(text="Common names")
# On-disk cache of the scraped PsychonautWiki substances.
_PW_CACHE_PATH = "ts_pn_data/_cached_pw_substances.json"


def get_data_starting_at_row(curr_row):
    """Collect consecutive ROA rows of a dose chart, starting at ``curr_row``.

    Rows are consumed while the current ``<tr>`` contains a
    ``th.ROARowHeader``. Each consumed row becomes a dict with:

    * ``"name"``: text of the link inside the row header,
    * ``"value"``: the direct text nodes of ``td.RowValues`` joined and
      stripped, or ``None`` when there are none,
    * ``"note"`` (optional): text of the first ``<span>`` in the value cell
      with a trailing ``[n]`` citation removed.

    Returns:
        ``(rows, next_row)`` where ``next_row`` is the first row that was not
        consumed (``None`` when the document ran out of rows).
    """
    rows = []
    # find_next() returns None at the end of the document, so guard for it
    while curr_row is not None and curr_row.find("th", {"class": "ROARowHeader"}):
        row_header = curr_row.find("th", {"class": "ROARowHeader"})
        row = {"name": row_header.find("a").text}
        row_values = curr_row.find("td", {"class": "RowValues"})
        row_value_text = row_values.find_all(text=True, recursive=False)
        row["value"] = "".join(row_value_text).strip() if len(row_value_text) else None
        row_note = row_values.find("span")
        if row_note:
            row["note"] = re.sub(r"\s*\[\d*\]$", "", row_note.text).strip()
        rows.append(row)
        curr_row = curr_row.find_next("tr")
    return rows, curr_row


# Reuse a previous scrape when a cache file exists. It is read as UTF-8 because
# that is how it is written below.
pw_substance_data = []
if os.path.exists(_PW_CACHE_PATH):
    with open(_PW_CACHE_PATH, encoding="utf-8") as f:
        pw_substance_data = json.load(f)

# No usable cache: list every substance through the API, scrape each wiki page
# for aliases and ROA tables, then query the API again for class/tolerance data.
if not len(pw_substance_data):
    pw_substance_urls_query = """
    {
        substances(limit: 11000) {
            name
            url
        }
    }
    """
    pw_substance_urls_data = ps_client.execute(query=pw_substance_urls_query)["data"][
        "substances"
    ]
    for idx, substance in enumerate(pw_substance_urls_data):
        # seed the name so the failure message below is meaningful even when
        # the page heading was never reached
        name = substance.get("name", "unknown substance")
        try:
            url = substance["url"]
            # headers must be passed by keyword: the second positional
            # argument of requests.get() is ``params`` (the query string)
            substance_req = requests.get(url, headers=headers)
            substance_soup = BeautifulSoup(substance_req.content, "html.parser")
            name = substance_soup.find("h1", id="firstHeading").text
            if pw_should_skip(name, substance_soup):
                print(f"Skipping {name} ({idx + 1} / {len(pw_substance_urls_data)})")
                continue

            # get aliases text
            common_names_str = substance_soup.find_all(text="Common names")
            if len(common_names_str) > 0:
                names_cell = common_names_str[0].parent.find_next_sibling("td")
                cleaned_common_names = set(
                    map(pw_clean_common_name, names_cell.text.split(", "))
                )
            else:
                cleaned_common_names = set()
            cleaned_common_names.add(substance["name"])
            print(cleaned_common_names)
            # don't include name in list of other common names
            common_names = sorted(filter(lambda n: n != name, cleaned_common_names))

            # scrape ROAs from page
            roas = []
            dose_charts = substance_soup.find_all("tr", {"class": "dosechart"})
            for dose_chart in dose_charts:
                table = dose_chart.parent.parent
                roa_name = table.find("tr").find("a").text
                if not roa_name:
                    continue
                roa = {
                    "name": roa_name,
                    "dosage": [],
                    "duration": [],
                }
                # dosage
                curr_row = dose_chart.find_next("tr")
                roa["dosage"], curr_row = get_data_starting_at_row(curr_row)
                # extract bioavailability
                if len(roa["dosage"]) and roa["dosage"][0]["name"] == "Bioavailability":
                    bioavailability = roa["dosage"].pop(0)
                    roa["bioavailability"] = bioavailability["value"]
                # duration
                if curr_row is not None and curr_row.find("th", {"class": "ROASubHeader"}):
                    curr_row = curr_row.find_next("tr")
                    roa["duration"], _ = get_data_starting_at_row(curr_row)
                if not len(roa["dosage"]):
                    roa["dosage"] = None
                if not len(roa["duration"]):
                    roa["duration"] = None
                roas.append(roa)

            # query PS API for more data on substance
            # json.dumps() yields a double-quoted, escaped literal, so a name
            # containing quotes or backslashes cannot break the query text
            query = """
            {
                substances(query: %s) {
                    name
                    class {
                        chemical
                        psychoactive
                    }
                    tolerance {
                        full
                        half
                        zero
                    }
                    toxicity
                    addictionPotential
                    crossTolerances
                }
            }
            """ % json.dumps(substance["name"])
            data = ps_client.execute(query=query)["data"]["substances"]
            if len(data) == 0:
                continue
            elif len(data) > 1:
                # should never happen?
                print(f"{name} has more than one dataset... investigate why")
            data = data[0]
            if "name" in data:
                data.pop("name")
            pw_substance_data.append(
                {
                    "url": url,
                    "name": name,
                    "aliases": common_names,
                    "roas": roas,
                    "data": data,
                }
            )
            print(
                f"Done with {name} [{len(roas)} ROA(s)] ({idx + 1} / {len(pw_substance_urls_data)})"
            )
        except KeyboardInterrupt:
            print("\nScrape canceled")
            raise SystemExit(0)
        except Exception:
            # any failure aborts the whole scrape so a partial cache is never written
            print(f"{name} failed:")
            print(traceback.format_exc())
            raise SystemExit(1)

    with open(_PW_CACHE_PATH, "w", encoding="utf-8") as f:
        f.write(json.dumps(pw_substance_data, indent=2, ensure_ascii=False))
# combine tripsit and psychonautwiki data
# Every lowercase name known to either source, sorted. Each one is looked up in
# both datasets below, and matched entries are removed from their dataset so a
# substance reachable under several names is emitted only once.
all_substance_names = sorted(
    {s.get("name", "").lower() for s in pw_substance_data}
    | {s.get("name", "").lower() for s in ts_substances_data}
)
substance_data = []
x = 0  # running id of the next emitted substance
for name in all_substance_names:
    # find PW substance
    pw_substance = find_substance_in_data(pw_substance_data, name)
    # remove to get rid of duplicates in final output
    if pw_substance:
        pw_substance_data.remove(pw_substance)
    else:
        pw_substance = {}

    # find TS substance
    ts_substance = find_substance_in_data(ts_substances_data, name)
    # remove to get rid of duplicates in final output
    if ts_substance:
        ts_substances_data.remove(ts_substance)
    else:
        ts_substance = {}

    # if no substance found in either dataset, skip
    if not pw_substance and not ts_substance:
        continue

    ts_properties = ts_substance.get("properties", {})
    # url will always exist for psychonautwiki substance, so tripsit substance must exist if url is None
    url = pw_substance.get("url") or f"https://drugs.tripsit.me/{ts_substance['name']}"
    ts_links = ts_substance.get("links", {})
    experiences_url = ts_links.get("experiences")

    # pick display name from available substances found from both datasets
    primary_names = [pw_substance.get("name"), ts_substance.get("pretty_name")]
    names = [n for n in primary_names if n]
    if not names:
        # neither source offers a display name (e.g. a TS entry without
        # pretty_name); fall back to the key instead of crashing in min()
        names = [ts_substance.get("name") or name]
    # people use shorter names
    name = min(names, key=len)

    # lowercase list of all names, excluding chosen name above
    every_name = (
        primary_names
        + pw_substance.get("aliases", [])
        + ts_substance.get("aliases", [])
    )
    aliases = {n.lower() for n in every_name if n}
    aliases.discard(name.lower())
    aliases = sorted(aliases)

    # free-text TS properties; a missing, None or blank value becomes None
    summary = (ts_properties.get("summary") or "").strip() or None
    test_kits = (ts_properties.get("test-kits") or "").strip() or None

    # TS bioavailability is a single string; split it into {roa name: value}
    ts_bioavailability_str = (ts_properties.get("bioavailability") or "").strip()
    ts_bioavailability = {
        roa_name.lower(): value.strip(". \t")
        for roa_name, value in re.findall(
            r"([a-zA-Z\/]+)[.:\s]+([0-9\.%\s\+/\-]+)", ts_bioavailability_str
        )
    }

    pw_data = pw_substance.get("data", {})
    classes = pw_data.get("class")
    toxicity = pw_data.get("toxicity")
    addiction_potential = pw_data.get("addictionPotential")
    tolerance = pw_data.get("tolerance")
    cross_tolerances = pw_data.get("crossTolerances")

    # get PW ROAs
    pw_roas = pw_substance.get("roas", [])

    # process TS ROAs
    ts_roas = []
    # TS ROA dosage
    ts_formatted_dose = ts_substance.get("formatted_dose")
    if ts_formatted_dose:
        for roa_name, dose_data in ts_formatted_dose.items():
            dose_levels = [
                {"name": dose_level, "value": dose_data[dose_level]}
                for dose_level in ts_dose_order
                if dose_data.get(dose_level) is not None
            ]
            if dose_levels:
                ts_roas.append({"name": roa_name, "dosage": dose_levels})

    # TS ROA durations
    for ts_key, duration_label in (
        ("formatted_onset", "Onset"),
        ("formatted_duration", "Duration"),
        ("formatted_aftereffects", "After effects"),
    ):
        ts_formatted = ts_substance.get(ts_key)
        if ts_formatted:
            ts_add_formatted_duration(ts_roas, ts_formatted, duration_label)

    # merge PW and TS ROAs
    # prioritize PW for ROAs but use TS to fill in gaps
    roas = list(pw_roas)
    for ts_roa in ts_roas:
        existing_roa = next(
            (roa for roa in roas if roa_matches_name(roa, ts_roa["name"])), None
        )
        # if ROA does not exist, add
        if not existing_roa:
            existing_roa = ts_roa
            roas.append(existing_roa)
            # we want bioavailability from below, so don't skip

        # if ROA does not already have bioavailability, try to get from TS
        if not existing_roa.get("bioavailability"):
            name_lower = ts_roa["name"].lower()
            # look under the ROA name first, then under each alternative
            # spelling, and keep the first non-empty value
            ts_value = next(
                (
                    ts_bioavailability[key]
                    for key in [name_lower, *roa_name_aliases.get(name_lower, [])]
                    if ts_bioavailability.get(key)
                ),
                None,
            )
            if ts_value is not None:
                existing_roa["bioavailability"] = ts_value

        # if existing ROA is missing dosage and TS has dosage, add
        if not existing_roa.get("dosage") and ts_roa.get("dosage"):
            existing_roa["dosage"] = ts_roa["dosage"]
        # if existing ROA is missing duration and TS has duration, add
        if not existing_roa.get("duration") and ts_roa.get("duration"):
            existing_roa["duration"] = ts_roa["duration"]

    interactions = None
    combos = ts_substance.get("combos")
    if combos:
        interactions = []
        for key, combo_data in combos.items():
            if key in ts_combo_ignore:
                continue
            # an unknown combo key keeps its raw name instead of raising KeyError
            combo_data["name"] = ts_combo_transformations.get(key, key)
            interactions.append(combo_data)
        interactions = sorted(interactions, key=lambda i: i["name"])

    if classes is not None:
        chemical_class = classes["chemical"]
        psychoactive_class = classes["psychoactive"]
    else:
        chemical_class = None
        psychoactive_class = None

    substance_data.append(
        {
            "id": x,
            "name": name,
            "aliases": list(aliases),
            "aliasesStr": ",".join(aliases),
            "url": url,
            "experiencesUrl": experiences_url,
            "summary": summary,
            "reagents": test_kits,
            "classes": classes,
            "chemicalClass": chemical_class,
            "psychoactiveClass": psychoactive_class,
            "toxicity": toxicity,
            "addictionPotential": addiction_potential,
            "tolerance": tolerance,
            "crossTolerances": cross_tolerances,
            "roas": roas,
            "interactions": interactions,
        }
    )
    x += 1
# output
def _lookup_entries(raw_name):
    """Turn one raw name or alias into the entries written to the NLU files.

    Parenthesised text is removed, a leading "or " (left over from lists such
    as "a, b, or c") is dropped, and names joined by " or " are split into
    separate entries. Empty results are discarded.
    """
    cleaned = re.sub(r"\(.*\)", "", raw_name).strip()
    # only a *leading* "or " is list residue; "or " inside a word
    # (e.g. "razor blade") must be left alone
    if cleaned.startswith("or "):
        cleaned = cleaned[len("or "):]
    parts = (part.strip() for part in cleaned.split(" or "))
    return [part for part in parts if part]


# Full merged dataset as JSON.
substances_json = {"substances": substance_data}
with open("ts_pn_data/substances_data.json", "w", encoding="utf-8") as f:
    json.dump(substances_json, fp=f, ensure_ascii=False, indent=2)

# Cleaned display name -> list of alias entries, filled while the synonym
# section is written.
substance_aliases = {}
with open("data/lookups/substances.yml", "w", encoding="utf-8") as fp:
    # Lookup Table
    # "examples" is a key of the same list item as "lookup", so it is indented
    # two spaces and its block-scalar lines four.
    fp.write('version: "2.0"\nnlu:\n- lookup: substance\n  examples: |\n')
    for drug in substances_json["substances"]:
        for entry in _lookup_entries(drug["name"])[:1] or []:
            fp.write(f"    - {entry}\n")
        # Add aliases to lookup table too
        for alias in drug["aliases"]:
            for entry in _lookup_entries(alias):
                fp.write(f"    - {entry}\n")
    fp.write("\n")

    # Synonyms to map aliases to one entity
    for drug in substances_json["substances"]:
        name = re.sub(r"\(.*\)", "", drug["name"]).strip()
        entries = [
            entry for alias in drug["aliases"] for entry in _lookup_entries(alias)
        ]
        substance_aliases[name] = entries
        # Skip adding synonym if there are no aliases
        if not name or not entries:
            continue
        fp.write(f"- synonym: {name}\n  examples: |\n")
        for entry in entries:
            fp.write(f"    - {entry}\n")

# Generated training intents built from the merged dataset.
with open("ts_pn_data/generated_intents.yml", "w", encoding="utf-8") as fp:
    fp.write(DataGen(substances_json).combo_gen())