"""MIT License
|
|
|
|
Copyright (c) 2021 Noah Saso
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in all
|
|
copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
SOFTWARE."""
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
# downloads and exports data on all substances from psychonautwiki and tripsit factsheets, combining to form master list with standardized format
|
|
# prioritizes psychonautwiki ROA info (dose/duration) over tripsit factsheets
|
|
# pip3 install beautifulsoup4 requests python-graphql-client
|
|
|
|

import json
import os
import re
import traceback

import requests
from bs4 import BeautifulSoup
from python_graphql_client import GraphqlClient

from TrainingDataGen import DataGen

headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}
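
# Note: the Access-Control-* entries above are CORS response headers and are
# most likely ignored by the servers; the browser User-Agent is what matters
# for the page scrapes below.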

ts_api_url = "https://tripbot.tripsit.me/api/tripsit/getAllDrugs"
ps_api_url = "https://api.psychonautwiki.org"
ps_client = GraphqlClient(endpoint=ps_api_url, headers=headers)


def substance_name_match(name, substance):
    """Check if name matches the substance's name, pretty_name, or any alias."""
    lower_name = name.lower()
    return any(
        [
            lower_name == substance[key].lower()
            for key in ["name", "pretty_name"]
            if key in substance
        ]
        + [lower_name == alias.lower() for alias in substance.get("aliases", [])]
    )


def find_substance_in_data(data, name):
    return next((s for s in data if substance_name_match(name, s)), None)
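
# Example (hypothetical data):
#   find_substance_in_data([{"name": "LSD", "aliases": ["acid"]}], "acid")
#   -> {"name": "LSD", "aliases": ["acid"]}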

roa_name_aliases = {
    "iv": ["intravenous"],
    "intravenous": ["iv"],
    "im": ["intramuscular"],
    "intramuscular": ["im"],
    "insufflated": ["snorted"],
    "snorted": ["insufflated"],
    "vaporized": ["vapourized"],
    "vapourized": ["vaporized"],
}


def roa_matches_name(roa, name):
    aliases = roa_name_aliases.get(name.lower(), [])
    return roa["name"].lower() == name.lower() or roa["name"].lower() in aliases
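
# e.g. roa_matches_name({"name": "IV"}, "intravenous") -> True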


# get tripsit data

ts_dose_order = ["Threshold", "Light", "Common", "Strong", "Heavy"]
ts_combo_ignore = ["benzos"]  # duplicate of the "benzodiazepines" key
# prettify names in interaction list
ts_combo_transformations = {
    "lsd": "LSD",
    "mushrooms": "Mushrooms",
    "dmt": "DMT",
    "mescaline": "Mescaline",
    "dox": "DOx",
    "nbomes": "NBOMes",
    "2c-x": "2C-x",
    "2c-t-x": "2C-T-x",
    "amt": "aMT",
    "5-meo-xxt": "5-MeO-xxT",
    "cannabis": "Cannabis",
    "ketamine": "Ketamine",
    "mxe": "MXE",
    "dxm": "DXM",
    "pcp": "PCP",
    "nitrous": "Nitrous",
    "amphetamines": "Amphetamines",
    "mdma": "MDMA",
    "cocaine": "Cocaine",
    "caffeine": "Caffeine",
    "alcohol": "Alcohol",
    "ghb/gbl": "GHB/GBL",
    "opioids": "Opioids",
    "tramadol": "Tramadol",
    "benzodiazepines": "Benzodiazepines",
    "maois": "MAOIs",
    "ssris": "SSRIs",
}

ts_response = requests.get(ts_api_url)
ts_data = ts_response.json()["data"][0]

ts_substances_data = list(ts_data.values())
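
# Each value in ts_data is one substance dict; the keys used below include
# "name", "pretty_name", "aliases", "properties", "links", "combos",
# "formatted_dose", "formatted_onset", "formatted_duration", and
# "formatted_aftereffects" (shape inferred from usage in this script).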


# TS has durations split over a few keys, so this finds or creates the
# duration entry for the associated ROA and adds a new line item.
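# Typical payload shapes (inferred from the handling below):
#   {"_unit": "hours", "value": "4-6"}                      -> applies to all ROAs
#   {"_unit": "hours", "oral": "4-6", "insufflated": "2-4"} -> per-ROA values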
def ts_add_formatted_duration(ts_roas, formatted_duration, duration_name):
    units = formatted_duration.get("_unit", "") or ""
    if "_unit" in formatted_duration:
        formatted_duration.pop("_unit")

    def add_to_roa(roa, value):
        if "duration" not in roa:
            roa["duration"] = []

        roa["duration"].append({"name": duration_name, "value": value})

    for roa_name, value in formatted_duration.items():
        value_string = f"{value} {units}".strip()

        # if value present (i.e. just one value for all ROA doses provided above), apply to all ROAs
        if roa_name == "value":
            # if TS did not add any doses, do nothing with this value
            # we could theoretically apply this to all PW doses with missing
            # durations, but we can't be sure it applies to all ROAs, so just ignore
            if not len(ts_roas):
                break

            for ts_roa in ts_roas:
                add_to_roa(ts_roa, value_string)

        # add to matching ROA or create new ROA if doesn't exist
        else:
            ts_roa = next(
                (ts_roa for ts_roa in ts_roas if roa_matches_name(ts_roa, roa_name)),
                None,
            )
            # if ROA doesn't exist, make new
            if not ts_roa:
                ts_roa = {"name": roa_name}
                ts_roas.append(ts_roa)

            add_to_roa(ts_roa, value_string)


# get psychonautwiki data


def pw_clean_common_name(name):
    name = re.sub(r'^"', "", name)
    name = re.sub(r'"$', "", name)
    name = re.sub(r'"?\[\d*\]$', "", name)
    name = re.sub(r"\s*More names\.$", "", name)
    name = re.sub(r"\.$", "", name)
    name = re.sub(r"\(.*\)", "", name)
    return name.strip()
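
# e.g. pw_clean_common_name('"Lucy"[2]') -> 'Lucy'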


def pw_should_skip(name, soup):
    return (
        name.startswith("Experience:") or len(soup.find_all(text="Common names")) == 0
    )


pw_substance_data = []

if os.path.exists("ts_pn_data/_cached_pw_substances.json"):
    with open("ts_pn_data/_cached_pw_substances.json") as f:
        pw_substance_data = json.load(f)
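
# If the cache file above existed and was non-empty, the scrape below is
# skipped entirely; delete ts_pn_data/_cached_pw_substances.json to refresh.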
if not len(pw_substance_data):
    pw_substance_urls_query = """
    {
        substances(limit: 11000) {
            name
            url
        }
    }
    """

    pw_substance_urls_data = ps_client.execute(query=pw_substance_urls_query)["data"][
        "substances"
    ]

    for idx, substance in enumerate(pw_substance_urls_data):
        try:
            url = substance["url"]
            # headers must be passed as a keyword argument: the second
            # positional argument of requests.get is params, not headers
            substance_req = requests.get(url, headers=headers)
            substance_soup = BeautifulSoup(substance_req.content, "html.parser")

            name = substance_soup.find("h1", id="firstHeading").text
            if pw_should_skip(name, substance_soup):
                print(f"Skipping {name} ({idx + 1} / {len(pw_substance_urls_data)})")
                continue

            # get aliases text
            common_names_str = substance_soup.find_all(text="Common names")

            cleaned_common_names = (
                set(
                    map(
                        pw_clean_common_name,
                        common_names_str[0]
                        .parent.find_next_sibling("td")
                        .text.split(", "),
                    )
                )
                if len(common_names_str) > 0
                else set()
            )
            cleaned_common_names.add(substance["name"])
            print(cleaned_common_names)
            # don't include name in list of other common names
            common_names = sorted(filter(lambda n: n != name, cleaned_common_names))

            # scrape ROAs from page
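            # Walks consecutive table rows while they carry a ROARowHeader
            # cell, collecting {"name": ..., "value": ..., "note"?: ...} dicts;
            # returns the collected rows plus the first row after the run so
            # the caller can continue into the duration sub-table.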
            def get_data_starting_at_row(curr_row):
                rows = []
                while curr_row.find("th", {"class": "ROARowHeader"}):
                    row = {}
                    row["name"] = (
                        curr_row.find("th", {"class": "ROARowHeader"}).find("a").text
                    )

                    row_values = curr_row.find("td", {"class": "RowValues"})

                    row_value_text = row_values.find_all(text=True, recursive=False)
                    if len(row_value_text):
                        row["value"] = "".join(row_value_text).strip()
                    else:
                        row["value"] = None

                    row_note = row_values.find("span")
                    if row_note:
                        row["note"] = re.sub(r"\s*\[\d*\]$", "", row_note.text).strip()

                    rows.append(row)

                    curr_row = curr_row.find_next("tr")
                return rows, curr_row

            roas = []

            dose_charts = substance_soup.find_all("tr", {"class": "dosechart"})
            for dose_chart in dose_charts:
                table = dose_chart.parent.parent
                roa_name = table.find("tr").find("a").text
                if not roa_name:
                    continue

                roa = {
                    "name": roa_name,
                    "dosage": [],
                    "duration": [],
                }

                # dosage
                curr_row = dose_chart.find_next("tr")
                roa["dosage"], curr_row = get_data_starting_at_row(curr_row)

                # extract bioavailability
                if len(roa["dosage"]) and roa["dosage"][0]["name"] == "Bioavailability":
                    bioavailability = roa["dosage"].pop(0)
                    roa["bioavailability"] = bioavailability["value"]

                # duration
                if curr_row.find("th", {"class": "ROASubHeader"}):
                    curr_row = curr_row.find_next("tr")
                    roa["duration"], _ = get_data_starting_at_row(curr_row)

                if not len(roa["dosage"]):
                    roa["dosage"] = None
                if not len(roa["duration"]):
                    roa["duration"] = None

                roas.append(roa)
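
            # Each entry of roas now looks like (hypothetical values):
            #   {"name": "Oral", "bioavailability": "80%",
            #    "dosage": [{"name": "Common", "value": "10 - 20 mg"}],
            #    "duration": [{"name": "Total", "value": "4 - 6 hours"}]}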

            # query PS API for more data on substance

            query = (
                """
            {
                substances(query: "%s") {
                    name
                    class {
                        chemical
                        psychoactive
                    }
                    tolerance {
                        full
                        half
                        zero
                    }
                    toxicity
                    addictionPotential
                    crossTolerances
                }
            }
            """
                % substance["name"]
            )
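
            # Note: the substance name is spliced directly into the query
            # string above, so a name containing a double quote would produce
            # an invalid GraphQL document.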

            data = ps_client.execute(query=query)["data"]["substances"]
            if len(data) == 0:
                continue
            elif len(data) > 1:
                # should never happen?
                print(f"{name} has more than one dataset... investigate why")

            data = data[0]
            if "name" in data:
                data.pop("name")

            pw_substance_data.append(
                {
                    "url": url,
                    "name": name,
                    "aliases": common_names,
                    "roas": roas,
                    "data": data,
                }
            )
            print(
                f"Done with {name} [{len(roas)} ROA(s)] ({idx + 1} / {len(pw_substance_urls_data)})"
            )

        except KeyboardInterrupt:
            print("\nScrape canceled")
            exit(0)
        except Exception:
            # use substance["name"]: the local name may not be bound yet if
            # the page request itself failed
            print(f"{substance['name']} failed:")
            print(traceback.format_exc())
            exit(1)

    with open("ts_pn_data/_cached_pw_substances.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(pw_substance_data, indent=2, ensure_ascii=False))


# combine tripsit and psychonautwiki data

all_substance_names = sorted(
    set(
        list(map(lambda s: s.get("name", "").lower(), pw_substance_data))
        + list(map(lambda s: s.get("name", "").lower(), ts_substances_data))
    )
)
substance_data = []
x = 0

for name in all_substance_names:
    # find PW substance
    pw_substance = find_substance_in_data(pw_substance_data, name)
    # remove to get rid of duplicates in final output
    if pw_substance:
        pw_substance_data.remove(pw_substance)
    else:
        pw_substance = {}

    # find TS substance
    ts_substance = find_substance_in_data(ts_substances_data, name)
    # remove to get rid of duplicates in final output
    if ts_substance:
        ts_substances_data.remove(ts_substance)
    else:
        ts_substance = {}

    # if no substance found in either dataset, skip
    if not pw_substance and not ts_substance:
        continue

    ts_properties = ts_substance.get("properties", {})

    # url always exists for a psychonautwiki substance, so the tripsit
    # substance must exist if url is None
    url = pw_substance.get("url") or f"https://drugs.tripsit.me/{ts_substance['name']}"

    ts_links = ts_substance.get("links", {})
    experiences_url = ts_links.get("experiences")

    # pick display name from available substances found in both datasets
    names = list(
        filter(
            lambda n: n is not None and len(n) > 0,
            [pw_substance.get("name"), ts_substance.get("pretty_name")],
        )
    )
    # people use shorter names
    name = min(names, key=len)

    # lowercase list of all names, excluding chosen name above
    aliases = set(
        map(
            lambda n: n.lower(),
            filter(
                lambda n: n is not None and len(n) > 0,
                [pw_substance.get("name"), ts_substance.get("pretty_name")]
                + pw_substance.get("aliases", [])
                + ts_substance.get("aliases", []),
            ),
        )
    )
    if name.lower() in aliases:
        aliases.remove(name.lower())
    aliases = sorted(aliases)

    summary = ts_properties.get("summary", "").strip()
    if not len(summary):
        summary = None

    test_kits = ts_properties.get("test-kits", "").strip()
    if not len(test_kits):
        test_kits = None
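
    # Illustrative input: "Oral: 80%. Insufflated: 60%" parses to
    # {"oral": "80%", "insufflated": "60%"}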
    ts_bioavailability_str = ts_properties.get("bioavailability", "").strip()
    ts_bioavailability = {}
    if len(ts_bioavailability_str):
        matches = re.findall(
            r"([a-zA-Z\/]+)[.:\s]+([0-9\.%\s\+/\-]+)", ts_bioavailability_str
        )
        if len(matches):
            for roa_name, value in matches:
                ts_bioavailability[roa_name.lower()] = value.strip(". \t")

    pw_data = pw_substance.get("data", {})

    classes = pw_data.get("class")
    toxicity = pw_data.get("toxicity")
    addiction_potential = pw_data.get("addictionPotential")
    tolerance = pw_data.get("tolerance")
    cross_tolerances = pw_data.get("crossTolerances")

    roas = []

    # get PW ROAs
    pw_roas = pw_substance.get("roas", [])

    # process TS ROAs
    ts_roas = []

    # TS ROA dosage
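    # formatted_dose maps ROA name -> {level: value} (shape inferred from
    # usage), e.g. {"Oral": {"Common": "75-150mg", "Strong": "150-225mg"}}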
    ts_formatted_dose = ts_substance.get("formatted_dose")
    if ts_formatted_dose:
        for roa_name, dose_data in ts_formatted_dose.items():
            dose_levels = []
            for dose_level in ts_dose_order:
                value_string = dose_data.get(dose_level)
                if value_string is None:
                    continue

                dose_levels.append(
                    {
                        "name": dose_level,
                        "value": value_string,
                    }
                )

            if len(dose_levels):
                ts_roas.append({"name": roa_name, "dosage": dose_levels})

    # TS ROA durations
    ts_formatted_onset = ts_substance.get("formatted_onset")
    if ts_formatted_onset:
        ts_add_formatted_duration(ts_roas, ts_formatted_onset, "Onset")

    ts_formatted_duration = ts_substance.get("formatted_duration")
    if ts_formatted_duration:
        ts_add_formatted_duration(ts_roas, ts_formatted_duration, "Duration")

    ts_formatted_aftereffects = ts_substance.get("formatted_aftereffects")
    if ts_formatted_aftereffects:
        ts_add_formatted_duration(ts_roas, ts_formatted_aftereffects, "After effects")

    # merge PW and TS ROAs:
    # prioritize PW for ROAs but use TS to fill in gaps

    roas.extend(pw_roas)
    for ts_roa in ts_roas:
        existing_roa = next(
            (roa for roa in roas if roa_matches_name(roa, ts_roa["name"])), None
        )
        # if ROA does not exist, add it; we still want to fill in
        # bioavailability below, so don't skip to the next one
        if not existing_roa:
            existing_roa = ts_roa
            roas.append(existing_roa)

        # if ROA does not already have bioavailability, try to get it from TS
        if not existing_roa.get("bioavailability"):
            name_lower = ts_roa["name"].lower()
            name_aliases = roa_name_aliases.get(name_lower, [])

            # find the first alias present in ts_bioavailability (the original
            # generator yielded booleans instead of the alias itself)
            alias_found = next(
                (
                    name_alias
                    for name_alias in name_aliases
                    if name_alias in ts_bioavailability
                ),
                None,
            )
            # TS has bioavailability if the name or any name alias is found
            if name_lower in ts_bioavailability or alias_found:
                existing_roa["bioavailability"] = ts_bioavailability.get(
                    name_lower
                ) or ts_bioavailability.get(alias_found)

        # if existing ROA is missing dosage and TS has dosage, add it
        if (not existing_roa.get("dosage") or not len(existing_roa["dosage"])) and (
            "dosage" in ts_roa and ts_roa["dosage"] and len(ts_roa["dosage"])
        ):
            existing_roa["dosage"] = ts_roa["dosage"]

        # if existing ROA is missing duration and TS has duration, add it
        if (not existing_roa.get("duration") or not len(existing_roa["duration"])) and (
            "duration" in ts_roa and ts_roa["duration"] and len(ts_roa["duration"])
        ):
            existing_roa["duration"] = ts_roa["duration"]

    interactions = None
    combos = ts_substance.get("combos")
    if combos:
        interactions = []
        for key, combo_data in combos.items():
            if key in ts_combo_ignore:
                continue

            combo_data["name"] = ts_combo_transformations[key]
            interactions.append(combo_data)
        interactions = sorted(interactions, key=lambda i: i["name"])

    if classes is not None:
        chemical_class = classes["chemical"]
        psychoactive_class = classes["psychoactive"]
    else:
        chemical_class = None
        psychoactive_class = None
    substance_data.append(
        {
            "id": x,
            "name": name,
            "aliases": list(aliases),
            "aliasesStr": ",".join(aliases),
            "url": url,
            "experiencesUrl": experiences_url,
            "summary": summary,
            "reagents": test_kits,
            "classes": classes,
            "chemicalClass": chemical_class,
            "psychoactiveClass": psychoactive_class,
            "toxicity": toxicity,
            "addictionPotential": addiction_potential,
            "tolerance": tolerance,
            "crossTolerances": cross_tolerances,
            "roas": roas,
            "interactions": interactions,
        }
    )
    x += 1


# output

substances_json = {}
substances_json["substances"] = substance_data
with open("ts_pn_data/substances_data.json", "w", encoding="utf-8") as f:
    json.dump(substances_json, fp=f, ensure_ascii=False, indent=2)


substance_aliases = {}
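
# Writes a Rasa NLU lookup table (version 2.0 training data format), e.g.:
#   version: "2.0"
#   nlu:
#   - lookup: substance
#     examples: |
#       - LSD
#       - acid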
with open("data/lookups/substances.yml", "w", encoding="utf-8") as fp:
|
|
# Lookup Table
|
|
fp.write("""version: "2.0"\nnlu:\n- lookup: substance\n examples: |\n""")
|
|
for drug in substances_json["substances"]:
|
|
name = re.sub(r"\(.*\)", "", drug["name"])
|
|
fp.write(f" - {name}\n")
|
|
# Add aliases to lookup table too
|
|
for y in drug["aliases"]:
|
|
y = re.sub(r"\(.*\)", "", y)
|
|
# Check for "or" in aliases and remove
|
|
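            # note: the `elif "or " in y` fallback below also matches aliases
            # that merely contain "or " inside another word (e.g. "razor "),
            # not just aliases of the form "X or Y"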
if " or " in y:
|
|
aliases = y.split(" or ")
|
|
fp.write(f" - {aliases[0]}\n")
|
|
fp.write(f" - {aliases[1]}\n")
|
|
elif "or " in y:
|
|
aliases = y.split("or ")
|
|
fp.write(f" - {aliases[1]}\n")
|
|
else:
|
|
fp.write(f" - {y}\n")
|
|
fp.write("\n")
|
|
# Synonyms to map aliases to one entity
|
|
for drug in substances_json["substances"]:
|
|
# Skip adding synonym if there are no aliases
|
|
name = re.sub(r"\(.*\)", "", drug["name"])
|
|
substance_aliases[name] = []
|
|
if drug["aliases"] == []:
|
|
continue
|
|
fp.write(f"- synonym: {name}\n examples: |\n")
|
|
for y in drug["aliases"]:
|
|
y = re.sub(r"\(.*\)", "", y)
|
|
# Check for "or" in aliases and remove
|
|
if " or " in y:
|
|
aliases = y.split(" or ")
|
|
fp.write(f" - {aliases[0]}\n")
|
|
fp.write(f" - {aliases[1]}\n")
|
|
substance_aliases[name].append(aliases[0])
|
|
substance_aliases[name].append(aliases[1])
|
|
elif "or " in y:
|
|
aliases = y.split("or ")
|
|
fp.write(f" - {aliases[1]}\n")
|
|
substance_aliases[name].append(aliases[1])
|
|
else:
|
|
fp.write(f" - {y}\n")
|
|
substance_aliases[name].append(y)
|
|
|
|
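
# DataGen.combo_gen() renders intent training data from the combined substance
# list (see the TrainingDataGen module imported above).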
with open("ts_pn_data/generated_intents.yml", "w", encoding="utf-8") as fp:
|
|
fp.write(DataGen(substances_json).combo_gen())
|