Using a better script to get drug data and updated requirements.txt

2025-12-24 07:17:55 +01:00 · 2021-09-29 23:03:13 +02:00 · 2021-09-29 23:03:13 +02:00 · faefd94f2a
commit faefd94f2a
parent 8243a5ba0f
6 changed files with 68095 additions and 35229 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1,4 @@
 rasa
-pandas
+pandas
+beautifulsoup4
+python-graphql-client
--- a/tripsit/getAllDrugs.json
+++ b/tripsit/getAllDrugs.json
--- a/tripsit/getAllDrugs.py
+++ b/tripsit/getAllDrugs.py
@ -1,27 +0,0 @@
-import requests, json
-import pandas as pd
-
-# API URL
-url = "http://tripbot.tripsit.me/api/tripsit/getAllDrugs"
-r = requests.get(url)
-data = r.json()
-
-# Format dict and load into df
-data = json.dumps(data["data"][0], indent=2, sort_keys=False, ensure_ascii=False)
-df = pd.DataFrame.from_dict(json.loads(data), orient="index")
-
-# Add id for each drug for rasa
-id = []
-for x in range(0, len(df)):
-    id.append(x)
-df["id"] = id
-
-# Write to JSON file
-with open("tripsit/getAllDrugs.json", "w") as fp:
-    # Clean NaN values
-    clean_data = {
-        k1: {k: v for k, v in v1.items() if v == v and v is not None}
-        for k1, v1 in df.to_dict("index").items()
-    }
-    # Set ensure_ascii to false to ensure we can keep greek letters (like alpha)
-    fp.write(json.dumps(clean_data, indent=2, ensure_ascii=False))
--- a/ts_pn_data/_cached_pw_substances.json
+++ b/ts_pn_data/_cached_pw_substances.json
--- a/ts_pn_data/getData.py
+++ b/ts_pn_data/getData.py
@ -0,0 +1,558 @@
+#!/usr/bin/env python3
+
+# downloads and exports data on all substances from psychonautwiki and tripsit factsheets, combining to form master list with standardized format
+# prioritizes psychonautwiki ROA info (dose/duration) over tripsit factsheets
+# pip3 install beautifulsoup4 requests python-graphql-client
+
+import requests
+from bs4 import BeautifulSoup
+from time import time, sleep
+from python_graphql_client import GraphqlClient
+import json
+import os
+import re
+import traceback
+
+headers = {
+    "Access-Control-Allow-Origin": "*",
+    "Access-Control-Allow-Methods": "GET",
+    "Access-Control-Allow-Headers": "Content-Type",
+    "Access-Control-Max-Age": "3600",
+    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
+}
+
+ts_api_url = "https://tripbot.tripsit.me/api/tripsit/getAllDrugs"
+ps_api_url = "https://api.psychonautwiki.org"
+ps_client = GraphqlClient(endpoint=ps_api_url, headers=headers)
+
+
+def substance_name_match(name, substance):
+    """check if name matches any value in keys we care about"""
+    lower_name = name.lower()
+    return any(
+        [
+            lower_name == substance[key].lower()
+            for key in ["name", "pretty_name"]
+            if key in substance
+        ]
+        + [lower_name == alias.lower() for alias in substance.get("aliases", [])]
+    )
+
+
+def find_substance_in_data(data, name):
+    return next((s for s in data if substance_name_match(name, s)), None)
+
+
+roa_name_aliases = {
+    "iv": ["intravenous"],
+    "intravenous": ["iv"],
+    "im": ["intramuscular"],
+    "intramuscular": ["im"],
+    "insufflated": ["snorted"],
+    "snorted": ["insufflated"],
+    "vaporized": ["vapourized"],
+    "vapourized": ["vaporized"],
+}
+
+
+def roa_matches_name(roa, name):
+    aliases = roa_name_aliases.get(name.lower(), [])
+    return roa["name"].lower() == name.lower() or roa["name"].lower() in aliases
+
+
+# get tripsit data
+
+
+ts_dose_order = ["Threshold", "Light", "Common", "Strong", "Heavy"]
+ts_combo_ignore = ["benzos"]  # duplicate
+# prettify names in interaction list
+ts_combo_transformations = {
+    "lsd": "LSD",
+    "mushrooms": "Mushrooms",
+    "dmt": "DMT",
+    "mescaline": "Mescaline",
+    "dox": "DOx",
+    "nbomes": "NBOMes",
+    "2c-x": "2C-x",
+    "2c-t-x": "2C-T-x",
+    "amt": "aMT",
+    "5-meo-xxt": "5-MeO-xxT",
+    "cannabis": "Cannabis",
+    "ketamine": "Ketamine",
+    "mxe": "MXE",
+    "dxm": "DXM",
+    "pcp": "PCP",
+    "nitrous": "Nitrous",
+    "amphetamines": "Amphetamines",
+    "mdma": "MDMA",
+    "cocaine": "Cocaine",
+    "caffeine": "Caffeine",
+    "alcohol": "Alcohol",
+    "ghb/gbl": "GHB/GBL",
+    "opioids": "Opioids",
+    "tramadol": "Tramadol",
+    "benzodiazepines": "Benzodiazepines",
+    "maois": "MAOIs",
+    "ssris": "SSRIs",
+}
+
+ts_response = requests.get(ts_api_url)
+ts_data = ts_response.json()["data"][0]
+
+ts_substances_data = list(ts_data.values())
+
+
+# TS has durations split over a few keys, so this finds or creates the duration for the associated ROA
+# and adds a new line item
+def ts_add_formatted_duration(ts_roas, formatted_duration, duration_name):
+    units = formatted_duration.get("_unit", "") or ""
+    if "_unit" in formatted_duration:
+        formatted_duration.pop("_unit")
+
+    def add_to_roa(roa, value):
+        if "duration" not in roa:
+            roa["duration"] = []
+
+        roa["duration"].append({"name": duration_name, "value": value})
+
+    for roa_name, value in formatted_duration.items():
+        value_string = f"{value} {units}".strip()
+
+        # if value present (i.e. just one value for all ROA doses provided above), apply to all ROAs
+        if roa_name == "value":
+            # if TS did not add any doses, do nothing with this value
+            # we could theoretically apply this to all PW doses with missing durations, but we can't be sure
+            # if it applies to all ROAs, so just ignore
+            if not len(ts_roas):
+                break
+
+            for ts_roa in ts_roas:
+                add_to_roa(ts_roa, value_string)
+
+        # add to matching ROA or create new ROA if doesn't exist
+        else:
+            ts_roa = next(
+                (ts_roa for ts_roa in ts_roas if roa_matches_name(ts_roa, roa_name)),
+                None,
+            )
+            # if ROA doesn't exist, make new
+            if not ts_roa:
+                ts_roa = {"name": roa_name}
+                ts_roas.append(ts_roa)
+
+            add_to_roa(ts_roa, value_string)
+
+
+# get psychonautwiki data
+
+
+def pw_clean_common_name(name):
+    name = re.sub(r'^"', "", name)
+    name = re.sub(r'"$', "", name)
+    name = re.sub(r'"?\[\d*\]$', "", name)
+    name = re.sub(r"\s*More names\.$", "", name)
+    name = re.sub(r"\.$", "", name)
+    return name.strip()
+
+
+def pw_should_skip(name, soup):
+    return (
+        name.startswith("Experience:") or len(soup.find_all(text="Common names")) == 0
+    )
+
+
+pw_substance_data = []
+
+if os.path.exists("ts_pn_data/_cached_pw_substances.json"):
+    with open("ts_pn_data/_cached_pw_substances.json") as f:
+        pw_substance_data = json.load(f)
+
+if not len(pw_substance_data):
+    offset = 0
+    pw_substance_urls_query = (
+        f"{{substances(limit: 250 offset: {offset}) {{name url}}}}"
+    )
+
+    pw_substance_urls_data = ps_client.execute(query=pw_substance_urls_query,)["data"][
+        "substances"
+    ]
+
+    offset = 252
+    while offset <= 340:
+        pw_substance_urls_query = (
+            f"{{substances(limit: 1 offset: {offset}) {{name url}}}}"
+        )
+        offset += 1
+        temp_data = ps_client.execute(query=pw_substance_urls_query,)["data"][
+            "substances"
+        ]
+        print(temp_data)
+        if temp_data is None:
+            continue
+        pw_substance_urls_data.extend(temp_data)
+
+    for idx, substance in enumerate(pw_substance_urls_data):
+        try:
+            url = substance["url"]
+            substance_req = requests.get(url, headers)
+            substance_soup = BeautifulSoup(substance_req.content, "html.parser")
+
+            name = substance_soup.find("h1", id="firstHeading").text
+            if pw_should_skip(name, substance_soup):
+                print(f"Skipping {name} ({idx + 1} / {len(pw_substance_urls_data)})")
+                continue
+
+            # get aliases text
+            common_names_str = substance_soup.find_all(text="Common names")
+
+            cleaned_common_names = (
+                set(
+                    map(
+                        pw_clean_common_name,
+                        common_names_str[0]
+                        .parent.find_next_sibling("td")
+                        .text.split(", "),
+                    )
+                )
+                if len(common_names_str) > 0
+                else set()
+            )
+            cleaned_common_names.add(substance["name"])
+            # don't include name in list of other common names
+            common_names = sorted(filter(lambda n: n != name, cleaned_common_names))
+
+            # scrape ROAs from page
+
+            def get_data_starting_at_row(curr_row):
+                rows = []
+                while curr_row.find("th", {"class": "ROARowHeader"}):
+                    row = {}
+                    row["name"] = (
+                        curr_row.find("th", {"class": "ROARowHeader"}).find("a").text
+                    )
+
+                    row_values = curr_row.find("td", {"class": "RowValues"})
+
+                    row_value_text = row_values.find_all(text=True, recursive=False)
+                    if len(row_value_text):
+                        row["value"] = "".join(row_value_text).strip()
+                    else:
+                        row["value"] = None
+
+                    row_note = row_values.find("span")
+                    if row_note:
+                        row["note"] = re.sub(r"\s*\[\d*\]$", "", row_note.text).strip()
+
+                    rows.append(row)
+
+                    curr_row = curr_row.find_next("tr")
+                return rows, curr_row
+
+            roas = []
+
+            dose_charts = substance_soup.find_all("tr", {"class": "dosechart"})
+            for dose_chart in dose_charts:
+                table = dose_chart.parent.parent
+                roa_name = table.find("tr").find("a").text
+                if not roa_name:
+                    continue
+
+                roa = {
+                    "name": roa_name,
+                    "dosage": [],
+                    "duration": [],
+                }
+
+                # dosage
+
+                curr_row = dose_chart.find_next("tr")
+                roa["dosage"], curr_row = get_data_starting_at_row(curr_row)
+
+                # extract bioavailability
+                if len(roa["dosage"]) and roa["dosage"][0]["name"] == "Bioavailability":
+                    bioavailability = roa["dosage"].pop(0)
+                    roa["bioavailability"] = bioavailability["value"]
+
+                # duration
+
+                if curr_row.find("th", {"class": "ROASubHeader"}):
+                    curr_row = curr_row.find_next("tr")
+                    roa["duration"], _ = get_data_starting_at_row(curr_row)
+
+                if not len(roa["dosage"]):
+                    roa["dosage"] = None
+                if not len(roa["duration"]):
+                    roa["duration"] = None
+
+                roas.append(roa)
+
+            # query PS API for more data on substance
+
+            query = (
+                """
+                {
+                    substances(query: "%s") {
+                        name
+                        class {
+                            chemical
+                            psychoactive
+                        }
+                        tolerance {
+                            full
+                            half
+                            zero
+                        }
+                        toxicity
+                        addictionPotential
+                        crossTolerances
+                    }
+                }
+            """
+                % substance["name"]
+            )
+
+            data = ps_client.execute(query=query)["data"]["substances"]
+            if len(data) == 0:
+                continue
+            elif len(data) > 1:
+                # should never happen?
+                print(f"{name} has more than one dataset... investigate why")
+
+            data = data[0]
+            if "name" in data:
+                data.pop("name")
+
+            pw_substance_data.append(
+                {
+                    "url": url,
+                    "name": name,
+                    "aliases": common_names,
+                    "roas": roas,
+                    "data": data,
+                }
+            )
+            print(
+                f"Done with {name} [{len(roas)} ROA(s)] ({idx + 1} / {len(pw_substance_urls_data)})"
+            )
+
+        except KeyboardInterrupt:
+            print("\nScrape canceled")
+            exit(0)
+        except:
+            print(f"{name} failed:")
+            print(traceback.format_exc())
+            exit(1)
+
+    with open(f"ts_pn_data/_cached_pw_substances.json", "w") as f:
+        f.write(json.dumps(pw_substance_data, indent=2, ensure_ascii=False))
+
+# combine tripsit and psychonautwiki data
+
+
+all_substance_names = sorted(
+    set(
+        list(map(lambda s: s.get("name", "").lower(), pw_substance_data))
+        + list(map(lambda s: s.get("name", "").lower(), ts_substances_data))
+    )
+)
+substance_data = []
+x = 0
+
+for name in all_substance_names:
+    # find PW substance
+    pw_substance = find_substance_in_data(pw_substance_data, name)
+    # remove to get rid of duplicates in final output
+    if pw_substance:
+        pw_substance_data.remove(pw_substance)
+    else:
+        pw_substance = {}
+
+    # find TS substance
+    ts_substance = find_substance_in_data(ts_substances_data, name)
+    # remove to get rid of duplicates in final output
+    if ts_substance:
+        ts_substances_data.remove(ts_substance)
+    else:
+        ts_substance = {}
+
+    # if no substance found in either dataset, skip
+    if not pw_substance and not ts_substance:
+        continue
+
+    ts_properties = ts_substance.get("properties", {})
+
+    # url will always exist for psychonautwiki substance, so tripsit substance must exist if url is None
+    url = pw_substance.get("url") or f"https://drugs.tripsit.me/{ts_substance['name']}"
+
+    ts_links = ts_substance.get("links", {})
+    experiences_url = ts_links.get("experiences")
+
+    # pick display name from available substances found from both datasets
+    names = list(
+        filter(
+            lambda n: n is not None and len(n) > 0,
+            [pw_substance.get("name"), ts_substance.get("pretty_name")],
+        )
+    )
+    # people use shorter names
+    name = min(names, key=len)
+
+    # lowercase list of all names, excluding chosen name above
+    aliases = set(
+        map(
+            lambda n: n.lower(),
+            filter(
+                lambda n: n is not None and len(n) > 0,
+                [pw_substance.get("name"), ts_substance.get("pretty_name")]
+                + pw_substance.get("aliases", [])
+                + ts_substance.get("aliases", []),
+            ),
+        )
+    )
+    if name.lower() in aliases:
+        aliases.remove(name.lower())
+    aliases = sorted(aliases)
+
+    summary = ts_properties.get("summary", "").strip()
+    if not len(summary):
+        summary = None
+
+    test_kits = ts_properties.get("test-kits", "").strip()
+    if not len(test_kits):
+        test_kits = None
+
+    ts_bioavailability_str = ts_properties.get("bioavailability", "").strip()
+    ts_bioavailability = {}
+    if len(ts_bioavailability_str):
+        matches = re.findall(
+            r"([a-zA-Z\/]+)[.:\s]+([0-9\.%\s\+/\-]+)", ts_bioavailability_str
+        )
+        if len(matches):
+            for roa_name, value in matches:
+                ts_bioavailability[roa_name.lower()] = value.strip(". \t")
+
+    pw_data = pw_substance.get("data", {})
+
+    classes = pw_data.get("class")
+    toxicity = pw_data.get("toxicity")
+    addiction_potential = pw_data.get("addictionPotential")
+    tolerance = pw_data.get("tolerance")
+    cross_tolerances = pw_data.get("crossTolerances")
+
+    roas = []
+
+    # get PW ROAs
+    pw_roas = pw_substance.get("roas", [])
+
+    # process TS ROAs
+    ts_roas = []
+
+    # TS ROA dosage
+    ts_formatted_dose = ts_substance.get("formatted_dose")
+    if ts_formatted_dose:
+        for roa_name, dose_data in ts_formatted_dose.items():
+            dose_levels = []
+            for dose_level in ts_dose_order:
+                value_string = dose_data.get(dose_level)
+                if value_string is None:
+                    continue
+
+                dose_levels.append(
+                    {"name": dose_level, "value": value_string,}
+                )
+
+            if len(dose_levels):
+                ts_roas.append({"name": roa_name, "dosage": dose_levels})
+
+    # TS ROA durations
+    ts_formatted_onset = ts_substance.get("formatted_onset")
+    if ts_formatted_onset:
+        ts_add_formatted_duration(ts_roas, ts_formatted_onset, "Onset")
+
+    ts_formatted_duration = ts_substance.get("formatted_duration")
+    if ts_formatted_duration:
+        ts_add_formatted_duration(ts_roas, ts_formatted_duration, "Duration")
+
+    ts_formatted_aftereffects = ts_substance.get("formatted_aftereffects")
+    if ts_formatted_aftereffects:
+        ts_add_formatted_duration(ts_roas, ts_formatted_aftereffects, "After effects")
+
+    # merge PW and TS ROAs
+    # prioritize PW for ROAs but use TS to fill in gaps
+
+    roas.extend(pw_roas)
+    for ts_roa in ts_roas:
+        existing_roa = next(
+            (roa for roa in roas if roa_matches_name(roa, ts_roa["name"])), None
+        )
+        # if ROA does not exist, add
+        if not existing_roa:
+            existing_roa = ts_roa
+            roas.append(existing_roa)
+            # we want bioavailability from below, so don't skip
+
+        # if ROA does not already have bioavailability, try to get from TS
+        if not existing_roa.get("bioavailability"):
+            name_lower = ts_roa["name"].lower()
+            name_aliases = roa_name_aliases.get(name_lower, [])
+
+            alias_found = next(
+                (name_alias in ts_bioavailability for name_alias in name_aliases), None
+            )
+            # TS has bioavailability if name or any name alias is found
+            if name_lower in ts_bioavailability or alias_found:
+                existing_roa["bioavailability"] = ts_bioavailability.get(
+                    name_lower
+                ) or ts_bioavailability.get(alias_found)
+
+        # if existing ROA is missing dosage and TS has dosage, add
+        if (not existing_roa.get("dosage") or not len(existing_roa["dosage"])) and (
+            "dosage" in ts_roa and ts_roa["dosage"] and len(ts_roa["dosage"])
+        ):
+            existing_roa["dosage"] = ts_roa["dosage"]
+
+        # if existing ROA is missing duration and TS has duration, add
+        if (not existing_roa.get("duration") or not len(existing_roa["duration"])) and (
+            "duration" in ts_roa and ts_roa["duration"] and len(ts_roa["duration"])
+        ):
+            existing_roa["duration"] = ts_roa["duration"]
+
+    interactions = None
+    combos = ts_substance.get("combos")
+    if combos:
+        interactions = []
+        for key, combo_data in combos.items():
+            if key in ts_combo_ignore:
+                continue
+
+            combo_data["name"] = ts_combo_transformations[key]
+            interactions.append(combo_data)
+        interactions = sorted(interactions, key=lambda i: i["name"])
+
+    substance_data.append(
+        {
+            "id": x,
+            "name": name,
+            "aliases": list(aliases),
+            "aliasesStr": ",".join(aliases),
+            "url": url,
+            "experiencesUrl": experiences_url,
+            "summary": summary,
+            "reagents": test_kits,
+            "classes": classes,
+            "toxicity": toxicity,
+            "addictionPotential": addiction_potential,
+            "tolerance": tolerance,
+            "crossTolerances": cross_tolerances,
+            "roas": roas,
+            "interactions": interactions,
+        }
+    )
+    x += 1
+
+# output
+
+
+substances_json = json.dumps(substance_data, indent=2, ensure_ascii=False)
+with open(f"ts_pn_data/substances_{time()}.json", "w") as f:
+    f.write(substances_json)
--- a/ts_pn_data/substances_1632949161.9573479.json
+++ b/ts_pn_data/substances_1632949161.9573479.json