Mirror of https://github.com/pyt0xic/pablo-bot.git, synced 2024-11-21 13:59:24 +01:00

Commit 1e6cbc386a (parent faefd94f2a): added first intent and lookup table (entities and synonyms)
README.md (13 lines changed)
@@ -6,10 +6,15 @@ Still a lot of work to be done, but for those wanting to test anyway, see below

## How to run

    $ rasa train
    $ rasa shell

Or use the --debug flag to help you understand what's going on:

    $ rasa shell --debug

## Credits

Thanks to [PsychonautWiki](https://psychonautwiki.org/wiki/Main_Page) and [TripSit](https://tripsit.me) for the data.

Special thanks to [NoahSaso](https://github.com/NoahSaso) for his [scraper](https://github.com/NoahSaso/merge-psychonautwiki-tripsit-data).
@@ -3,6 +3,8 @@ pipeline:

  - name: WhitespaceTokenizer
    token_pattern: (?u)\b\w+\b
  - name: RegexFeaturizer
    use_word_boundaries: false
    case_sensitive: false
  - name: LexicalSyntacticFeaturizer
  - name: CountVectorsFeaturizer
    # OOV_token: oov

@@ -11,7 +13,7 @@ pipeline:

    min_ngram: 1
    max_ngram: 4
  - name: DIETClassifier
    epochs: 100
    epochs: 10
    ranking_length: 10
  # - name: DucklingEntityExtractor
  #   url: http://localhost:8000
data/kb_query.yml (new file, 2130 lines; diff suppressed because it is too large)

data/lookups/substances.yml (new file, 3740 lines; diff suppressed because it is too large)
@@ -20,4 +20,9 @@ rules:

- rule: OOS
  steps:
  - intent: out_of_scope
  - action: utter_out_of_scope

- rule: what is drug
  steps:
  - intent: what_is_substance
  - action: utter_what_is_stubstace
@@ -23,16 +23,23 @@ intents:

- contact
- inform
- restart
- what_is_substance

entities:
- language
- location
- name
- substance

slots:
  name:
    type: text
    influence_conversation: true
  substance:
    type: text
    influence_conversation: true
    auto_fill: true

responses:
  utter_ask_name:

@@ -127,6 +134,8 @@ responses:

  - text: That depends on which you are using and, most importantly, how you are using them...
  utter_faq/drugs_legal:
  - text: Probably but it depends on where you are and what drugs
  utter_what_is_stubstace:
  - text: It is {substance}

  utter_out_of_scope/non_english:
  - text: No hablo english
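Because the substance slot is auto-filled from the extracted substance entity, utter_what_is_stubstace interpolates the slot value into its text. An illustrative exchange, assuming the entity is picked up correctly (the substance name is just an example):

    user: what is dmt
    bot:  It is dmt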
substances_data.json (new file, 0 lines)

ts_pn_data/generated_intents.yml (new file, 2128 lines; diff suppressed because it is too large)
@@ -1,3 +1,25 @@

"""MIT License

Copyright (c) 2021 Noah Saso

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE."""

#!/usr/bin/env python3

# downloads and exports data on all substances from psychonautwiki and tripsit factsheets, combining to form master list with standardized format
@@ -6,12 +28,12 @@

import requests
from bs4 import BeautifulSoup
from time import time, sleep
from python_graphql_client import GraphqlClient
import json
import os
import re
import traceback
from intentGen import intentGen

headers = {
    "Access-Control-Allow-Origin": "*",
@@ -552,7 +574,54 @@ for name in all_substance_names:

# output

substances_json = {}
substances_json["substances"] = substance_data
with open(f"ts_pn_data/substances_data.json", "w") as f:
    json.dump(substances_json, fp=f, ensure_ascii=False, indent=2)

substances_json = json.dumps(substance_data, indent=2, ensure_ascii=False)
with open(f"ts_pn_data/substances_{time()}.json", "w") as f:
    f.write(substances_json)

substance_aliases = {}

with open("data/lookups/substances.yml", "w") as fp:
    # Lookup Table
    fp.write("""version: "2.0"\nnlu:\n- lookup: substance\n  examples: |\n""")
    for drug in substances_json["substances"]:
        fp.write(f"    - {drug['name']}\n")
        # Add aliases to lookup table too
        for y in drug["aliases"]:
            # Check for "or" in aliases and remove
            if " or " in y:
                aliases = y.split(" or ")
                fp.write(f"    - {aliases[0]}\n")
                fp.write(f"    - {aliases[1]}\n")
            elif "or " in y:
                aliases = y.split("or ")
                fp.write(f"    - {aliases[1]}\n")
            else:
                fp.write(f"    - {y}\n")
    fp.write("\n")
    # Synonyms to map aliases to one entity
    for drug in substances_json["substances"]:
        # Skip adding synonym if there are no aliases
        substance_aliases[drug["name"]] = []
        if drug["aliases"] == []:
            continue
        fp.write(f"- synonym: {drug['name']}\n  examples: |\n")
        for y in drug["aliases"]:
            # Check for "or" in aliases and remove
            if " or " in y:
                aliases = y.split(" or ")
                fp.write(f"    - {aliases[0]}\n")
                fp.write(f"    - {aliases[1]}\n")
                substance_aliases[drug["name"]].append(aliases[0])
                substance_aliases[drug["name"]].append(aliases[1])
            elif "or " in y:
                aliases = y.split("or ")
                fp.write(f"    - {aliases[1]}\n")
                substance_aliases[drug["name"]].append(aliases[1])
            else:
                fp.write(f"    - {y}\n")
                substance_aliases[drug["name"]].append(y)

with open("ts_pn_data/generated_intents.yml", "w") as fp:
    fp.write(intentGen(substance_aliases).what_is())
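For reference, a minimal sketch of what this loop writes to data/lookups/substances.yml, assuming a substance named LSD with aliases acid and lucy, plus a second substance DMT with no aliases (names and aliases here are illustrative only):

    version: "2.0"
    nlu:
    - lookup: substance
      examples: |
        - LSD
        - acid
        - lucy
        - DMT
    - synonym: LSD
      examples: |
        - acid
        - lucy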
ts_pn_data/intentGen.py (new file, 38 lines)
@@ -0,0 +1,38 @@

from secrets import choice, randbelow


class intentGen:
    def __init__(self, substances):
        self.names = []
        for x in substances:
            self.names.append(x)
            if not substances[x] == []:
                for y in substances[x]:
                    self.names.append(y)

    def parse(self, intent_name, intent_list):
        intent_str = f"- intent: {intent_name}\n  examples: |\n"
        for x in intent_list:
            intent_str = "".join([intent_str, f"    - {x}\n"])
        return intent_str

    def what_is(self):
        what_is_intents = []
        for name in self.names:
            unlikely_chance = randbelow(10)
            templates = [
                f"what is [{name}](substance)?",
                f"what is [{name}](substance)",
                f"whats [{name}](substance)",
                f"what's [{name}](substance)?",
                f"what [{name}](substance)",
            ]
            what_is_intents.append(choice(templates))
            if unlikely_chance > 6:
                unlikely_templates = [
                    f"[{name}](substance)?",
                    f"[{name}](substance) is what?",
                    f"[{name}](substance) is?",
                ]
                what_is_intents.append(choice(unlikely_templates))
        return self.parse("what_is_substance", intent_list=what_is_intents)
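As a rough usage sketch, constructing intentGen with a small alias map such as {"LSD": ["acid"]} and calling what_is() yields NLU training examples along these lines; the phrasings are picked at random from the templates above, so the exact lines vary from run to run:

    - intent: what_is_substance
      examples: |
        - what is [LSD](substance)?
        - whats [acid](substance)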
ts_pn_data/substances_data.json (new file, 45374 lines; diff suppressed because it is too large)