Specimen5423 committed on
Commit
379b837
·
1 Parent(s): 37ef30d

Add the app

Browse files
Files changed (2) hide show
  1. app.py +63 -0
  2. functions.py +267 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from functions import CAT_ARTIST, CAT_CHARACTER, CAT_COPYRIGHT, CAT_GENERAL, CAT_LORE, CAT_META, CAT_SPECIES, PromptBuilder, parse_tag, parse_tags, related_tags
3
+
4
def query_tag(tag: str, category: int):
    """Look up the tags most correlated with ``tag`` for the Tag Explorer tab.

    Args:
        tag: Raw text from the "Tag" textbox; normalised by ``parse_tag``.
        category: Category id from the dropdown, or ``-1`` for "All".

    Returns:
        Whatever ``related_tags`` returns for the parsed tag.

    Raises:
        gr.Error: If the text is blank or does not resolve to a known tag.
    """
    parsed = parse_tag(tag)
    if parsed is None:
        # parse_tag returns None for blank or unknown input. Passing None on
        # would only fail later inside the tag lookup with an opaque error,
        # so report the problem to the user here instead.
        raise gr.Error(f"Unknown or empty tag: {tag!r}")
    if category == -1:
        # -1 is the dropdown's "All" value; related_tags expects None for that.
        category = None
    return related_tags(parsed, category=category)
8
+
9
def generate_prompt(include: str, focus: str, exclude: str, avoid: str, skip: str, rating: str, general: int, artist: int, species: int, copyright: int, character: int, meta: int) -> str:
    """Build one expanded prompt from the Prompt Expander form values.

    The comma-separated tag fields are parsed with ``parse_tags`` and applied
    to a ``PromptBuilder`` in the order include, focus, exclude, avoid. Then
    the requested number of tags is picked per category.

    Returns:
        The generated prompt, or the text of any exception raised on the way
        (this function is the UI boundary, so errors are shown, not raised).
    """
    # (category, requested count, size of the candidate pool), listed in the
    # order the picks are applied. General tags go last.
    pick_plan = (
        (CAT_ARTIST, artist, 10),
        (CAT_SPECIES, species, 10),
        (CAT_COPYRIGHT, copyright, 10),
        (CAT_CHARACTER, character, 10),
        (CAT_META, meta, 10),
        (CAT_GENERAL, general, 50),
    )
    try:
        builder = PromptBuilder(skip=list(parse_tags(skip)), min_posts=50, rating=rating)
        for wanted in parse_tags(include):
            builder = builder.include(wanted)
        for hinted in parse_tags(focus):
            builder = builder.focus(hinted)
        for unwanted in parse_tags(exclude):
            builder = builder.exclude(unwanted)
        for discouraged in parse_tags(avoid):
            builder = builder.avoid(discouraged)
        for tag_category, amount, pool_size in pick_plan:
            if amount > 0:
                builder = builder.pick(tag_category, amount, pool_size)
        return builder.get_one()
    except Exception as e:
        return str(e)
35
+
36
# Dropdown options as (label, value) pairs. -1 is the "All" value that
# query_tag translates to "no category filter".
category_choices = [
    ("All", -1),
    ("General", CAT_GENERAL),
    ("Artist", CAT_ARTIST),
    ("Copyright", CAT_COPYRIGHT),
    ("Character", CAT_CHARACTER),
    ("Species", CAT_SPECIES),
    ("Meta", CAT_META),
    ("Lore", CAT_LORE),
]
rating_choices = [("Safe", "s"), ("Questionable", "q"), ("Explicit", "e")]

with gr.Blocks() as demo:
    # Tab 1: show the tags most correlated with a single tag.
    with gr.Tab("Tag Explorer"):
        tag_box = gr.Textbox(label="Tag")
        category_box = gr.Dropdown(label="Category", choices=category_choices, value=-1)
        query_button = gr.Button("Query")
        related_table = gr.Dataframe()
        query_button.click(fn=query_tag, inputs=[tag_box, category_box], outputs=related_table)

    # Tab 2: expand a handful of seed tags into a full prompt.
    with gr.Tab("Prompt Expander"):
        include_box = gr.Textbox(label="Positive Prompt - Start with these tags and weight picks toward them")
        focus_box = gr.Textbox(label="Focus - Used for picks but not necessarily added to the prompt")
        exclude_box = gr.Textbox(label="Negative Prompt - Put these in the negative prompt and weight picks against them")
        avoid_box = gr.Textbox(label="Other Negatives - Weighted against picks but not put in the negatives")
        skip_box = gr.Textbox(label="Skip - Never pick these tags")
        rating_box = gr.Dropdown(label="Rating Limit (not 100% reliable)", choices=rating_choices, value="s")
        with gr.Accordion(label="Tag Counts"):
            with gr.Row():
                general_count = gr.Number(5, label="General", precision=0, minimum=0, maximum=20)
                artist_count = gr.Number(0, label="Artist", precision=0, minimum=0, maximum=5)
                species_count = gr.Number(0, label="Species", precision=0, minimum=0, maximum=5)
                copyright_count = gr.Number(0, label="Copyright", precision=0, minimum=0, maximum=5)
                character_count = gr.Number(0, label="Character", precision=0, minimum=0, maximum=5)
                meta_count = gr.Number(0, label="Meta", precision=0, minimum=0, maximum=5)

        generate_button = gr.Button("Generate")
        prompt_box = gr.Textbox(label="Prompt")
        # The input order must match generate_prompt's positional parameters.
        generate_button.click(
            fn=generate_prompt,
            inputs=[
                include_box, focus_box, exclude_box, avoid_box, skip_box, rating_box,
                general_count, artist_count, species_count, copyright_count, character_count, meta_count,
            ],
            outputs=prompt_box,
        )

demo.launch()
functions.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas
2
+ import numpy
3
+ import pandas.io.formats.style
4
+ import random
5
+ import functools
6
+ from typing import Callable, Literal
7
+
8
# Directory that holds the .feather data files read by get_feather().
DATA_FOLDER = "."

# Numeric tag category ids, as stored in the "category" column of the tag data.
CAT_GENERAL = 0
CAT_ARTIST = 1
CAT_UNUSED = 2
CAT_COPYRIGHT = 3
CAT_CHARACTER = 4
CAT_SPECIES = 5
CAT_INVALID = 6
CAT_META = 7
CAT_LORE = 8

# CSS colour used for a tag's name when a result table is styled, per category.
CATEGORY_COLORS: dict[int, str] = {
    CAT_GENERAL: "#b4c7d9",
    CAT_ARTIST: "#f2ac08",
    CAT_UNUSED: "#ff3d3d",
    CAT_COPYRIGHT: "#d0d",
    CAT_CHARACTER: "#0a0",
    CAT_SPECIES: "#ed5d1f",
    CAT_INVALID: "#ff3d3d",
    CAT_META: "#fff",
    CAT_LORE: "#282",
}
31
+
32
def get_feather(filename: str) -> pandas.DataFrame:
    """Read ``<DATA_FOLDER>/<filename>.feather`` and return it as a DataFrame.

    Args:
        filename: Base name of the file, without the ``.feather`` extension.
    """
    feather_path = f"{DATA_FOLDER}/{filename}.feather"
    return pandas.read_feather(feather_path)
34
+
35
# Module-level data tables, loaded once at import time.
tags = get_feather("tags")
posts_by_tag = get_feather("posts_by_tag").set_index("tag_id")
tags_by_post = get_feather("tags_by_post").set_index("post_id")
tag_ratings = get_feather("tag_ratings")
implications = get_feather("implications")

# Two views of the tag table: one keyed by tag name (it keeps the "tag_id"
# column) and one keyed by tag id. The by-name copy has to be taken before
# "tag_id" is turned into the index.
tags_by_name = tags.copy(deep=True).set_index("name")
tags = tags.set_index("tag_id")
43
+
44
@functools.cache
def get_related_tags(targets: tuple[str, ...], exclude: tuple[str, ...] = (), samples: int = 100_000) -> pandas.DataFrame:
    """Compute how strongly every tag co-occurs with a set of target tags.

    The posts containing *all* ``targets`` (minus any post containing one of
    ``exclude``) are collected. If there are more than ``samples`` of them, a
    random sample is used and the counts are scaled back up.

    Results are memoised per argument tuple, so arguments must be hashable and
    the returned frame must not be mutated by callers.
    NOTE: the cache is unbounded; every distinct query stays in memory.

    Args:
        targets: Tag names that must all be present on a post.
        exclude: Tag names that must not be present on a post.
        samples: Maximum number of matching posts to inspect.

    Returns:
        A frame indexed like ``tags`` with columns ``category``, ``name``,
        ``overlap`` (estimated number of matching posts carrying the tag,
        capped at ``post_count``), ``post_count`` and ``correlation`` (phi
        coefficient between the tag and the target set).

    Raises:
        ValueError: If no post matches, since the statistics are undefined
            (previously this surfaced as a ZeroDivisionError).
    """
    these_tags = tags_by_name.loc[list(targets)]
    # One set of post ids per target tag, folded into their intersection.
    posts_with_these_tags = posts_by_tag.loc[these_tags["tag_id"]].map(set).groupby(lambda x: True).agg(lambda x: set.intersection(*x))["post_id"][True]
    if exclude:
        excluded_tags = tags_by_name.loc[list(exclude)]
        # Union of the posts carrying any excluded tag, removed from the matches.
        posts_with_excluded_tags = posts_by_tag.loc[excluded_tags["tag_id"]].map(set).groupby(lambda x: True).agg(lambda x: set.union(*x))["post_id"][True]
        posts_with_these_tags = posts_with_these_tags - posts_with_excluded_tags
    total_post_count_together = len(posts_with_these_tags)
    if total_post_count_together == 0:
        raise ValueError(f"No posts match tags {targets!r} with {exclude!r} excluded.")
    sample_posts = random.sample(list(posts_with_these_tags), samples) if total_post_count_together > samples else list(posts_with_these_tags)
    post_count_together = len(sample_posts)
    # Fraction of the matching posts that was actually inspected.
    sample_ratio = post_count_together / total_post_count_together
    tags_in_these_posts = tags_by_post.loc[sample_posts]
    counts_in_these_posts = tags_in_these_posts["tag_id"].explode().value_counts().rename("overlap")
    # Right join keeps every tag that has posts; tags never seen get overlap 0.
    summaries = pandas.DataFrame(counts_in_these_posts).join(tags[tags["post_count"]>0], how="right").fillna(0)
    # Scale sampled counts up to the full match set, never above the tag's own total.
    summaries["overlap"] = numpy.minimum(summaries["overlap"] / sample_ratio, summaries["post_count"])
    summaries = summaries[["category", "name", "overlap", "post_count"]]
    # Old "interestingness" value, didn't give as good results as an actual statistical technique, go figure. Code kept for curiosity's sake.
    #summaries["interestingness"] = summaries["overlap"].pow(2) / (total_post_count_together * summaries["post_count"])
    # Phi coefficient stuff.
    n = float(len(tags_by_post))
    n11 = summaries["overlap"]
    n1x = float(total_post_count_together)
    nx1 = summaries["post_count"].astype("float64")
    summaries["correlation"] = (n * n11 - n1x * nx1) / numpy.sqrt(n1x * nx1 * (n - n1x) * (n - nx1))
    return summaries
70
+
71
def format_tags(styler: pandas.io.formats.style.Styler):
    """Apply the standard presentation to a tag table and return the styler.

    The ``name`` cell is coloured by the row's category, the index and the
    ``category`` column are hidden, ``overlap`` is shown as a whole number,
    and ``correlation`` / ``score`` (when present) are shown with two
    decimals on a red-to-green gradient spanning -1..1.
    """
    def color_name_cell(row):
        # Only the "name" cell gets a style; all other cells get "".
        return numpy.where(row.index == "name", "color:"+CATEGORY_COLORS[row["category"]], "")

    styler.apply(color_name_cell, axis=1)
    styler.hide(level=0)
    styler.hide("category",axis=1)

    table = styler.data
    if 'overlap' in table:
        styler.format("{:.0f}".format, subset=["overlap"])
    for signed_column in ("correlation", "score"):
        if signed_column in table:
            styler.format("{:.2f}".format, subset=[signed_column])
            styler.background_gradient(vmin=-1.0, vmax=1.0, cmap="RdYlGn", subset=[signed_column])
    return styler
84
+
85
def related_tags(*targets: str, exclude: tuple[str, ...] = (), category: int | None = None, samples: int = 100_000, min_overlap: int = 5, min_posts: int = 20, top: int = 30, bottom: int = 0) -> "pandas.io.formats.style.Styler":
    """Return a styled table of the tags most (and least) correlated with ``targets``.

    Args:
        *targets: Tag names that must all be present; they are removed from the result.
        exclude: Tag names whose posts are left out of the statistics.
        category: Keep only tags of this category id; ``None`` keeps all categories.
        samples: Forwarded to ``get_related_tags``.
        min_overlap: Minimum ``overlap`` a tag needs to be listed.
        min_posts: Minimum ``post_count`` a tag needs to be listed.
        top: Number of highest-correlation rows to show.
        bottom: Number of lowest-correlation rows to append after them.

    Returns:
        The filtered rows as a pandas Styler passed through ``format_tags``.
    """
    result = get_related_tags(targets, exclude=exclude, samples=samples)
    if category is not None:
        result = result[result["category"] == category]
    result = result[~result["name"].isin(targets)]
    result = result[result["overlap"] >= min_overlap]
    result = result[result["post_count"] >= min_posts]
    top_part = result.sort_values("correlation", ascending=False)[:top]
    # The bottom rows are selected ascending, then flipped so the whole table reads high-to-low.
    bottom_part = result.sort_values("correlation", ascending=True)[:bottom].sort_values("correlation", ascending=False)
    return pandas.concat([top_part, bottom_part]).style.pipe(format_tags)
95
+
96
def implications_for(*subjects: str, seen: set[str] = None):
    """Yield the names of all tags implied by ``subjects``, following chains.

    Args:
        *subjects: Tag names to start from.
        seen: Names already yielded; shared with the recursive calls so each
            implied tag is produced at most once. A new set is created when
            omitted.
    """
    if seen is None:
        seen = set()
    for subject in subjects:
        subject_id = tags_by_name.loc[subject, "tag_id"]
        consequent_ids = implications.loc[implications["antecedent_id"] == subject_id, "consequent_id"]
        implied_names = tags.loc[list(consequent_ids), "name"].values
        for implied in implied_names:
            if implied in seen:
                continue
            yield implied
            seen.add(implied)
            # Follow the chain: whatever the implied tag itself implies.
            yield from implications_for(implied, seen=seen)
108
+
109
def parse_tag(potential_tag: str):
    """Normalise user text to a known tag name, or return None.

    Surrounding whitespace is stripped, spaces become underscores and
    backslash-escaped parentheses are unescaped. If the result is not a known
    tag but starts with ``by_``, the remainder is tried as well. Unknown tags
    are reported on stdout and yield None, as does blank input.
    """
    candidate = potential_tag.strip().replace(" ", "_")
    candidate = candidate.replace("\\(", "(").replace("\\)", ")")
    if candidate == "":
        return None

    known_names = tags_by_name.index
    if candidate in known_names:
        return candidate

    # Prompt-style artist tags are written "by <artist>"; try without the prefix.
    without_prefix = candidate[3:]
    if candidate.startswith("by_") and without_prefix in known_names:
        return without_prefix

    print(f"Couldn't find tag '{candidate}', skipping it.")
    return None
119
+
120
def parse_tags(*parts: str):
    """Yield every recognised tag from one or more comma-separated strings.

    Each piece is resolved with ``parse_tag``; pieces it rejects are skipped.
    """
    for part in parts:
        resolved = (parse_tag(piece) for piece in part.split(","))
        yield from (tag for tag in resolved if tag is not None)
126
+
127
def add_suggestions(suggestions: pandas.DataFrame, new_tags: str | list[str], multiplier: int, samples : int, min_posts: int, rating: Literal['s', 'q', 'e']):
    """Fold the correlations of ``new_tags`` into a running suggestion table.

    Args:
        suggestions: Current table with a ``score`` column, or None to start one.
        new_tags: One tag name or a list of them.
        multiplier: Factor applied to each new correlation before combining
            (callers pass 1 to favour and -1 to disfavour a tag).
        samples: Forwarded to ``get_related_tags``.
        min_posts: Minimum (rating-limited) post count for a tag to be kept.
        rating: 's' counts only the ``s`` column of ``tag_ratings``, 'q'
            counts ``s`` plus ``q``, anything else keeps the full post count.

    Returns:
        The table reduced to ``category``, ``name``, ``post_count``, ``score``.
    """
    tag_batch = [new_tags] if isinstance(new_tags, str) else new_tags
    for new_tag in tag_batch:
        related = get_related_tags((new_tag,), samples=samples)
        # Implementing the rating filter this way is horribly inefficient, fix it later
        if rating in ('s', 'q'):
            related = related.join(tag_ratings.set_index("tag_id"), on="tag_id")
            if rating == 's':
                related["post_count"] = related["s"]
            else:
                related["post_count"] = related["s"] + related["q"]
            related = related.drop("s", axis=1).drop("q", axis=1).drop("e", axis=1)
        related = related[related["post_count"] >= min_posts]

        if suggestions is None:
            # First tag: its correlation becomes the initial score.
            suggestions = related.rename(columns={"correlation": "score"})
            continue

        suggestions = suggestions.join(related, rsuffix="r")
        # This is a totally made up way to combine correlations. It keeps them from going outside the +/- 1 range, which is nice. It also makes older
        # tags less important every time newer ones are added. That could be considered a feature or not.
        old_root = numpy.sqrt(suggestions["score"] + 0j)
        new_root = numpy.sqrt(multiplier * suggestions["correlation"] + 0j)
        suggestions["score"] = numpy.real(numpy.power((old_root + new_root) / 2, 2))
    return suggestions[["category", "name", "post_count", "score"]]
154
+
155
+
156
+
157
def pick_tags(suggestions: pandas.DataFrame, category: int, count: int, from_top: int, excluding: list[str], weighted: bool = True):
    """Randomly choose ``count`` distinct tag names from the best suggestions.

    Args:
        suggestions: Table with ``category``, ``name`` and ``score`` columns.
        category: Category id to restrict to, or None for any category.
        count: Number of names to return.
        from_top: Only the ``from_top`` highest-scoring candidates are eligible.
        excluding: Names that must not be chosen.
        weighted: If true, draw one at a time with probability proportional to
            score; otherwise draw a uniform sample.

    Returns:
        A list of ``count`` names. Only positively scored tags are candidates.
    """
    eligible = (suggestions["score"] > 0) & ~suggestions["name"].isin(excluding)
    if category is not None:
        eligible = (suggestions["category"] == category) & eligible
    pool = suggestions[eligible].sort_values("score", ascending=False)[:from_top]
    names = list(pool["name"].values)

    if not weighted:
        return random.sample(names, count)

    scores = list(pool["score"].values)
    picked = []
    for _ in range(count):
        winner = random.choices(population=names, weights=scores, k=1)[0]
        # Drop the winner and its weight so it cannot be drawn again.
        position = names.index(winner)
        scores.pop(position)
        names.pop(position)
        picked.append(winner)
    return picked
171
+
172
def tag_to_prompt(tag: str) -> str:
    """Convert a tag name to prompt text.

    Artist tags get a ``by `` prefix; underscores become spaces and
    parentheses are backslash-escaped.
    """
    is_artist = tags_by_name.loc[tag]["category"] == CAT_ARTIST
    text = "by " + tag if is_artist else tag
    return text.replace("_", " ").replace("(" , "\\(").replace(")" , "\\)")
176
+
177
+ # A lambda in a for loop doesn't capture variables the way I want it to, so this is a method now
178
+ def add_suggestions_later(suggestions: pandas.DataFrame, new_tags: str | list[str], multiplier: int, samples: int, min_posts: int, rating: Literal['s', 'q', 'e']):
179
+ return lambda: add_suggestions(suggestions, new_tags, multiplier, samples, min_posts, rating)
180
+
181
+
182
# One prompt in progress: (positive tags, negative tags, thunk producing the
# current suggestion table for that prompt).
Prompt = tuple[list[str], list[str], Callable[[], pandas.DataFrame]]


class PromptBuilder:
    """Fluent builder that grows one or more prompts from correlated tags.

    Every method returns a new ``PromptBuilder``; the receiver is left
    untouched. Each prompt carries its own lazily evaluated suggestion table,
    which is updated whenever a tag is added, focused, excluded or picked.
    """
    prompts: list[Prompt]
    samples: int
    min_posts: int
    rating: Literal['s', 'q', 'e']
    skip_list: list[str]

    def __init__(self, prompts = None, skip = None, samples = 100_000, min_posts = 20, rating: Literal['s', 'q', 'e'] = 'e'):
        """Create a builder.

        Args:
            prompts: Prompts to continue from. Defaults to a single empty
                prompt whose suggestion table is None.
            skip: Tag names that must never be picked. Defaults to none.
            samples: Sample size forwarded to the suggestion computation.
            min_posts: Minimum post count forwarded to the suggestion computation.
            rating: Rating limit forwarded to the suggestion computation.
        """
        # Fresh objects per instance: list defaults in the signature would be
        # shared by every builder created without arguments. An explicitly
        # passed empty list is kept as-is.
        self.prompts = [([], [], lambda: None)] if prompts is None else prompts
        self.samples = samples
        self.min_posts = min_posts
        self.rating = rating
        self.skip_list = [] if skip is None else skip

    def _derive(self, prompts: list[Prompt]) -> "PromptBuilder":
        """Return a builder with the same settings but different prompts."""
        return PromptBuilder(prompts=prompts, samples=self.samples, min_posts=self.min_posts, skip=self.skip_list, rating=self.rating)

    def _later(self, suggestions, new_tags, multiplier: int):
        """Return a thunk that folds ``new_tags`` into ``suggestions`` using this builder's settings."""
        return add_suggestions_later(suggestions, new_tags, multiplier, self.samples, self.min_posts, self.rating)

    def include(self, tag: str):
        """Add ``tag`` to every positive prompt and weight suggestions toward it."""
        return self._derive([
            (tag_list + [tag], negative_list, self._later(suggestions(), tag, 1))
            for (tag_list, negative_list, suggestions) in self.prompts
        ])

    def focus(self, tag: str):
        """Weight suggestions toward ``tag`` without adding it to the prompt."""
        return self._derive([
            (tag_list, negative_list, self._later(suggestions(), tag, 1))
            for (tag_list, negative_list, suggestions) in self.prompts
        ])

    def exclude(self, tag: str):
        """Add ``tag`` to every negative prompt and weight suggestions against it."""
        return self._derive([
            (tag_list, negative_list + [tag], self._later(suggestions(), tag, -1))
            for (tag_list, negative_list, suggestions) in self.prompts
        ])

    def avoid(self, tag: str):
        """Weight suggestions against ``tag`` without adding it to the negative prompt."""
        return self._derive([
            (tag_list, negative_list, self._later(suggestions(), tag, -1))
            for (tag_list, negative_list, suggestions) in self.prompts
        ])

    def pick(self, category: int, count: int, from_top: int):
        """Pick ``count`` tags of ``category`` one at a time for every prompt.

        The suggestion table is updated after each pick, so earlier picks
        influence later ones. Already used and skipped tags are not eligible.
        """
        new_prompts = self.prompts
        for _ in range(count):
            new_prompts = [
                (tag_list + [tag], negative_list, self._later(s, tag, 1))
                for (tag_list, negative_list, suggestions) in new_prompts
                # Evaluate the thunk once and reuse the table for picking and updating.
                for s in (suggestions(),)
                for tag in pick_tags(s, category, 1, from_top, tag_list + negative_list + self.skip_list)
            ]
        return self._derive(new_prompts)

    def foreach_pick(self, category: int, count: int, from_top: int):
        """Pick ``count`` tags per prompt and branch into one new prompt per picked tag."""
        return self._derive([
            (tag_list + [tag], negative_list, self._later(s, tag, 1))
            for (tag_list, negative_list, suggestions) in self.prompts
            for s in (suggestions(),)
            for tag in pick_tags(s, category, count, from_top, tag_list + negative_list + self.skip_list)
        ])

    def pick_fast(self, category: int, count: int, from_top: int):
        """Pick ``count`` tags per prompt in one draw, updating suggestions once afterwards."""
        prompts = []
        for (tag_list, negative_list, suggestions) in self.prompts:
            s = suggestions()
            new_tags = pick_tags(s, category, count, from_top, tag_list + negative_list + self.skip_list)
            prompts.append((tag_list + new_tags, negative_list, self._later(s, new_tags, 1)))
        return self._derive(prompts)

    def branch(self, count: int):
        """Duplicate every prompt ``count`` times so later picks can diverge."""
        return self._derive([prompt for prompt in self.prompts for _ in range(count)])

    def build(self):
        """Yield each prompt as text, adding a ``Negative prompt:`` line when it has negatives."""
        for (tag_list, negative_list, _) in self.prompts:
            positive_prompt = ", ".join([ tag_to_prompt(tag) for tag in tag_list])
            negative_prompt = ", ".join([ tag_to_prompt(tag) for tag in negative_list])
            if negative_prompt:
                yield f"{positive_prompt}\nNegative prompt: {negative_prompt}"
            else:
                yield positive_prompt

    def print(self):
        """Print every built prompt to stdout."""
        for prompt in self.build():
            print(prompt)

    def get_one(self):
        """Return the first built prompt, or None if there are no prompts."""
        for prompt in self.build():
            return prompt