Bui Trung
committed on
Upload 2 files
- app.py +193 -0
- phobert_fold1.pth +3 -0
app.py
ADDED
@@ -0,0 +1,193 @@
import jmespath
import asyncio
import json
from urllib.parse import urlencode
from typing import List, Dict
from httpx import AsyncClient, Response
from loguru import logger as log
import nest_asyncio
import torch
import torch.nn as nn
from transformers import AutoModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import gradio as gr

client = AsyncClient(
    # enable http2
    http2=True,
    headers={
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "content-type": "application/json"
    },
)

def parse_comments(response: Response) -> Dict:
    try:
        data = json.loads(response.text)
    except json.JSONDecodeError:
        log.error(f"Failed to parse JSON response: {response.text}")
        return {"comments": [], "total_comments": 0}

    comments_data = data.get("comments", [])
    total_comments = data.get("total", 0)

    if not comments_data:
        log.warning(f"No comments found in response: {response.text}")
        return {"comments": [], "total_comments": total_comments}

    parsed_comments = []
    for comment in comments_data:
        # keep only the comment text
        result = jmespath.search(
            """{
                text: text
            }""",
            comment
        )
        parsed_comments.append(result)
    return {"comments": parsed_comments, "total_comments": total_comments}

async def scrape_comments(post_id: int, comments_count: int = 20, max_comments: int = None) -> List[Dict]:

    def form_api_url(cursor: int):
        base_url = "https://www.tiktok.com/api/comment/list/?"
        params = {
            "aweme_id": post_id,
            "count": comments_count,
            "cursor": cursor  # the index to start from
        }
        return base_url + urlencode(params)

    log.info(f"Scraping comments from post ID: {post_id}")
    first_page = await client.get(form_api_url(0))
    data = parse_comments(first_page)
    comments_data = data["comments"]
    total_comments = data["total_comments"]

    if not comments_data:
        log.warning(f"No comments found for post ID {post_id}")
        return []
    if max_comments and max_comments < total_comments:
        total_comments = max_comments

    log.info(f"Scraping comments pagination, remaining {total_comments // comments_count - 1} more pages")
    _other_pages = [
        client.get(form_api_url(cursor=cursor))
        for cursor in range(comments_count, total_comments + comments_count, comments_count)
    ]

    for response in asyncio.as_completed(_other_pages):
        response = await response
        new_comments = parse_comments(response)["comments"]
        comments_data.extend(new_comments)

        # If we have reached or exceeded the maximum number of comments to scrape, stop the process
        if max_comments and len(comments_data) >= max_comments:
            comments_data = comments_data[:max_comments]
            break

    log.success(f"Scraped {len(comments_data)} comments from post ID {post_id}")
    return comments_data

class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        last_hidden_state, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False  # return a tuple; the dropout layer errors out without this
        )

        x = self.drop(output)
        x = self.fc(x)
        return x

def infer(text, tokenizer, max_len=120):
    encoded_review = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        add_special_tokens=True,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # no gradients needed at inference time
    with torch.no_grad():
        output = model(input_ids, attention_mask)
    _, y_pred = torch.max(output, dim=1)

    return class_names[y_pred]

async def predict_comments(video_id):
    comments = await scrape_comments(
        post_id=int(video_id),
        max_comments=2000,
        comments_count=20
    )
    predictions = []
    for comment in comments:
        text = comment['text']
        probs = infer(text, tokenizer)
        predictions.append({'comment': text, 'predictions': probs})

    # Compute the percentage of each label
    total_comments = len(predictions)
    label_counts = [0, 0, 0]  # Assuming there are 3 labels
    comment_off = []
    comment_hate = []
    for prediction in predictions:
        probs = prediction['predictions']
        if probs == 'CLEAN':
            label_counts[0] += 1
        elif probs == 'OFFENSIVE':
            label_counts[1] += 1
            comment_off.append(prediction['comment'])
        else:
            label_counts[2] += 1
            comment_hate.append(prediction['comment'])

    # guard against division by zero when no comments were scraped
    if total_comments == 0:
        label_percentages = [0, 0, 0]
    else:
        label_percentages = [count / total_comments * 100 for count in label_counts]
    results = {
        'total_comments': total_comments,
        'label_percentages': {
            'CLEAN': label_percentages[0],
            'OFFENSIVE': label_percentages[1],
            'HATE': label_percentages[2],
            'CMT OFFENSIVE': comment_off,
            'CMT HATE': comment_hate,
        }
    }

    return results

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model = SentimentClassifier(n_classes=3)
model.to(device)
# map_location keeps the checkpoint loadable on CPU-only machines
model.load_state_dict(torch.load('phobert_fold1.pth', map_location=device))
model.eval()  # disable dropout during inference

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

class_names = ['CLEAN', 'OFFENSIVE', 'HATE']


iface = gr.Interface(
    fn=predict_comments,
    inputs="text",
    outputs="json"
)

iface.launch()
phobert_fold1.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:275266c8ce8e44222f98cccd875b1e38c8fd677b89a90a147dcc3fdd4de55dfd
size 540085102