
import os
import shutil
import random
import torch
import torchvision.transforms as transforms
import cv2
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torchvision.models.video as models
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
from PIL import Image

# ------------------------
# Datasets => Train, Test, Val 
# ------------------------

source_dir = "new_dataset"
target_dir = "data"


for split in ["train", "test", "val"]:
    os.makedirs(os.path.join(target_dir, split, "NonViolence"), exist_ok=True)
    os.makedirs(os.path.join(target_dir, split, "Violence"), exist_ok=True)


train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

for category in ["NonViolence", "Violence"]:
    category_path = os.path.join(source_dir, category)
    files = os.listdir(category_path)
    random.shuffle(files)

    train_count = int(len(files) * train_ratio)
    val_count = int(len(files) * val_ratio)

    train_files = files[:train_count]
    val_files = files[train_count:train_count + val_count]
    test_files = files[train_count + val_count:]

    for file_set, split in [(train_files, "train"), (val_files, "val"), (test_files, "test")]:
        for file in file_set:
            shutil.copy(os.path.join(category_path, file), os.path.join(target_dir, split, category, file))

total_train = len(os.listdir("data/train/Violence")) + len(os.listdir("data/train/NonViolence"))
total_test = len(os.listdir("data/test/Violence")) + len(os.listdir("data/test/NonViolence"))
total_val = len(os.listdir("data/val/Violence")) + len(os.listdir("data/val/NonViolence"))
print(f"Train: {total_train}")
print(f"Test: {total_test}")
print(f"Val: {total_val}")

class ViolenceDataset(Dataset):
    def __init__(self, dataset_folder, clip_length=16, transform=None):
        self.dataset_folder = dataset_folder
        self.clip_length = clip_length
        self.transform = transform if transform else transforms.Compose([
            transforms.Resize((112, 112)),
            transforms.ToTensor()
        ])

        self.video_paths = []
        self.labels = []

        for label, category in enumerate(os.listdir(dataset_folder)):
            folder_path = os.path.join(dataset_folder, category)
            if os.path.isdir(folder_path):
                for video_name in os.listdir(folder_path):
                    self.video_paths.append(os.path.join(folder_path, video_name))
                    self.labels.append(label)

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]

        frames = self.extract_frames(video_path)
        frames = torch.stack([self.transform(frame) for frame in frames])  # (frames, C, H, W)
        frames = frames.permute(1, 0, 2, 3)  # (C, frames, H, W)
        
        print(f"Dataset Output: {frames.shape}")  # (C, frames, 112, 112)

        return frames, torch.tensor(label, dtype=torch.long)

    def extract_frames(self, video_path):
        cap = cv2.VideoCapture(video_path)
        frames = []
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        selected_frames = np.linspace(0, frame_count - 1, self.clip_length, dtype=int)

        for i in range(frame_count):
            ret, frame = cap.read()
            if not ret:
                break
            if i in selected_frames:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, (112, 112))
                frames.append(frame)

        cap.release()
        return [transforms.ToPILImage()(frame) for frame in frames]

dataset_folder = "data"
batch_size = 8

train_dataset = ViolenceDataset(os.path.join(dataset_folder, "train"))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for clips, labels in train_loader:
    print(f"Loader Video Shape: {clips.shape}")  # (batch, 3, frames, 112, 112) 
    break

class ViolenceDetectionLSTM(nn.Module):
    def __init__(self, hidden_size=256, num_layers=2):
        super(ViolenceDetectionLSTM, self).__init__()
        self.cnn = models.r3d_18(pretrained=True)
        self.cnn.fc = nn.Identity()

        self.lstm = nn.LSTM(input_size=512, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        print("\n--- Forward Started ---")
        print("Input Shape:", x.shape)  # (batch, 16, 3, 112, 112)

        # (batch, frames, 3, 112, 112)
        x = x.permute(0, 2, 1, 3, 4)  # (batch, frames, C, H, W)
        print("Permute:", x.shape)  # (batch, frames, 3, 112, 112)

        
        cnn_features = []
        for t in range(x.shape[1]):  
            frame = x[:, t, :, :, :]  # (batch, 3, 112, 112)
            cnn_out = self.cnn(frame)  # (batch, 512)
            cnn_features.append(cnn_out.unsqueeze(1))  # (batch, 512) 
        #(batch, frames, 512)
        cnn_features = torch.cat(cnn_features, dim=1)
        print("LSTM, CNN:", cnn_features.shape)  # (batch, frames, 512)

        
        lstm_out, _ = self.lstm(cnn_features)
        lstm_out = lstm_out[:, -1, :] 
        output = self.fc(lstm_out)

        print("Model Output:", output.shape)  # (batch, 1)
        print("--- Forward Finished ---\n")

        return output
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ViolenceDetectionLSTM().to(device)

# ------------------------
# Training
# ------------------------
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

num_epochs = 10
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for clips, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        clips, labels = clips.to(device), labels.float().unsqueeze(1).to(device)

        # Debug: Input Check
        print(f"Input: {clips.shape}")  # (batch, frames, C, H, W)
        
        optimizer.zero_grad()
        outputs = model(clips)

        # Debug: Output Check
        print(f"Output: {outputs.shape}")  # (batch, 1)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {train_loss:.4f}")

    torch.save(model.state_dict(), "best_violence_model_lstm.pth")

print("Training complete! Best model saved.")

RuntimeError: Given groups=1, weight of size [64, 3, 3, 7, 7], expected input[1, 8, 3, 112, 112] to have 3 channels, but got 8 channels instead

I could not solve this problem for 2 days, even with ChatGPT and other tools. There is a problem with the channels, but I could not figure it out and I am confused.

asked Mar 4 at 8:51 by Can Gürcüoğlu

1 Answer

The issue is in how the clip is processed inside forward, not in the data itself. r3d_18 is a 3D CNN: it expects the whole clip at once, with shape (batch, channels, frames, height, width), which is exactly what your DataLoader already produces. By permuting to (batch, frames, channels, height, width) and slicing out one frame at a time, you feed 4D tensors of shape (batch, 3, 112, 112) into the Conv3d layers, which treat them as a single unbatched clip, so the batch dimension of 8 is read as the channel dimension. That is where "expected input to have 3 channels, but got 8 channels instead" comes from. Fix the forward method to pass the clip straight to the 3D CNN:

def forward(self, x):
    # x arrives as (batch, channels, frames, height, width), which is
    # exactly what r3d_18 expects, so no permute and no per-frame loop
    cnn_out = self.cnn(x)                  # (batch, 512) clip-level features

    # add a sequence dimension of length 1 for the LSTM
    cnn_features = cnn_out.unsqueeze(1)    # (batch, 1, 512)

    lstm_out, _ = self.lstm(cnn_features)  # (batch, 1, hidden_size)
    lstm_out = lstm_out[:, -1, :]          # (batch, hidden_size)
    output = self.fc(lstm_out)             # (batch, 1)

    return output
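
As a quick sanity check (a minimal sketch with random data, assuming the class above is redefined with this forward), a dummy clip should now flow through the model and come out as (batch, 1):

model = ViolenceDetectionLSTM()
dummy = torch.randn(2, 3, 16, 112, 112)  # (batch, channels, frames, H, W), matching the DataLoader output
with torch.no_grad():
    print(model(dummy).shape)  # expected: torch.Size([2, 1])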
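
One thing to note about this fix: r3d_18 summarizes the entire clip into a single 512-dimensional feature, so the LSTM only ever sees a sequence of length 1. If the goal was really a per-frame CNN followed by an LSTM over time, as the original forward loop suggests, a 2D backbone is the more natural fit. Below is a sketch of that variant, assuming torchvision's 2D resnet18 as the frame encoder (the class name FrameLSTM is hypothetical, not from the original post):

import torch
import torch.nn as nn
import torchvision.models as tv_models

class FrameLSTM(nn.Module):
    # Hypothetical alternative: 2D CNN features per frame, LSTM over the frame sequence.
    def __init__(self, hidden_size=256, num_layers=2):
        super().__init__()
        self.cnn = tv_models.resnet18(pretrained=True)  # 512-dim feature per frame
        self.cnn.fc = nn.Identity()
        self.lstm = nn.LSTM(input_size=512, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # x: (batch, channels, frames, H, W), as produced by ViolenceDataset
        b, c, t, h, w = x.shape
        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)  # fold frames into the batch
        feats = self.cnn(x).reshape(b, t, -1)                  # (batch, frames, 512)
        lstm_out, _ = self.lstm(feats)                         # (batch, frames, hidden_size)
        return self.fc(lstm_out[:, -1, :])                     # (batch, 1)

Either way, the rest of the training loop (BCEWithLogitsLoss on the (batch, 1) logits) works unchanged.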