
import os
import shutil
import random
import torch
import torchvision.transforms as transforms
import cv2
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torchvision.models.video as models
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
from PIL import Image

# ------------------------
# Datasets => Train, Test, Val 
# ------------------------

source_dir = "new_dataset"
target_dir = "data"


for split in ["train", "test", "val"]:
    os.makedirs(os.path.join(target_dir, split, "NonViolence"), exist_ok=True)
    os.makedirs(os.path.join(target_dir, split, "Violence"), exist_ok=True)


train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

for category in ["NonViolence", "Violence"]:
    category_path = os.path.join(source_dir, category)
    files = os.listdir(category_path)
    random.shuffle(files)

    train_count = int(len(files) * train_ratio)
    val_count = int(len(files) * val_ratio)

    train_files = files[:train_count]
    val_files = files[train_count:train_count + val_count]
    test_files = files[train_count + val_count:]

    for file_set, split in [(train_files, "train"), (val_files, "val"), (test_files, "test")]:
        for file in file_set:
            shutil.copy(os.path.join(category_path, file), os.path.join(target_dir, split, category, file))

total_train = len(os.listdir("data/train/Violence")) + len(os.listdir("data/train/NonViolence"))
total_test = len(os.listdir("data/test/Violence")) + len(os.listdir("data/test/NonViolence"))
total_val = len(os.listdir("data/val/Violence")) + len(os.listdir("data/val/NonViolence"))
print(f"Train: {total_train}")
print(f"Test: {total_test}")
print(f"Val: {total_val}")

class ViolenceDataset(Dataset):
    def __init__(self, dataset_folder, clip_length=16, transform=None):
        self.dataset_folder = dataset_folder
        self.clip_length = clip_length
        self.transform = transform if transform else transforms.Compose([
            transforms.Resize((112, 112)),
            transforms.ToTensor()
        ])

        self.video_paths = []
        self.labels = []

        for label, category in enumerate(os.listdir(dataset_folder)):
            folder_path = os.path.join(dataset_folder, category)
            if os.path.isdir(folder_path):
                for video_name in os.listdir(folder_path):
                    self.video_paths.append(os.path.join(folder_path, video_name))
                    self.labels.append(label)

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]

        frames = self.extract_frames(video_path)
        frames = torch.stack([self.transform(frame) for frame in frames])  # (frames, C, H, W)
        frames = frames.permute(1, 0, 2, 3)  # (C, frames, H, W)
        
        print(f"Dataset Output: {frames.shape}")  # (C, frames, 112, 112)

        return frames, torch.tensor(label, dtype=torch.long)

    def extract_frames(self, video_path):
        cap = cv2.VideoCapture(video_path)
        frames = []
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        selected_frames = np.linspace(0, frame_count - 1, self.clip_length, dtype=int)

        for i in range(frame_count):
            ret, frame = cap.read()
            if not ret:
                break
            if i in selected_frames:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, (112, 112))
                frames.append(frame)

        cap.release()
        return [transforms.ToPILImage()(frame) for frame in frames]

dataset_folder = "data"
batch_size = 8

train_dataset = ViolenceDataset(os.path.join(dataset_folder, "train"))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for clips, labels in train_loader:
    print(f"Loader Video Shape: {clips.shape}")  # (batch, 3, frames, 112, 112) 
    break

class ViolenceDetectionLSTM(nn.Module):
    def __init__(self, hidden_size=256, num_layers=2):
        super(ViolenceDetectionLSTM, self).__init__()
        self.cnn = models.r3d_18(pretrained=True)
        self.cnn.fc = nn.Identity()

        self.lstm = nn.LSTM(input_size=512, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        print("\n--- Forward Started ---")
        print("Input Shape:", x.shape)  # (batch, 16, 3, 112, 112)

        # (batch, frames, 3, 112, 112)
        x = x.permute(0, 2, 1, 3, 4)  # (batch, frames, C, H, W)
        print("Permute:", x.shape)  # (batch, frames, 3, 112, 112)

        
        cnn_features = []
        for t in range(x.shape[1]):  
            frame = x[:, t, :, :, :]  # (batch, 3, 112, 112)
            cnn_out = self.cnn(frame)  # (batch, 512)
            cnn_features.append(cnn_out.unsqueeze(1))  # (batch, 512) 
        #(batch, frames, 512)
        cnn_features = torch.cat(cnn_features, dim=1)
        print("LSTM, CNN:", cnn_features.shape)  # (batch, frames, 512)

        
        lstm_out, _ = self.lstm(cnn_features)
        lstm_out = lstm_out[:, -1, :] 
        output = self.fc(lstm_out)

        print("Model Output:", output.shape)  # (batch, 1)
        print("--- Forward Finished ---\n")

        return output
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ViolenceDetectionLSTM().to(device)

# ------------------------
# Training
# ------------------------
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

num_epochs = 10
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for clips, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        clips, labels = clips.to(device), labels.float().unsqueeze(1).to(device)

        # Debug: Input Check
        print(f"Input: {clips.shape}")  # (batch, frames, C, H, W)
        
        optimizer.zero_grad()
        outputs = model(clips)

        # Debug: Output Check
        print(f"Output: {outputs.shape}")  # (batch, 1)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {train_loss:.4f}")

    torch.save(model.state_dict(), "best_violence_model_lstm.pth")

print("Training complete! Best model saved.")

RuntimeError: Given groups=1, weight of size [64, 3, 3, 7, 7], expected input[1, 8, 3, 112, 112] to have 3 channels, but got 8 channels instead

I could not solve this problem for 2 days, even with ChatGPT and other tools. There is a problem with the channels, but I could not figure it out and I am confused.

asked Mar 4 at 8:51 by Can Gürcüoğlu

1 Answer

The issue is in how the clip is processed inside forward, not in the data itself. r3d_18 is a 3D CNN: it expects the whole clip at once, with shape (batch, channels, frames, height, width), which is exactly what your DataLoader already produces. By permuting to (batch, frames, channels, height, width) and slicing out one frame at a time, you feed 4D tensors of shape (batch, 3, 112, 112) into the Conv3d layers, which treat them as a single unbatched clip, so the batch dimension of 8 is read as the channel dimension. That is where "expected input to have 3 channels, but got 8 channels instead" comes from. Fix the forward method to pass the clip straight to the 3D CNN:

def forward(self, x):
    # x arrives as (batch, channels, frames, height, width), which is
    # exactly what r3d_18 expects, so no permute and no per-frame loop
    cnn_out = self.cnn(x)                  # (batch, 512) clip-level features

    # add a sequence dimension of length 1 for the LSTM
    cnn_features = cnn_out.unsqueeze(1)    # (batch, 1, 512)

    lstm_out, _ = self.lstm(cnn_features)  # (batch, 1, hidden_size)
    lstm_out = lstm_out[:, -1, :]          # (batch, hidden_size)
    output = self.fc(lstm_out)             # (batch, 1)

    return output
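
As a quick sanity check (a minimal sketch with random data, assuming the class above is redefined with this forward), a dummy clip should now flow through the model and come out as (batch, 1):

model = ViolenceDetectionLSTM()
dummy = torch.randn(2, 3, 16, 112, 112)  # (batch, channels, frames, H, W), matching the DataLoader output
with torch.no_grad():
    print(model(dummy).shape)  # expected: torch.Size([2, 1])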
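
One thing to note about this fix: r3d_18 summarizes the entire clip into a single 512-dimensional feature, so the LSTM only ever sees a sequence of length 1. If the goal was really a per-frame CNN followed by an LSTM over time, as the original forward loop suggests, a 2D backbone is the more natural fit. Below is a sketch of that variant, assuming torchvision's 2D resnet18 as the frame encoder (the class name FrameLSTM is hypothetical, not from the original post):

import torch
import torch.nn as nn
import torchvision.models as tv_models

class FrameLSTM(nn.Module):
    # Hypothetical alternative: 2D CNN features per frame, LSTM over the frame sequence.
    def __init__(self, hidden_size=256, num_layers=2):
        super().__init__()
        self.cnn = tv_models.resnet18(pretrained=True)  # 512-dim feature per frame
        self.cnn.fc = nn.Identity()
        self.lstm = nn.LSTM(input_size=512, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # x: (batch, channels, frames, H, W), as produced by ViolenceDataset
        b, c, t, h, w = x.shape
        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)  # fold frames into the batch
        feats = self.cnn(x).reshape(b, t, -1)                  # (batch, frames, 512)
        lstm_out, _ = self.lstm(feats)                         # (batch, frames, hidden_size)
        return self.fc(lstm_out[:, -1, :])                     # (batch, 1)

Either way, the rest of the training loop (BCEWithLogitsLoss on the (batch, 1) logits) works unchanged.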