diff --git a/.github/actions/veristat_baseline_compare/action.yml b/.github/actions/veristat_baseline_compare/action.yml new file mode 100644 index 0000000000000..4a24166a97a9a --- /dev/null +++ b/.github/actions/veristat_baseline_compare/action.yml @@ -0,0 +1,49 @@ +name: 'run-veristat' +description: 'Run veristat benchmark' +inputs: + veristat_output: + description: 'Veristat output filepath' + required: true + baseline_name: + description: 'Veristat baseline cache name' + required: true +runs: + using: "composite" + steps: + - uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.baseline_name }} + if-no-files-found: error + path: ${{ github.workspace }}/${{ inputs.veristat_output }} + + # For pull request: + # - get baseline log from cache + # - compare it to current run + - if: ${{ github.event_name == 'pull_request' }} + uses: actions/cache/restore@v4 + with: + key: ${{ inputs.baseline_name }}-${{ github.base_ref }} + restore-keys: | + ${{ inputs.baseline_name }}- + path: '${{ github.workspace }}/${{ inputs.baseline_name }}' + + - if: ${{ github.event_name == 'pull_request' }} + name: Show veristat comparison + shell: bash + run: ./.github/scripts/compare-veristat-results.sh + env: + BASELINE_PATH: ${{ github.workspace }}/${{ inputs.baseline_name }} + VERISTAT_OUTPUT: ${{ inputs.veristat_output }} + + # For push: just put baseline log to cache + - if: ${{ github.event_name == 'push' }} + shell: bash + run: | + mv "${{ github.workspace }}/${{ inputs.veristat_output }}" \ + "${{ github.workspace }}/${{ inputs.baseline_name }}" + + - if: ${{ github.event_name == 'push' }} + uses: actions/cache/save@v4 + with: + key: ${{ inputs.baseline_name }}-${{ github.ref_name }}-${{ github.run_id }} + path: '${{ github.workspace }}/${{ inputs.baseline_name }}' diff --git a/.github/scripts/compare-veristat-results.sh b/.github/scripts/compare-veristat-results.sh new file mode 100755 index 0000000000000..f95c3c192d80d --- /dev/null +++ b/.github/scripts/compare-veristat-results.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +if [[ ! -f "${BASELINE_PATH}" ]]; then + echo "# No ${BASELINE_PATH} available" >> "${GITHUB_STEP_SUMMARY}" + + echo "No ${BASELINE_PATH} available" + echo "Printing veristat results" + cat "${VERISTAT_OUTPUT}" + + exit +fi + +selftests/bpf/veristat \ + --output-format csv \ + --emit file,prog,verdict,states \ + --compare "${BASELINE_PATH}" "${VERISTAT_OUTPUT}" > compare.csv + +python3 ./.github/scripts/veristat_compare.py compare.csv diff --git a/.github/scripts/download-gcc-bpf.sh b/.github/scripts/download-gcc-bpf.sh new file mode 100755 index 0000000000000..894584a01b2ec --- /dev/null +++ b/.github/scripts/download-gcc-bpf.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -euo pipefail + +GCC_BPF_RELEASE_GH_REPO=$1 +INSTALL_DIR=$(realpath $2) + +cd /tmp + +tag=$(gh release list -L 1 -R ${GCC_BPF_RELEASE_GH_REPO} --json tagName -q .[].tagName) +if [[ -z "$tag" ]]; then + echo "Could not find latest GCC BPF release at ${GCC_BPF_RELEASE_GH_REPO}" + exit 1 +fi + +url="https://github.com/${GCC_BPF_RELEASE_GH_REPO}/releases/download/${tag}/${tag}.tar.zst" +echo "Downloading $url" +wget -q "$url" + +tarball=${tag}.tar.zst +dir=$(tar tf $tarball | head -1 || true) + +echo "Extracting $tarball ..." +tar -I zstd -xf $tarball && rm -f $tarball + +rm -rf $INSTALL_DIR +mv -v $dir $INSTALL_DIR + +cd - + diff --git a/.github/scripts/matrix.py b/.github/scripts/matrix.py new file mode 100644 index 0000000000000..312f76259be9a --- /dev/null +++ b/.github/scripts/matrix.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 + +import dataclasses +import json +import os + +from enum import Enum +from typing import Any, Dict, Final, List, Optional, Set, Union + +import requests + +MANAGED_OWNER: Final[str] = "kernel-patches" +MANAGED_REPOS: Final[Set[str]] = { + f"{MANAGED_OWNER}/bpf", + f"{MANAGED_OWNER}/vmtest", +} + +DEFAULT_SELF_HOSTED_RUNNER_TAGS: Final[List[str]] = ["self-hosted", "docker-noble-main"] +DEFAULT_GITHUB_HOSTED_RUNNER: Final[str] = "ubuntu-24.04" +DEFAULT_GCC_VERSION: Final[int] = 14 +DEFAULT_LLVM_VERSION: Final[int] = 20 + +RUNNERS_BUSY_THRESHOLD: Final[float] = 0.8 + + +class Arch(str, Enum): + """ + CPU architecture supported by CI. + """ + + AARCH64 = "aarch64" + S390X = "s390x" + X86_64 = "x86_64" + + +class Compiler(str, Enum): + GCC = "gcc" + LLVM = "llvm" + + +def query_runners_from_github() -> List[Dict[str, Any]]: + if "GITHUB_TOKEN" not in os.environ: + return [] + token = os.environ["GITHUB_TOKEN"] + headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + } + owner = os.environ["GITHUB_REPOSITORY_OWNER"] + url: Optional[str] = f"https://api.github.com/orgs/{owner}/actions/runners" + # GitHub returns 30 runners per page, fetch all + all_runners = [] + try: + while url is not None: + response = requests.get(url, headers=headers) + if response.status_code != 200: + print(f"Failed to query runners: {response.status_code}") + print(f"response: {response.text}") + return [] + data = response.json() + all_runners.extend(data.get("runners", [])) + # Check for next page URL in Link header + url = None + if "Link" in response.headers: + links = requests.utils.parse_header_links(response.headers["Link"]) + for link in links: + if link["rel"] == "next": + url = link["url"] + break + return all_runners + except Exception as e: + print(f"Warning: Failed to query runner status due to exception: {e}") + return [] + + +all_runners_cached: Optional[List[Dict[str, Any]]] = None + + +def all_runners() -> List[Dict[str, Any]]: + global all_runners_cached + if all_runners_cached is None: + print("Querying runners from GitHub...") + all_runners_cached = query_runners_from_github() + print(f"Github returned {len(all_runners_cached)} runners") + counts = count_by_status(all_runners_cached) + print( + f"Busy: {counts['busy']}, Idle: {counts['idle']}, Offline: {counts['offline']}" + ) + return all_runners_cached + + +def runner_labels(runner: Dict[str, Any]) -> List[str]: + return [label["name"] for label in runner["labels"]] + + +def is_self_hosted_runner(runner: Dict[str, Any]) -> bool: + labels = runner_labels(runner) + for label in DEFAULT_SELF_HOSTED_RUNNER_TAGS: + if label not in labels: + return False + return True + + +def self_hosted_runners() -> List[Dict[str, Any]]: + runners = all_runners() + return [r for r in runners if is_self_hosted_runner(r)] + + +def runners_by_arch(arch: Arch) -> List[Dict[str, Any]]: + runners = self_hosted_runners() + return [r for r in runners if arch.value in runner_labels(r)] + + +def count_by_status(runners: List[Dict[str, Any]]) -> Dict[str, int]: + result = {"busy": 0, "idle": 0, "offline": 0} + for runner in runners: + if runner["status"] == "online": + if runner["busy"]: + result["busy"] += 1 + else: + result["idle"] += 1 + else: + result["offline"] += 1 + return result + + +@dataclasses.dataclass +class BuildConfig: + arch: Arch + kernel_compiler: Compiler = Compiler.GCC + gcc_version: int = DEFAULT_GCC_VERSION + llvm_version: int = DEFAULT_LLVM_VERSION + kernel: str = "LATEST" + run_veristat: bool = False + parallel_tests: bool = False + build_release: bool = False + + @property + def runs_on(self) -> List[str]: + if is_managed_repo(): + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [self.arch.value] + else: + return [DEFAULT_GITHUB_HOSTED_RUNNER] + + @property + def build_runs_on(self) -> List[str]: + if not is_managed_repo(): + return [DEFAULT_GITHUB_HOSTED_RUNNER] + + # @Temporary: disable codebuild runners for cross-compilation jobs + match self.arch: + case Arch.S390X: + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [Arch.X86_64.value] + case Arch.AARCH64: + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [Arch.AARCH64.value] + + # For managed repos, check the busyness of relevant self-hosted runners + # If they are too busy, use codebuild + runner_arch = self.arch + # We don't build s390x kernel on s390x runners, because it's too slow + # Cross-compiling on x86_64 is faster + if runner_arch == Arch.S390X: + runner_arch = Arch.X86_64 + runners = runners_by_arch(runner_arch) + counts = count_by_status(runners) + online = counts["idle"] + counts["busy"] + busy = counts["busy"] + # if online <= 0, then something is wrong, don't use codebuild + if online > 0 and busy / online > RUNNERS_BUSY_THRESHOLD: + return ["codebuild"] + else: + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [runner_arch.value] + + @property + def tests(self) -> Dict[str, Any]: + tests_list = [ + "test_progs", + "test_progs_parallel", + "test_progs_no_alu32", + "test_progs_no_alu32_parallel", + "test_verifier", + ] + + if self.arch.value != "s390x": + tests_list.append("test_maps") + + if self.llvm_version >= 18: + tests_list.append("test_progs_cpuv4") + + # if self.arch in [Arch.X86_64, Arch.AARCH64]: + # tests_list.append("sched_ext") + + # Don't run GCC BPF runner, because too many tests are failing + # See: https://lore.kernel.org/bpf/87bjw6qpje.fsf@oracle.com/ + # if self.arch == Arch.X86_64: + # tests_list.append("test_progs-bpf_gcc") + + if not self.parallel_tests: + tests_list = [test for test in tests_list if not test.endswith("parallel")] + + return {"include": [generate_test_config(test) for test in tests_list]} + + def to_dict(self) -> Dict[str, Any]: + return { + "arch": self.arch.value, + "kernel_compiler": self.kernel_compiler.value, + "gcc_version": DEFAULT_GCC_VERSION, + "llvm_version": DEFAULT_LLVM_VERSION, + "kernel": self.kernel, + "run_veristat": self.run_veristat, + "parallel_tests": self.parallel_tests, + "build_release": self.build_release, + "runs_on": self.runs_on, + "tests": self.tests, + "build_runs_on": self.build_runs_on, + } + + +def is_managed_repo() -> bool: + return ( + os.environ["GITHUB_REPOSITORY_OWNER"] == MANAGED_OWNER + and os.environ["GITHUB_REPOSITORY"] in MANAGED_REPOS + ) + + +def set_output(name, value): + """Write an output variable to the GitHub output file.""" + with open(os.getenv("GITHUB_OUTPUT"), "a", encoding="utf-8") as file: + file.write(f"{name}={value}\n") + + +def generate_test_config(test: str) -> Dict[str, Union[str, int]]: + """Create the configuration for the provided test.""" + is_parallel = test.endswith("_parallel") + config = { + "test": test, + "continue_on_error": is_parallel, + # While in experimental mode, parallel jobs may get stuck + # anywhere, including in user space where the kernel won't detect + # a problem and panic. We add a second layer of (smaller) timeouts + # here such that if we get stuck in a parallel run, we hit this + # timeout and fail without affecting the overall job success (as + # would be the case if we hit the job-wide timeout). For + # non-experimental jobs, 360 is the default which will be + # superseded by the overall workflow timeout (but we need to + # specify something). + "timeout_minutes": 30 if is_parallel else 360, + } + return config + + +if __name__ == "__main__": + matrix = [ + BuildConfig( + arch=Arch.X86_64, + run_veristat=True, + parallel_tests=True, + ), + BuildConfig( + arch=Arch.X86_64, + kernel_compiler=Compiler.LLVM, + build_release=True, + ), + BuildConfig( + arch=Arch.AARCH64, + ), + BuildConfig( + arch=Arch.S390X, + ), + ] + + # Outside of managed repositories only run on x86_64 + if not is_managed_repo(): + matrix = [config for config in matrix if config.arch == Arch.X86_64] + + json_matrix = json.dumps({"include": [config.to_dict() for config in matrix]}) + print(json.dumps(json.loads(json_matrix), indent=4)) + set_output("build_matrix", json_matrix) diff --git a/.github/scripts/tests/test_veristat_compare.py b/.github/scripts/tests/test_veristat_compare.py new file mode 100644 index 0000000000000..b65b69295235d --- /dev/null +++ b/.github/scripts/tests/test_veristat_compare.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +import unittest +from typing import Iterable, List + +from ..veristat_compare import parse_table, VeristatFields + + +def gen_csv_table(records: Iterable[str]) -> List[str]: + return [ + ",".join(VeristatFields.headers()), + *records, + ] + + +class TestVeristatCompare(unittest.TestCase): + def test_parse_table_ignore_new_prog(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,N/A,success,N/A,N/A,1,N/A", + ] + ) + veristat_info = parse_table(table) + self.assertEqual(veristat_info.table, []) + self.assertFalse(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + def test_parse_table_ignore_removed_prog(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,success,N/A,N/A,1,N/A,N/A", + ] + ) + veristat_info = parse_table(table) + self.assertEqual(veristat_info.table, []) + self.assertFalse(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + def test_parse_table_new_failure(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,success,failure,MISMATCH,1,1,+0 (+0.00%)", + ] + ) + veristat_info = parse_table(table) + self.assertEqual( + veristat_info.table, + [["prog_file.bpf.o", "prog_name", "success -> failure (!!)", "+0.00 %"]], + ) + self.assertTrue(veristat_info.changes) + self.assertTrue(veristat_info.new_failures) + + def test_parse_table_new_changes(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,failure,success,MISMATCH,0,0,+0 (+0.00%)", + "prog_file.bpf.o,prog_name_increase,failure,failure,MATCH,1,2,+1 (+100.00%)", + "prog_file.bpf.o,prog_name_decrease,success,success,MATCH,1,1,-1 (-100.00%)", + ] + ) + veristat_info = parse_table(table) + self.assertEqual( + veristat_info.table, + [ + ["prog_file.bpf.o", "prog_name", "failure -> success", "+0.00 %"], + ["prog_file.bpf.o", "prog_name_increase", "failure", "+100.00 %"], + ["prog_file.bpf.o", "prog_name_decrease", "success", "-100.00 %"], + ], + ) + self.assertTrue(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + +if __name__ == "__main__": + unittest.main() diff --git a/.github/scripts/tmpfsify-workspace.sh b/.github/scripts/tmpfsify-workspace.sh new file mode 100755 index 0000000000000..6fd62b4ad2a49 --- /dev/null +++ b/.github/scripts/tmpfsify-workspace.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -x -euo pipefail + +TMPFS_SIZE=20 # GB +MEM_TOTAL=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo) + +# sanity check: total mem is at least double TMPFS_SIZE +if [ $MEM_TOTAL -lt $(($TMPFS_SIZE*1024*2)) ]; then + echo "tmpfsify-workspace.sh: will not allocate tmpfs, total memory is too low (${MEM_TOTAL}MB)" + exit 0 +fi + +dir="$(basename "$GITHUB_WORKSPACE")" +cd "$(dirname "$GITHUB_WORKSPACE")" +mv "${dir}" "${dir}.backup" +mkdir "${dir}" +sudo mount -t tmpfs -o size=${TMPFS_SIZE}G tmpfs "${dir}" +rsync -a "${dir}.backup/" "${dir}" +cd - + diff --git a/.github/scripts/veristat_compare.py b/.github/scripts/veristat_compare.py new file mode 100644 index 0000000000000..07271b8cbd3aa --- /dev/null +++ b/.github/scripts/veristat_compare.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 + +# This script reads a CSV file produced by the following invocation: +# +# veristat --emit file,prog,verdict,states \ +# --output-format csv \ +# --compare ... +# +# And produces a markdown summary for the file. +# The summary is printed to standard output and appended to a file +# pointed to by GITHUB_STEP_SUMMARY variable. +# +# Script exits with return code 1 if there are new failures in the +# veristat results. +# +# For testing purposes invoke as follows: +# +# GITHUB_STEP_SUMMARY=/dev/null python3 veristat-compare.py test.csv +# +# File format (columns): +# 0. file_name +# 1. prog_name +# 2. verdict_base +# 3. verdict_comp +# 4. verdict_diff +# 5. total_states_base +# 6. total_states_comp +# 7. total_states_diff +# +# Records sample: +# file-a,a,success,failure,MISMATCH,12,12,+0 (+0.00%) +# file-b,b,success,success,MATCH,67,67,+0 (+0.00%) +# +# For better readability suffixes '_OLD' and '_NEW' +# are used instead of '_base' and '_comp' for variable +# names etc. + +import io +import os +import sys +import re +import csv +import logging +import argparse +import enum +from dataclasses import dataclass +from typing import Dict, Iterable, List, Final + + +TRESHOLD_PCT: Final[int] = 0 + +SUMMARY_HEADERS = ["File", "Program", "Verdict", "States Diff (%)"] + +# expected format: +0 (+0.00%) / -0 (-0.00%) +TOTAL_STATES_DIFF_REGEX = ( + r"(?P[+-]\d+) \((?P[+-]\d+\.\d+)\%\)" +) + + +TEXT_SUMMARY_TEMPLATE: Final[str] = ( + """ +# {title} + +{table} +""".strip() +) + +HTML_SUMMARY_TEMPLATE: Final[str] = ( + """ +# {title} + +
+Click to expand + +{table} +
+""".strip() +) + +GITHUB_MARKUP_REPLACEMENTS: Final[Dict[str, str]] = { + "->": "→", + "(!!)": ":bangbang:", +} + +NEW_FAILURE_SUFFIX: Final[str] = "(!!)" + + +class VeristatFields(str, enum.Enum): + FILE_NAME = "file_name" + PROG_NAME = "prog_name" + VERDICT_OLD = "verdict_base" + VERDICT_NEW = "verdict_comp" + VERDICT_DIFF = "verdict_diff" + TOTAL_STATES_OLD = "total_states_base" + TOTAL_STATES_NEW = "total_states_comp" + TOTAL_STATES_DIFF = "total_states_diff" + + @classmethod + def headers(cls) -> List[str]: + return [ + cls.FILE_NAME, + cls.PROG_NAME, + cls.VERDICT_OLD, + cls.VERDICT_NEW, + cls.VERDICT_DIFF, + cls.TOTAL_STATES_OLD, + cls.TOTAL_STATES_NEW, + cls.TOTAL_STATES_DIFF, + ] + + +@dataclass +class VeristatInfo: + table: list + changes: bool + new_failures: bool + + def get_results_title(self) -> str: + if self.new_failures: + return "There are new veristat failures" + + if self.changes: + return "There are changes in verification performance" + + return "No changes in verification performance" + + def get_results_summary(self, markup: bool = False) -> str: + title = self.get_results_title() + if not self.table: + return f"# {title}\n" + + template = TEXT_SUMMARY_TEMPLATE + table = format_table(headers=SUMMARY_HEADERS, rows=self.table) + + if markup: + template = HTML_SUMMARY_TEMPLATE + table = github_markup_decorate(table) + + return template.format(title=title, table=table) + + +def get_state_diff(value: str) -> float: + if value == "N/A": + return 0.0 + + matches = re.match(TOTAL_STATES_DIFF_REGEX, value) + if not matches: + raise ValueError(f"Failed to parse total states diff field value '{value}'") + + if percentage_diff := matches.group("percentage_diff"): + return float(percentage_diff) + + raise ValueError(f"Invalid {VeristatFields.TOTAL_STATES_DIFF} field value: {value}") + + +def parse_table(csv_file: Iterable[str]) -> VeristatInfo: + reader = csv.DictReader(csv_file) + assert reader.fieldnames == VeristatFields.headers() + + new_failures = False + changes = False + table = [] + + for record in reader: + add = False + + verdict_old, verdict_new = ( + record[VeristatFields.VERDICT_OLD], + record[VeristatFields.VERDICT_NEW], + ) + + # Ignore results from completely new and removed programs + if "N/A" in [verdict_new, verdict_old]: + continue + + if record[VeristatFields.VERDICT_DIFF] == "MISMATCH": + changes = True + add = True + verdict = f"{verdict_old} -> {verdict_new}" + if verdict_new == "failure": + new_failures = True + verdict += f" {NEW_FAILURE_SUFFIX}" + else: + verdict = record[VeristatFields.VERDICT_NEW] + + diff = get_state_diff(record[VeristatFields.TOTAL_STATES_DIFF]) + if abs(diff) > TRESHOLD_PCT: + changes = True + add = True + + if not add: + continue + + table.append( + [ + record[VeristatFields.FILE_NAME], + record[VeristatFields.PROG_NAME], + verdict, + f"{diff:+.2f} %", + ] + ) + + return VeristatInfo(table=table, changes=changes, new_failures=new_failures) + + +def github_markup_decorate(input_str: str) -> str: + for text, markup in GITHUB_MARKUP_REPLACEMENTS.items(): + input_str = input_str.replace(text, markup) + return input_str + + +def format_table(headers: List[str], rows: List[List[str]]) -> str: + column_width = [ + max(len(row[column_idx]) for row in [headers] + rows) + for column_idx in range(len(headers)) + ] + + # Row template string in the following format: + # "{0:8}|{1:10}|{2:15}|{3:7}|{4:10}" + row_template = "|".join( + f"{{{idx}:{width}}}" for idx, width in enumerate(column_width) + ) + row_template_nl = f"|{row_template}|\n" + + with io.StringIO() as out: + out.write(row_template_nl.format(*headers)) + + separator_row = ["-" * width for width in column_width] + out.write(row_template_nl.format(*separator_row)) + + for row in rows: + row_str = row_template_nl.format(*row) + out.write(row_str) + + return out.getvalue() + + +def main(compare_csv_filename: os.PathLike, output_filename: os.PathLike) -> None: + with open(compare_csv_filename, newline="", encoding="utf-8") as csv_file: + veristat_results = parse_table(csv_file) + + sys.stdout.write(veristat_results.get_results_summary()) + + with open(output_filename, encoding="utf-8", mode="a") as file: + file.write(veristat_results.get_results_summary(markup=True)) + + if veristat_results.new_failures: + return 1 + + return 0 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Print veristat comparison output as markdown step summary" + ) + parser.add_argument("filename") + args = parser.parse_args() + summary_filename = os.getenv("GITHUB_STEP_SUMMARY") + if not summary_filename: + logging.error("GITHUB_STEP_SUMMARY environment variable is not set") + sys.exit(1) + sys.exit(main(args.filename, summary_filename)) diff --git a/.github/workflows/gcc-bpf.yml b/.github/workflows/gcc-bpf.yml new file mode 100644 index 0000000000000..5f05234399d33 --- /dev/null +++ b/.github/workflows/gcc-bpf.yml @@ -0,0 +1,103 @@ +name: Testing GCC BPF compiler + +on: + workflow_call: + inputs: + runs_on: + required: true + type: string + arch: + required: true + type: string + gcc_version: + required: true + type: string + llvm_version: + required: true + type: string + toolchain: + required: true + type: string + toolchain_full: + required: true + type: string + download_sources: + required: true + type: boolean + +jobs: + test: + name: GCC BPF + runs-on: >- + ${{ + contains(fromJSON(inputs.runs_on), 'codebuild') + && format('codebuild-bpf-ci-{0}-{1}', github.run_id, github.run_attempt) + || fromJSON(inputs.runs_on) + }} + env: + ARCH: ${{ inputs.arch }} + BPF_NEXT_BASE_BRANCH: 'master' + GCC_BPF_INSTALL_DIR: ${{ github.workspace }}/gcc-bpf + GCC_BPF_RELEASE_REPO: 'theihor/gcc-bpf' + KBUILD_OUTPUT: ${{ github.workspace }}/src/kbuild-output + REPO_ROOT: ${{ github.workspace }}/src + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - if: ${{ inputs.download_sources }} + name: Download bpf-next tree + uses: libbpf/ci/get-linux-source@v3 + with: + dest: ${{ env.REPO_ROOT }} + rev: ${{ env.BPF_NEXT_BASE_BRANCH }} + + - if: ${{ ! inputs.download_sources }} + name: Checkout ${{ github.repository }} to ./src + uses: actions/checkout@v4 + with: + path: 'src' + + - uses: libbpf/ci/patch-kernel@v3 + with: + patches-root: '${{ github.workspace }}/ci/diffs' + repo-root: ${{ env.REPO_ROOT }} + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }} + path: ${{ env.REPO_ROOT }} + + - name: Untar artifacts + working-directory: ${{ env.REPO_ROOT }} + run: zstd -d -T0 vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst --stdout | tar -xf - + + - name: Setup build environment + uses: libbpf/ci/setup-build-env@v3 + with: + arch: ${{ inputs.arch }} + gcc-version: ${{ inputs.gcc_version }} + llvm-version: ${{ inputs.llvm_version }} + + - name: Download GCC BPF compiler + shell: bash + env: + GH_TOKEN: ${{ github.token }} + run: .github/scripts/download-gcc-bpf.sh ${{ env.GCC_BPF_RELEASE_REPO }} ${{ env.GCC_BPF_INSTALL_DIR }} + + - name: Build selftests/bpf/test_progs-bpf_gcc + uses: libbpf/ci/build-selftests@v3 + env: + BPF_GCC: ${{ env.GCC_BPF_INSTALL_DIR }} + MAX_MAKE_JOBS: 32 + SELFTESTS_BPF_TARGETS: 'test_progs-bpf_gcc' + with: + arch: ${{ inputs.arch }} + kernel-root: ${{ env.REPO_ROOT }} + llvm-version: ${{ inputs.llvm_version }} + toolchain: ${{ inputs.toolchain }} diff --git a/.github/workflows/kernel-build-test.yml b/.github/workflows/kernel-build-test.yml new file mode 100644 index 0000000000000..37690cef82a0c --- /dev/null +++ b/.github/workflows/kernel-build-test.yml @@ -0,0 +1,154 @@ +name: Reusable Build/Test/Veristat workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + build_runs_on: + required: true + type: string + description: The runners to run the builds on. This is a json string representing an array of labels. + gcc_version: + required: true + type: string + description: GCC version to install + llvm_version: + required: true + type: string + description: LLVM version to install + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + tests: + required: true + type: string + description: A serialized json array with the tests to be running, it must follow the json-matrix format, https://www.jitsejan.com/use-github-actions-with-json-file-as-matrix + run_veristat: + required: true + type: boolean + description: Whether or not to run the veristat job. + run_tests: + required: true + type: boolean + description: Whether or not to run the test job. + download_sources: + required: true + type: boolean + description: Whether to download the linux sources into the working directory. + default: false + build_release: + required: true + type: boolean + description: Build selftests with -O2 optimization in addition to non-optimized build. + default: false + secrets: + AWS_ROLE_ARN: + required: true + +jobs: + + # Build kernel and selftest + build: + uses: ./.github/workflows/kernel-build.yml + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.build_runs_on }} + gcc_version: ${{ inputs.gcc_version }} + llvm_version: ${{ inputs.llvm_version }} + kernel: ${{ inputs.kernel }} + download_sources: ${{ inputs.download_sources }} + + build-release: + if: ${{ inputs.build_release }} + uses: ./.github/workflows/kernel-build.yml + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.build_runs_on }} + gcc_version: ${{ inputs.gcc_version }} + llvm_version: ${{ inputs.llvm_version }} + kernel: ${{ inputs.kernel }} + download_sources: ${{ inputs.download_sources }} + release: true + + test: + if: ${{ inputs.run_tests }} + uses: ./.github/workflows/kernel-test.yml + # Setting name to test here to avoid lengthy autogenerated names due to matrix + # e.g build-and-test x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc + name: "test" + needs: [build] + strategy: + fail-fast: false + matrix: ${{ fromJSON(inputs.tests) }} + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + runs_on: ${{ inputs.runs_on }} + kernel: ${{ inputs.kernel }} + test: ${{ matrix.test }} + continue_on_error: ${{ toJSON(matrix.continue_on_error) }} + timeout_minutes: ${{ matrix.timeout_minutes }} + + veristat-kernel: + if: ${{ inputs.run_veristat }} + uses: ./.github/workflows/veristat-kernel.yml + needs: [build] + permissions: + id-token: write + contents: read + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + runs_on: ${{ inputs.runs_on }} + + veristat-meta: + # Check for vars.AWS_REGION is necessary to skip this job in case of a PR from a fork. + if: ${{ inputs.run_veristat && github.repository_owner == 'kernel-patches' && vars.AWS_REGION }} + uses: ./.github/workflows/veristat-meta.yml + needs: [build] + permissions: + id-token: write + contents: read + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + aws_region: ${{ vars.AWS_REGION }} + runs_on: ${{ inputs.runs_on }} + secrets: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + + gcc-bpf: + name: 'GCC BPF' + if: ${{ inputs.arch == 'x86_64' }} + uses: ./.github/workflows/gcc-bpf.yml + needs: [build] + with: + # GCC BPF does not need /dev/kvm, so use the "build" runners + runs_on: ${{ inputs.build_runs_on }} + arch: ${{ inputs.arch }} + gcc_version: ${{ inputs.gcc_version }} + llvm_version: ${{ inputs.llvm_version }} + toolchain: ${{ inputs.toolchain }} + toolchain_full: ${{ inputs.toolchain_full }} + download_sources: ${{ inputs.download_sources }} + diff --git a/.github/workflows/kernel-build.yml b/.github/workflows/kernel-build.yml new file mode 100644 index 0000000000000..6a5dba01dbf49 --- /dev/null +++ b/.github/workflows/kernel-build.yml @@ -0,0 +1,196 @@ + +name: Reusable build workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + gcc_version: + required: true + type: string + description: GCC version to install + llvm_version: + required: true + type: string + description: LLVM version to install + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + download_sources: + required: true + type: boolean + description: Whether to download the linux sources into the working directory. + default: false + release: + required: false + type: boolean + description: Build selftest with -O2 optimization + default: false + +jobs: + build: + name: build kernel and selftests ${{ inputs.release && '-O2' || '' }} + # To run on CodeBuild, runs-on value must correspond to the AWS + # CodeBuild project associated with the kernel-patches webhook + # However matrix.py passes just a 'codebuild' string + runs-on: >- + ${{ + contains(fromJSON(inputs.runs_on), 'codebuild') + && format('codebuild-bpf-ci-{0}-{1}', github.run_id, github.run_attempt) + || fromJSON(inputs.runs_on) + }} + env: + ARTIFACTS_ARCHIVE: "vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst" + BPF_NEXT_BASE_BRANCH: 'master' + BPF_NEXT_FETCH_DEPTH: 64 # A bit of history is needed to facilitate incremental builds + CROSS_COMPILE: ${{ inputs.arch != 'x86_64' && 'true' || '' }} + # BUILD_SCHED_EXT_SELFTESTS: ${{ inputs.arch == 'x86_64' || inputs.arch == 'aarch64' && 'true' || '' }} + KBUILD_OUTPUT: ${{ github.workspace }}/kbuild-output + KERNEL: ${{ inputs.kernel }} + KERNEL_ROOT: ${{ github.workspace }} + REPO_PATH: "" + REPO_ROOT: ${{ github.workspace }} + RUNNER_TYPE: ${{ contains(fromJSON(inputs.runs_on), 'codebuild') && 'codebuild' || 'default' }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: ${{ inputs.download_sources && 1 || env.BPF_NEXT_FETCH_DEPTH }} + + - if: ${{ env.RUNNER_TYPE == 'codebuild' }} + shell: bash + run: .github/scripts/tmpfsify-workspace.sh + + - if: ${{ inputs.download_sources }} + name: Download bpf-next tree + env: + FETCH_DEPTH: ${{ env.BPF_NEXT_FETCH_DEPTH }} + uses: libbpf/ci/get-linux-source@v3 + with: + dest: '.kernel' + rev: ${{ env.BPF_NEXT_BASE_BRANCH }} + - uses: libbpf/ci/prepare-incremental-build@v3 + with: + repo-root: ${{ inputs.download_sources && '.kernel' || env.REPO_ROOT }} + base-branch: >- + ${{ inputs.download_sources && env.BPF_NEXT_BASE_BRANCH + || github.event_name == 'pull_request' && github.base_ref + || github.ref_name + }} + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + - if: ${{ inputs.download_sources }} + name: Move linux source in place + shell: bash + run: | + cd .kernel + rm -rf .git + mv -t .. $(ls -A) + cd .. + rmdir .kernel + - uses: libbpf/ci/patch-kernel@v3 + with: + patches-root: '${{ github.workspace }}/ci/diffs' + repo-root: ${{ env.REPO_ROOT }} + + - name: Setup build environment + uses: libbpf/ci/setup-build-env@v3 + with: + arch: ${{ inputs.arch }} + gcc-version: ${{ inputs.gcc_version }} + llvm-version: ${{ inputs.llvm_version }} + pahole: master + + # We have to setup qemu+binfmt in order to enable cross-compation of selftests. + # During selftests build, freshly built bpftool is executed. + # On self-hosted bare-metal hosts binfmt is pre-configured. + - if: ${{ env.RUNNER_TYPE == 'codebuild' && env.CROSS_COMPILE }} + name: Set up docker + uses: docker/setup-docker-action@v4 + - if: ${{ env.RUNNER_TYPE == 'codebuild' && env.CROSS_COMPILE }} + name: Setup binfmt and qemu + uses: docker/setup-qemu-action@v3 + with: + image: tonistiigi/binfmt:qemu-v9.2.0 + + - name: Build kernel image + uses: libbpf/ci/build-linux@v3 + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm_version }} + + - name: Build selftests/bpf + uses: libbpf/ci/build-selftests@v3 + env: + MAX_MAKE_JOBS: 32 + RELEASE: ${{ inputs.release && '1' || '' }} + with: + arch: ${{ inputs.arch }} + kernel-root: ${{ env.KERNEL_ROOT }} + llvm-version: ${{ inputs.llvm_version }} + toolchain: ${{ inputs.toolchain }} + + - if: ${{ env.BUILD_SCHED_EXT_SELFTESTS }} + name: Build selftests/sched_ext + uses: libbpf/ci/build-scx-selftests@v3 + with: + kbuild-output: ${{ env.KBUILD_OUTPUT }} + repo-root: ${{ env.REPO_ROOT }} + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + llvm-version: ${{ inputs.llvm_version }} + max-make-jobs: 32 + + - if: ${{ github.event_name != 'push' }} + name: Build samples + uses: libbpf/ci/build-samples@v3 + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm_version }} + - name: Tar artifacts + id: tar-artifacts + uses: libbpf/ci/tar-artifacts@v3 + env: + ARCHIVE_BPF_SELFTESTS: 'true' + ARCHIVE_MAKE_HELPERS: 'true' + ARCHIVE_SCHED_EXT_SELFTESTS: ${{ env.BUILD_SCHED_EXT_SELFTESTS }} + with: + arch: ${{ inputs.arch }} + archive: ${{ env.ARTIFACTS_ARCHIVE }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + repo-root: ${{ env.REPO_ROOT }} + - if: ${{ github.event_name != 'push' }} + name: Remove KBUILD_OUTPUT content + shell: bash + run: | + # Remove $KBUILD_OUTPUT to prevent cache creation for pull requests. + # Only on pushed changes are build artifacts actually cached, because + # of github.com/actions/cache's cache isolation logic. + rm -rf "${KBUILD_OUTPUT}" + - uses: actions/upload-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}${{ inputs.release && '-release' || '' }} + if-no-files-found: error + path: ${{ env.ARTIFACTS_ARCHIVE }} diff --git a/.github/workflows/kernel-test.yml b/.github/workflows/kernel-test.yml new file mode 100644 index 0000000000000..2885f2759de4a --- /dev/null +++ b/.github/workflows/kernel-test.yml @@ -0,0 +1,96 @@ +name: Reusable test workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + test: + required: true + type: string + description: The test to run in the vm, e.g test_progs, test_maps, test_progs_no_alu32... + continue_on_error: + required: true + type: string + description: Whether to continue on error. This is typically set to true for parallel tests which are currently known to fail, but we don't want to fail the whole CI because of that. + timeout_minutes: + required: true + type: number + description: In case a test runs for too long, after how many seconds shall we timeout and error. + +jobs: + test: + name: ${{ inputs.test }} on ${{ inputs.arch }} with ${{ inputs.toolchain_full }} + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + env: + ARCH: ${{ inputs.arch }} + KERNEL: ${{ inputs.kernel }} + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + # https://github.com/actions/runner/issues/1483#issuecomment-1031671517 + # booleans are weird in GH. + CONTINUE_ON_ERROR: ${{ inputs.continue_on_error }} + DEPLOYMENT: ${{ github.repository == 'kernel-patches/bpf' && 'prod' || 'rc' }} + ALLOWLIST_FILE: /tmp/allowlist + DENYLIST_FILE: /tmp/denylist + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }} + path: . + + - name: Untar artifacts + # zstd is installed by default in the runner images. + run: zstd -d -T0 vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst --stdout | tar -xf - + + - name: Run selftests + uses: libbpf/ci/run-vmtest@v3 + # https://github.com/actions/runner/issues/1483#issuecomment-1031671517 + # booleans are weird in GH. + continue-on-error: ${{ fromJSON(env.CONTINUE_ON_ERROR) }} + timeout-minutes: ${{ inputs.timeout_minutes }} + env: + ARCH: ${{ inputs.arch }} + DEPLOYMENT: ${{ env.DEPLOYMENT }} + KERNEL_TEST: ${{ inputs.test }} + SELFTESTS_BPF: ${{ github.workspace }}/selftests/bpf + VMTEST_CONFIGS: ${{ github.workspace }}/ci/vmtest/configs + TEST_PROGS_TRAFFIC_MONITOR: ${{ inputs.arch == 'x86_64' && 'true' || '' }} + TEST_PROGS_WATCHDOG_TIMEOUT: 600 + with: + arch: ${{ inputs.arch }} + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: ${{ env.REPO_ROOT }} + max-cpu: 8 + kernel-test: ${{ inputs.test }} + # Here we must use kbuild-output local to the repo, because + # it was extracted from the artifacts. + kbuild-output: ${{ env.REPO_ROOT }}/kbuild-output + + - if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: tmon-logs-${{ inputs.arch }}-${{ inputs.toolchain_full }}-${{ inputs.test }} + if-no-files-found: ignore + path: /tmp/tmon_pcap/* diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000000..1c910fd297309 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,65 @@ +name: "lint" + +on: + pull_request: + push: + branches: + - master + +jobs: + shellcheck: + # This workflow gets injected into other Linux repositories, but we don't + # want it to run there. + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: ShellCheck + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run ShellCheck + uses: ludeeus/action-shellcheck@master + env: + SHELLCHECK_OPTS: --severity=warning --exclude=SC1091 + + # Ensure some consistency in the formatting. + lint: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Lint + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run black + uses: psf/black@stable + with: + src: ./.github/scripts + + validate_matrix: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Validate matrix.py + runs-on: ubuntu-latest + env: + GITHUB_REPOSITORY_OWNER: ${{ matrix.owner }} + GITHUB_REPOSITORY: ${{ matrix.repository }} + GITHUB_OUTPUT: /dev/stdout + strategy: + matrix: + owner: ['kernel-patches', 'foo'] + repository: ['bpf', 'vmtest', 'bar'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: run script + run: | + python3 .github/scripts/matrix.py + + unittests: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Unittests + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run unittests + run: python3 -m unittest scripts/tests/*.py + working-directory: .github diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000000000..24773459a252d --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,73 @@ +name: bpf-ci + +on: + pull_request: + push: + branches: + - bpf_base + - bpf-next_base + - bpf-net_base + - for-next_base + +concurrency: + group: ci-test-${{ github.ref_name }} + cancel-in-progress: true + +jobs: + set-matrix: + # FIXME: set-matrix is lightweight, run it on any self-hosted machines for kernel-patches org + # so we do not wait for GH hosted runners when there potentially all are busy because of bpf-rc + # repo for instance. + # This could be somehow fixed long term by making this action/workflow re-usable and letting the called + # specify what to run on. + runs-on: ${{ github.repository_owner == 'kernel-patches' && 'x86_64' || 'ubuntu-latest' }} + permissions: read-all + outputs: + build-matrix: ${{ steps.set-matrix-impl.outputs.build_matrix }} + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + - name: Install script dependencies + shell: bash + run: | + sudo apt-get -y update + sudo apt-get -y install python3-requests + - id: set-matrix-impl + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_READ_RUNNERS }} + run: | + python3 .github/scripts/matrix.py + + build-and-test: + # Setting name to arch-compiler here to avoid lengthy autogenerated names due to matrix + # e.g build-and-test x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc + name: ${{ matrix.arch }} ${{ matrix.kernel_compiler }}-${{ matrix.kernel_compiler == 'gcc' && matrix.gcc_version || matrix.llvm_version }} + uses: ./.github/workflows/kernel-build-test.yml + needs: [set-matrix] + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.set-matrix.outputs.build-matrix) }} + with: + arch: ${{ matrix.arch }} + toolchain: ${{ matrix.kernel_compiler }} + toolchain_full: ${{ matrix.kernel_compiler }}-${{ matrix.kernel_compiler == 'gcc' && matrix.gcc_version || matrix.llvm_version }} + runs_on: ${{ toJSON(matrix.runs_on) }} + build_runs_on: ${{ toJSON(matrix.build_runs_on) }} + gcc_version: ${{ matrix.gcc_version }} + llvm_version: ${{ matrix.llvm_version }} + kernel: ${{ matrix.kernel }} + tests: ${{ toJSON(matrix.tests) }} + run_veristat: ${{ matrix.run_veristat }} + # We only run tests on pull requests. + run_tests: ${{ github.event_name != 'push' }} + # Download sources + download_sources: ${{ github.repository == 'kernel-patches/vmtest' }} + build_release: ${{ matrix.build_release }} + secrets: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} diff --git a/.github/workflows/veristat-kernel.yml b/.github/workflows/veristat-kernel.yml new file mode 100644 index 0000000000000..99168008a2ac1 --- /dev/null +++ b/.github/workflows/veristat-kernel.yml @@ -0,0 +1,65 @@ +name: veristat_kernel + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: Toolchain identifier, such as llvm-20 + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + +jobs: + veristat: + name: veristat-kernel + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + permissions: + id-token: write + contents: read + env: + KERNEL: LATEST + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + ARCH_AND_TOOL: ${{ inputs.arch }}-${{ inputs.toolchain_full }} + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ env.ARCH_AND_TOOL }} + path: . + + - name: Untar artifacts + run: zstd -d -T0 vmlinux-${{ env.ARCH_AND_TOOL }}.tar.zst --stdout | tar -xf - + + - name: Run veristat + uses: libbpf/ci/run-vmtest@v3 + with: + arch: x86_64 + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' + max-cpu: 8 + kernel-test: 'run_veristat_kernel' + output-dir: '${{ github.workspace }}' + + - name: Compare and save veristat.kernel.csv + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-kernel + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-kernel + diff --git a/.github/workflows/veristat-meta.yml b/.github/workflows/veristat-meta.yml new file mode 100644 index 0000000000000..0b3ebe8f5c557 --- /dev/null +++ b/.github/workflows/veristat-meta.yml @@ -0,0 +1,87 @@ +name: veristat_meta + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: Toolchain identifier, such as llvm-20 + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + aws_region: + required: true + type: string + description: The AWS region where we pull bpf objects to run against veristat. + secrets: + AWS_ROLE_ARN: + required: true + description: The AWS role used by GH to pull BPF objects from AWS. + +jobs: + veristat: + name: veristat-meta + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + permissions: + id-token: write + contents: read + env: + KERNEL: LATEST + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + ARCH_AND_TOOL: ${{ inputs.arch }}-${{ inputs.toolchain_full }} + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ env.ARCH_AND_TOOL }} + path: . + + - name: Untar artifacts + run: zstd -d -T0 vmlinux-${{ env.ARCH_AND_TOOL }}.tar.zst --stdout | tar -xf - + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + aws-region: ${{ inputs.aws_region }} + role-to-assume: ${{ secrets.AWS_ROLE_ARN }} + role-session-name: github-action-bpf-ci + + - name: Download BPF objects + run: | + mkdir ./bpf_objects + aws s3 sync s3://veristat-bpf-binaries ./bpf_objects + env: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + + - name: Run veristat + uses: libbpf/ci/run-vmtest@v3 + with: + arch: x86_64 + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' + max-cpu: 8 + kernel-test: 'run_veristat_meta' + output-dir: '${{ github.workspace }}' + + - name: Compare and save veristat.meta.csv + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-meta + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-meta + diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst index ce6f4e8ca4876..30858757c9f20 100644 --- a/Documentation/admin-guide/bug-hunting.rst +++ b/Documentation/admin-guide/bug-hunting.rst @@ -196,7 +196,7 @@ will see the assembler code for the routine shown, but if your kernel has debug symbols the C code will also be available. (Debug symbols can be enabled in the kernel hacking menu of the menu configuration.) For example:: - $ objdump -r -S -l --disassemble net/dccp/ipv4.o + $ objdump -r -S -l --disassemble net/ipv4/tcp.o .. note:: diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index 45819b2358002..61e51ea58a985 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -197,7 +197,7 @@ properties: description: Link speed. $ref: /schemas/types.yaml#/definitions/uint32 - enum: [10, 100, 1000, 2500, 10000] + enum: [10, 100, 1000, 2500, 5000, 10000] full-duplex: $ref: /schemas/types.yaml#/definitions/flag diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml b/Documentation/devicetree/bindings/net/ethernet-phy.yaml index 824bbe4333b7e..71e2cd32580f2 100644 --- a/Documentation/devicetree/bindings/net/ethernet-phy.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml @@ -238,6 +238,16 @@ properties: peak-to-peak specified in ANSI X3.263. When omitted, the PHYs default will be left as is. + mac-termination-ohms: + maximum: 200 + description: + The xMII signals need series termination on the driver side to match both + the output driver impedance and the line characteristic impedance, to + prevent reflections and EMI problems. Select a resistance value which is + supported by the builtin resistors of the PHY, otherwise the resistors may + have to be placed on board. When omitted, the PHYs default will be left as + is. + leds: type: object diff --git a/Documentation/devicetree/bindings/net/renesas,r9a09g057-gbeth.yaml b/Documentation/devicetree/bindings/net/renesas,r9a09g057-gbeth.yaml new file mode 100644 index 0000000000000..02a6793c26f58 --- /dev/null +++ b/Documentation/devicetree/bindings/net/renesas,r9a09g057-gbeth.yaml @@ -0,0 +1,201 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/renesas,r9a09g057-gbeth.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: GBETH glue layer for Renesas RZ/V2H(P) (and similar SoCs) + +maintainers: + - Lad Prabhakar + +select: + properties: + compatible: + contains: + enum: + - renesas,r9a09g057-gbeth + - renesas,rzv2h-gbeth + required: + - compatible + +properties: + compatible: + items: + - enum: + - renesas,r9a09g057-gbeth # RZ/V2H(P) + - const: renesas,rzv2h-gbeth + - const: snps,dwmac-5.20 + + reg: + maxItems: 1 + + clocks: + items: + - description: CSR clock + - description: AXI system clock + - description: PTP clock + - description: TX clock + - description: RX clock + - description: TX clock phase-shifted by 180 degrees + - description: RX clock phase-shifted by 180 degrees + + clock-names: + items: + - const: stmmaceth + - const: pclk + - const: ptp_ref + - const: tx + - const: rx + - const: tx-180 + - const: rx-180 + + interrupts: + minItems: 11 + + interrupt-names: + items: + - const: macirq + - const: eth_wake_irq + - const: eth_lpi + - const: rx-queue-0 + - const: rx-queue-1 + - const: rx-queue-2 + - const: rx-queue-3 + - const: tx-queue-0 + - const: tx-queue-1 + - const: tx-queue-2 + - const: tx-queue-3 + + resets: + items: + - description: AXI power-on system reset + +required: + - compatible + - reg + - clocks + - clock-names + - interrupts + - interrupt-names + - resets + +allOf: + - $ref: snps,dwmac.yaml# + +unevaluatedProperties: false + +examples: + - | + #include + #include + + ethernet@15c30000 { + compatible = "renesas,r9a09g057-gbeth", "renesas,rzv2h-gbeth", "snps,dwmac-5.20"; + reg = <0x15c30000 0x10000>; + clocks = <&cpg CPG_MOD 0xbd>, <&cpg CPG_MOD 0xbc>, + <&ptp_clock>, <&cpg CPG_MOD 0xb8>, + <&cpg CPG_MOD 0xb9>, <&cpg CPG_MOD 0xba>, + <&cpg CPG_MOD 0xbb>; + clock-names = "stmmaceth", "pclk", "ptp_ref", + "tx", "rx", "tx-180", "rx-180"; + resets = <&cpg 0xb0>; + interrupts = , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "macirq", "eth_wake_irq", "eth_lpi", + "rx-queue-0", "rx-queue-1", "rx-queue-2", + "rx-queue-3", "tx-queue-0", "tx-queue-1", + "tx-queue-2", "tx-queue-3"; + phy-mode = "rgmii-id"; + snps,multicast-filter-bins = <256>; + snps,perfect-filter-entries = <128>; + rx-fifo-depth = <8192>; + tx-fifo-depth = <8192>; + snps,fixed-burst; + snps,force_thresh_dma_mode; + snps,axi-config = <&stmmac_axi_setup>; + snps,mtl-rx-config = <&mtl_rx_setup>; + snps,mtl-tx-config = <&mtl_tx_setup>; + snps,txpbl = <32>; + snps,rxpbl = <32>; + phy-handle = <&phy0>; + + stmmac_axi_setup: stmmac-axi-config { + snps,lpi_en; + snps,wr_osr_lmt = <0xf>; + snps,rd_osr_lmt = <0xf>; + snps,blen = <16 8 4 0 0 0 0>; + }; + + mtl_rx_setup: rx-queues-config { + snps,rx-queues-to-use = <4>; + snps,rx-sched-sp; + + queue0 { + snps,dcb-algorithm; + snps,priority = <0x1>; + snps,map-to-dma-channel = <0>; + }; + + queue1 { + snps,dcb-algorithm; + snps,priority = <0x2>; + snps,map-to-dma-channel = <1>; + }; + + queue2 { + snps,dcb-algorithm; + snps,priority = <0x4>; + snps,map-to-dma-channel = <2>; + }; + + queue3 { + snps,dcb-algorithm; + snps,priority = <0x8>; + snps,map-to-dma-channel = <3>; + }; + }; + + mtl_tx_setup: tx-queues-config { + snps,tx-queues-to-use = <4>; + + queue0 { + snps,dcb-algorithm; + snps,priority = <0x1>; + }; + + queue1 { + snps,dcb-algorithm; + snps,priority = <0x2>; + }; + + queue2 { + snps,dcb-algorithm; + snps,priority = <0x4>; + }; + + queue3 { + snps,dcb-algorithm; + snps,priority = <0x1>; + }; + }; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + compatible = "snps,dwmac-mdio"; + + phy0: ethernet-phy@0 { + reg = <0>; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index 78b3030dc56d2..b525eca538506 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -75,6 +75,7 @@ properties: - qcom,sm8150-ethqos - renesas,r9a06g032-gmac - renesas,rzn1-gmac + - renesas,rzv2h-gbeth - rockchip,px30-gmac - rockchip,rk3128-gmac - rockchip,rk3228-gmac @@ -114,19 +115,25 @@ properties: interrupts: minItems: 1 - items: - - description: Combined signal for various interrupt events - - description: The interrupt to manage the remote wake-up packet detection - - description: The interrupt that occurs when Rx exits the LPI state - - description: The interrupt that occurs when HW safety error triggered + maxItems: 11 interrupt-names: minItems: 1 + maxItems: 11 items: - - const: macirq - - enum: [eth_wake_irq, eth_lpi, sfty] - - enum: [eth_wake_irq, eth_lpi, sfty] - - enum: [eth_wake_irq, eth_lpi, sfty] + oneOf: + - description: Combined signal for various interrupt events + const: macirq + - description: The interrupt to manage the remote wake-up packet detection + const: eth_wake_irq + - description: The interrupt that occurs when Rx exits the LPI state + const: eth_lpi + - description: The interrupt that occurs when HW safety error triggered + const: sfty + - description: Per channel receive completion interrupt + pattern: '^rx-queue-[0-3]$' + - description: Per channel transmit completion interrupt + pattern: '^tx-queue-[0-3]$' clocks: minItems: 1 diff --git a/Documentation/devicetree/bindings/net/ti,dp83822.yaml b/Documentation/devicetree/bindings/net/ti,dp83822.yaml index 50c24248df266..28a0bddb9af94 100644 --- a/Documentation/devicetree/bindings/net/ti,dp83822.yaml +++ b/Documentation/devicetree/bindings/net/ti,dp83822.yaml @@ -122,6 +122,9 @@ properties: - free-running - recovered + mac-termination-ohms: + enum: [43, 44, 46, 48, 50, 53, 55, 58, 61, 65, 69, 73, 78, 84, 91, 99] + required: - reg @@ -137,6 +140,7 @@ examples: rx-internal-delay-ps = <1>; tx-internal-delay-ps = <1>; ti,gpio2-clk-out = "xi"; + mac-termination-ohms = <43>; }; }; diff --git a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml index b11894fbaec47..7b3d948f187df 100644 --- a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml +++ b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml @@ -143,6 +143,8 @@ properties: label: description: label associated with this port + fixed-link: true + ti,mac-only: $ref: /schemas/types.yaml#/definitions/flag description: diff --git a/Documentation/netlink/specs/ovpn.yaml b/Documentation/netlink/specs/ovpn.yaml new file mode 100644 index 0000000000000..096c51f0c69a8 --- /dev/null +++ b/Documentation/netlink/specs/ovpn.yaml @@ -0,0 +1,367 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# Author: Antonio Quartulli +# +# Copyright (c) 2024-2025, OpenVPN Inc. +# + +name: ovpn + +protocol: genetlink + +doc: Netlink protocol to control OpenVPN network devices + +definitions: + - + type: const + name: nonce-tail-size + value: 8 + - + type: enum + name: cipher-alg + entries: [ none, aes-gcm, chacha20-poly1305 ] + - + type: enum + name: del-peer-reason + entries: + - teardown + - userspace + - expired + - transport-error + - transport-disconnect + - + type: enum + name: key-slot + entries: [ primary, secondary ] + +attribute-sets: + - + name: peer + attributes: + - + name: id + type: u32 + doc: >- + The unique ID of the peer in the device context. To be used to identify + peers during operations for a specific device + checks: + max: 0xFFFFFF + - + name: remote-ipv4 + type: u32 + doc: The remote IPv4 address of the peer + byte-order: big-endian + display-hint: ipv4 + - + name: remote-ipv6 + type: binary + doc: The remote IPv6 address of the peer + display-hint: ipv6 + checks: + exact-len: 16 + - + name: remote-ipv6-scope-id + type: u32 + doc: The scope id of the remote IPv6 address of the peer (RFC2553) + - + name: remote-port + type: u16 + doc: The remote port of the peer + byte-order: big-endian + checks: + min: 1 + - + name: socket + type: u32 + doc: The socket to be used to communicate with the peer + - + name: socket-netnsid + type: s32 + doc: The ID of the netns the socket assigned to this peer lives in + - + name: vpn-ipv4 + type: u32 + doc: The IPv4 address assigned to the peer by the server + byte-order: big-endian + display-hint: ipv4 + - + name: vpn-ipv6 + type: binary + doc: The IPv6 address assigned to the peer by the server + display-hint: ipv6 + checks: + exact-len: 16 + - + name: local-ipv4 + type: u32 + doc: The local IPv4 to be used to send packets to the peer (UDP only) + byte-order: big-endian + display-hint: ipv4 + - + name: local-ipv6 + type: binary + doc: The local IPv6 to be used to send packets to the peer (UDP only) + display-hint: ipv6 + checks: + exact-len: 16 + - + name: local-port + type: u16 + doc: The local port to be used to send packets to the peer (UDP only) + byte-order: big-endian + checks: + min: 1 + - + name: keepalive-interval + type: u32 + doc: >- + The number of seconds after which a keep alive message is sent to the + peer + - + name: keepalive-timeout + type: u32 + doc: >- + The number of seconds from the last activity after which the peer is + assumed dead + - + name: del-reason + type: u32 + doc: The reason why a peer was deleted + enum: del-peer-reason + - + name: vpn-rx-bytes + type: uint + doc: Number of bytes received over the tunnel + - + name: vpn-tx-bytes + type: uint + doc: Number of bytes transmitted over the tunnel + - + name: vpn-rx-packets + type: uint + doc: Number of packets received over the tunnel + - + name: vpn-tx-packets + type: uint + doc: Number of packets transmitted over the tunnel + - + name: link-rx-bytes + type: uint + doc: Number of bytes received at the transport level + - + name: link-tx-bytes + type: uint + doc: Number of bytes transmitted at the transport level + - + name: link-rx-packets + type: uint + doc: Number of packets received at the transport level + - + name: link-tx-packets + type: uint + doc: Number of packets transmitted at the transport level + - + name: keyconf + attributes: + - + name: peer-id + type: u32 + doc: >- + The unique ID of the peer in the device context. To be used to + identify peers during key operations + checks: + max: 0xFFFFFF + - + name: slot + type: u32 + doc: The slot where the key should be stored + enum: key-slot + - + name: key-id + doc: >- + The unique ID of the key in the peer context. Used to fetch the + correct key upon decryption + type: u32 + checks: + max: 7 + - + name: cipher-alg + type: u32 + doc: The cipher to be used when communicating with the peer + enum: cipher-alg + - + name: encrypt-dir + type: nest + doc: Key material for encrypt direction + nested-attributes: keydir + - + name: decrypt-dir + type: nest + doc: Key material for decrypt direction + nested-attributes: keydir + - + name: keydir + attributes: + - + name: cipher-key + type: binary + doc: The actual key to be used by the cipher + checks: + max-len: 256 + - + name: nonce-tail + type: binary + doc: >- + Random nonce to be concatenated to the packet ID, in order to + obtain the actual cipher IV + checks: + exact-len: nonce-tail-size + - + name: ovpn + attributes: + - + name: ifindex + type: u32 + doc: Index of the ovpn interface to operate on + - + name: peer + type: nest + doc: >- + The peer object containing the attributed of interest for the specific + operation + nested-attributes: peer + - + name: keyconf + type: nest + doc: Peer specific cipher configuration + nested-attributes: keyconf + +operations: + list: + - + name: peer-new + attribute-set: ovpn + flags: [ admin-perm ] + doc: Add a remote peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + - + name: peer-set + attribute-set: ovpn + flags: [ admin-perm ] + doc: modify a remote peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + - + name: peer-get + attribute-set: ovpn + flags: [ admin-perm ] + doc: Retrieve data about existing remote peers (or a specific one) + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + reply: + attributes: + - peer + dump: + request: + attributes: + - ifindex + reply: + attributes: + - peer + - + name: peer-del + attribute-set: ovpn + flags: [ admin-perm ] + doc: Delete existing remote peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + - + name: peer-del-ntf + doc: Notification about a peer being deleted + notify: peer-get + mcgrp: peers + + - + name: key-new + attribute-set: ovpn + flags: [ admin-perm ] + doc: Add a cipher key for a specific peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + - + name: key-get + attribute-set: ovpn + flags: [ admin-perm ] + doc: Retrieve non-sensitive data about peer key and cipher + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + reply: + attributes: + - keyconf + - + name: key-swap + attribute-set: ovpn + flags: [ admin-perm ] + doc: Swap primary and secondary session keys for a specific peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + - + name: key-swap-ntf + notify: key-get + doc: >- + Notification about key having exhausted its IV space and requiring + renegotiation + mcgrp: peers + - + name: key-del + attribute-set: ovpn + flags: [ admin-perm ] + doc: Delete cipher key for a specific peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + +mcast-groups: + list: + - + name: peers diff --git a/Documentation/netlink/specs/rt_addr.yaml b/Documentation/netlink/specs/rt-addr.yaml similarity index 89% rename from Documentation/netlink/specs/rt_addr.yaml rename to Documentation/netlink/specs/rt-addr.yaml index df6b23f06a228..4f86aa1075da7 100644 --- a/Documentation/netlink/specs/rt_addr.yaml +++ b/Documentation/netlink/specs/rt-addr.yaml @@ -2,6 +2,7 @@ name: rt-addr protocol: netlink-raw +uapi-header: linux/rtnetlink.h protonum: 0 doc: @@ -49,6 +50,8 @@ definitions: - name: ifa-flags type: flags + name-prefix: ifa-f- + enum-name: entries: - name: secondary @@ -124,6 +127,7 @@ attribute-sets: operations: fixed-header: ifaddrmsg enum-model: directional + name-prefix: rtm- list: - name: newaddr @@ -133,11 +137,6 @@ operations: request: value: 20 attributes: &ifaddr-all - - ifa-family - - ifa-flags - - ifa-prefixlen - - ifa-scope - - ifa-index - address - label - local @@ -150,11 +149,6 @@ operations: request: value: 21 attributes: - - ifa-family - - ifa-flags - - ifa-prefixlen - - ifa-scope - - ifa-index - address - local - @@ -164,8 +158,7 @@ operations: dump: request: value: 22 - attributes: - - ifa-index + attributes: [] reply: value: 20 attributes: *ifaddr-all @@ -177,9 +170,7 @@ operations: do: request: value: 58 - attributes: - - ifa-family - - ifa-index + attributes: [] reply: value: 58 attributes: &mcaddr-attrs @@ -188,8 +179,7 @@ operations: dump: request: value: 58 - attributes: - - ifa-family + attributes: [] reply: value: 58 attributes: *mcaddr-attrs diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt-link.yaml similarity index 99% rename from Documentation/netlink/specs/rt_link.yaml rename to Documentation/netlink/specs/rt-link.yaml index 6b9d5ee87d93a..726dfa083d14b 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -938,6 +938,12 @@ definitions: entries: - name: none - name: default + - + name: ovpn-mode + type: enum + entries: + - p2p + - mp attribute-sets: - @@ -2279,6 +2285,13 @@ attribute-sets: - name: tailroom type: u16 + - + name: linkinfo-ovpn-attrs + attributes: + - + name: mode + type: u8 + enum: ovpn-mode sub-messages: - @@ -2329,6 +2342,9 @@ sub-messages: - value: netkit attribute-set: linkinfo-netkit-attrs + - + value: ovpn + attribute-set: linkinfo-ovpn-attrs - name: linkinfo-member-data-msg formats: diff --git a/Documentation/netlink/specs/rt_neigh.yaml b/Documentation/netlink/specs/rt-neigh.yaml similarity index 100% rename from Documentation/netlink/specs/rt_neigh.yaml rename to Documentation/netlink/specs/rt-neigh.yaml diff --git a/Documentation/netlink/specs/rt_route.yaml b/Documentation/netlink/specs/rt-route.yaml similarity index 93% rename from Documentation/netlink/specs/rt_route.yaml rename to Documentation/netlink/specs/rt-route.yaml index 292469c7d4b9f..800f3a823d473 100644 --- a/Documentation/netlink/specs/rt_route.yaml +++ b/Documentation/netlink/specs/rt-route.yaml @@ -2,6 +2,7 @@ name: rt-route protocol: netlink-raw +uapi-header: linux/rtnetlink.h protonum: 0 doc: @@ -11,6 +12,7 @@ definitions: - name: rtm-type name-prefix: rtn- + enum-name: type: enum entries: - unspec @@ -245,21 +247,19 @@ attribute-sets: operations: enum-model: directional + fixed-header: rtmsg + name-prefix: rtm- list: - name: getroute doc: Dump route information. attribute-set: route-attrs - fixed-header: rtmsg do: request: value: 26 attributes: - - rtm-family - src - - rtm-src-len - dst - - rtm-dst-len - iif - oif - ip-proto @@ -271,15 +271,6 @@ operations: reply: value: 24 attributes: &all-route-attrs - - rtm-family - - rtm-dst-len - - rtm-src-len - - rtm-tos - - rtm-table - - rtm-protocol - - rtm-scope - - rtm-type - - rtm-flags - dst - src - iif @@ -311,8 +302,7 @@ operations: dump: request: value: 26 - attributes: - - rtm-family + attributes: [] reply: value: 24 attributes: *all-route-attrs @@ -320,7 +310,6 @@ operations: name: newroute doc: Create a new route attribute-set: route-attrs - fixed-header: rtmsg do: request: value: 24 @@ -329,7 +318,6 @@ operations: name: delroute doc: Delete an existing route attribute-set: route-attrs - fixed-header: rtmsg do: request: value: 25 diff --git a/Documentation/netlink/specs/rt_rule.yaml b/Documentation/netlink/specs/rt-rule.yaml similarity index 100% rename from Documentation/netlink/specs/rt_rule.yaml rename to Documentation/netlink/specs/rt-rule.yaml diff --git a/Documentation/networking/dccp.rst b/Documentation/networking/dccp.rst deleted file mode 100644 index 91e5c33ba3ff5..0000000000000 --- a/Documentation/networking/dccp.rst +++ /dev/null @@ -1,219 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============= -DCCP protocol -============= - - -.. Contents - - Introduction - - Missing features - - Socket options - - Sysctl variables - - IOCTLs - - Other tunables - - Notes - - -Introduction -============ -Datagram Congestion Control Protocol (DCCP) is an unreliable, connection -oriented protocol designed to solve issues present in UDP and TCP, particularly -for real-time and multimedia (streaming) traffic. -It divides into a base protocol (RFC 4340) and pluggable congestion control -modules called CCIDs. Like pluggable TCP congestion control, at least one CCID -needs to be enabled in order for the protocol to function properly. In the Linux -implementation, this is the TCP-like CCID2 (RFC 4341). Additional CCIDs, such as -the TCP-friendly CCID3 (RFC 4342), are optional. -For a brief introduction to CCIDs and suggestions for choosing a CCID to match -given applications, see section 10 of RFC 4340. - -It has a base protocol and pluggable congestion control IDs (CCIDs). - -DCCP is a Proposed Standard (RFC 2026), and the homepage for DCCP as a protocol -is at http://www.ietf.org/html.charters/dccp-charter.html - - -Missing features -================ -The Linux DCCP implementation does not currently support all the features that are -specified in RFCs 4340...42. - -The known bugs are at: - - http://www.linuxfoundation.org/collaborate/workgroups/networking/todo#DCCP - -For more up-to-date versions of the DCCP implementation, please consider using -the experimental DCCP test tree; instructions for checking this out are on: -http://www.linuxfoundation.org/collaborate/workgroups/networking/dccp_testing#Experimental_DCCP_source_tree - - -Socket options -============== -DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes -a policy ID as argument and can only be set before the connection (i.e. changes -during an established connection are not supported). Currently, two policies are -defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, -and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an -u32 priority value as ancillary data to sendmsg(), where higher numbers indicate -a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to -be formatted using a cmsg(3) message header filled in as follows:: - - cmsg->cmsg_level = SOL_DCCP; - cmsg->cmsg_type = DCCP_SCM_PRIORITY; - cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ - -DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero -value is always interpreted as unbounded queue length. If different from zero, -the interpretation of this parameter depends on the current dequeuing policy -(see above): the "simple" policy will enforce a fixed queue size by returning -EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the -lowest-priority packet first. The default value for this parameter is -initialised from /proc/sys/net/dccp/default/tx_qlen. - -DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of -service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, -the socket will fall back to 0 (which means that no meaningful service code -is present). On active sockets this is set before connect(); specifying more -than one code has no effect (all subsequent service codes are ignored). The -case is different for passive sockets, where multiple service codes (up to 32) -can be set before calling bind(). - -DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet -size (application payload size) in bytes, see RFC 4340, section 14. - -DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs -supported by the endpoint. The option value is an array of type uint8_t whose -size is passed as option length. The minimum array size is 4 elements, the -value returned in the optlen argument always reflects the true number of -built-in CCIDs. - -DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same -time, combining the operation of the next two socket options. This option is -preferable over the latter two, since often applications will use the same -type of CCID for both directions; and mixed use of CCIDs is not currently well -understood. This socket option takes as argument at least one uint8_t value, or -an array of uint8_t values, which must match available CCIDS (see above). CCIDs -must be registered on the socket before calling connect() or listen(). - -DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets -the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. -Please note that the getsockopt argument type here is ``int``, not uint8_t. - -DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. - -DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold -timewait state when closing the connection (RFC 4340, 8.3). The usual case is -that the closing server sends a CloseReq, whereupon the client holds timewait -state. When this boolean socket option is on, the server sends a Close instead -and will enter TIMEWAIT. This option must be set after accept() returns. - -DCCP_SOCKOPT_SEND_CSCOV and DCCP_SOCKOPT_RECV_CSCOV are used for setting the -partial checksum coverage (RFC 4340, sec. 9.2). The default is that checksums -always cover the entire packet and that only fully covered application data is -accepted by the receiver. Hence, when using this feature on the sender, it must -be enabled at the receiver, too with suitable choice of CsCov. - -DCCP_SOCKOPT_SEND_CSCOV sets the sender checksum coverage. Values in the - range 0..15 are acceptable. The default setting is 0 (full coverage), - values between 1..15 indicate partial coverage. - -DCCP_SOCKOPT_RECV_CSCOV is for the receiver and has a different meaning: it - sets a threshold, where again values 0..15 are acceptable. The default - of 0 means that all packets with a partial coverage will be discarded. - Values in the range 1..15 indicate that packets with minimally such a - coverage value are also acceptable. The higher the number, the more - restrictive this setting (see [RFC 4340, sec. 9.2.1]). Partial coverage - settings are inherited to the child socket after accept(). - -The following two options apply to CCID 3 exclusively and are getsockopt()-only. -In either case, a TFRC info struct (defined in ) is returned. - -DCCP_SOCKOPT_CCID_RX_INFO - Returns a ``struct tfrc_rx_info`` in optval; the buffer for optval and - optlen must be set to at least sizeof(struct tfrc_rx_info). - -DCCP_SOCKOPT_CCID_TX_INFO - Returns a ``struct tfrc_tx_info`` in optval; the buffer for optval and - optlen must be set to at least sizeof(struct tfrc_tx_info). - -On unidirectional connections it is useful to close the unused half-connection -via shutdown (SHUT_WR or SHUT_RD): this will reduce per-packet processing costs. - - -Sysctl variables -================ -Several DCCP default parameters can be managed by the following sysctls -(sysctl net.dccp.default or /proc/sys/net/dccp/default): - -request_retries - The number of active connection initiation retries (the number of - Requests minus one) before timing out. In addition, it also governs - the behaviour of the other, passive side: this variable also sets - the number of times DCCP repeats sending a Response when the initial - handshake does not progress from RESPOND to OPEN (i.e. when no Ack - is received after the initial Request). This value should be greater - than 0, suggested is less than 10. Analogue of tcp_syn_retries. - -retries1 - How often a DCCP Response is retransmitted until the listening DCCP - side considers its connecting peer dead. Analogue of tcp_retries1. - -retries2 - The number of times a general DCCP packet is retransmitted. This has - importance for retransmitted acknowledgments and feature negotiation, - data packets are never retransmitted. Analogue of tcp_retries2. - -tx_ccid = 2 - Default CCID for the sender-receiver half-connection. Depending on the - choice of CCID, the Send Ack Vector feature is enabled automatically. - -rx_ccid = 2 - Default CCID for the receiver-sender half-connection; see tx_ccid. - -seq_window = 100 - The initial sequence window (sec. 7.5.2) of the sender. This influences - the local ackno validity and the remote seqno validity windows (7.5.1). - Values in the range Wmin = 32 (RFC 4340, 7.5.2) up to 2^32-1 can be set. - -tx_qlen = 5 - The size of the transmit buffer in packets. A value of 0 corresponds - to an unbounded transmit buffer. - -sync_ratelimit = 125 ms - The timeout between subsequent DCCP-Sync packets sent in response to - sequence-invalid packets on the same socket (RFC 4340, 7.5.4). The unit - of this parameter is milliseconds; a value of 0 disables rate-limiting. - - -IOCTLS -====== -FIONREAD - Works as in udp(7): returns in the ``int`` argument pointer the size of - the next pending datagram in bytes, or 0 when no datagram is pending. - -SIOCOUTQ - Returns the number of unsent data bytes in the socket send queue as ``int`` - into the buffer specified by the argument pointer. - -Other tunables -============== -Per-route rto_min support - CCID-2 supports the RTAX_RTO_MIN per-route setting for the minimum value - of the RTO timer. This setting can be modified via the 'rto_min' option - of iproute2; for example:: - - > ip route change 10.0.0.0/24 rto_min 250j dev wlan0 - > ip route add 10.0.0.254/32 rto_min 800j dev wlan0 - > ip route show dev wlan0 - - CCID-3 also supports the rto_min setting: it is used to define the lower - bound for the expiry of the nofeedback timer. This can be useful on LANs - with very low RTTs (e.g., loopback, Gbit ethernet). - - -Notes -===== -DCCP does not travel through NAT successfully at present on many boxes. This is -because the checksum covers the pseudo-header as per TCP and UDP. Linux NAT -support for DCCP has been added. diff --git a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst index 04e0595bb0a77..3483e498c08eb 100644 --- a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst +++ b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst @@ -31,6 +31,46 @@ separate entry. Statistics ---------- +TX MAC Interface +~~~~~~~~~~~~~~~~ + + - ``ptp_illegal_req``: packets sent to the NIC with PTP request bit set but routed to BMC/FW + - ``ptp_good_ts``: packets successfully routed to MAC with PTP request bit set + - ``ptp_bad_ts``: packets destined for MAC with PTP request bit set but aborted because of some error (e.g., DMA read error) + +TX Extension (TEI) Interface (TTI) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + - ``tti_cm_drop``: control messages dropped at the TX Extension (TEI) Interface because of credit starvation + - ``tti_frame_drop``: packets dropped at the TX Extension (TEI) Interface because of credit starvation + - ``tti_tbi_drop``: packets dropped at the TX BMC Interface (TBI) because of credit starvation + +RXB (RX Buffer) Enqueue +~~~~~~~~~~~~~~~~~~~~~~~ + + - ``rxb_integrity_err[i]``: frames enqueued with integrity errors (e.g., multi-bit ECC errors) on RXB input i + - ``rxb_mac_err[i]``: frames enqueued with MAC end-of-frame errors (e.g., bad FCS) on RXB input i + - ``rxb_parser_err[i]``: frames experienced RPC parser errors + - ``rxb_frm_err[i]``: frames experienced signaling errors (e.g., missing end-of-packet/start-of-packet) on RXB input i + - ``rxb_drbo[i]_frames``: frames received at RXB input i + - ``rxb_drbo[i]_bytes``: bytes received at RXB input i + +RXB (RX Buffer) FIFO +~~~~~~~~~~~~~~~~~~~~ + + - ``rxb_fifo[i]_drop``: transitions into the drop state on RXB pool i + - ``rxb_fifo[i]_dropped_frames``: frames dropped on RXB pool i + - ``rxb_fifo[i]_ecn``: transitions into the ECN mark state on RXB pool i + - ``rxb_fifo[i]_level``: current occupancy of RXB pool i + +RXB (RX Buffer) Dequeue +~~~~~~~~~~~~~~~~~~~~~~~ + + - ``rxb_intf[i]_frames``: frames sent to the output i + - ``rxb_intf[i]_bytes``: bytes sent to the output i + - ``rxb_pbuf[i]_frames``: frames sent to output i from the perspective of internal packet buffer + - ``rxb_pbuf[i]_bytes``: bytes sent to output i from the perspective of internal packet buffer + RPC (Rx parser) ~~~~~~~~~~~~~~~ @@ -44,6 +84,15 @@ RPC (Rx parser) - ``rpc_out_of_hdr_err``: frames where header was larger than parsable region - ``ovr_size_err``: oversized frames +Hardware Queues +~~~~~~~~~~~~~~~ + +1. RX DMA Engine: + + - ``rde_[i]_pkt_err``: packets with MAC EOP, RPC parser, RXB truncation, or RDE frame truncation errors. These error are flagged in the packet metadata because of cut-through support but the actual drop happens once PCIE/RDE is reached. + - ``rde_[i]_pkt_cq_drop``: packets dropped because RCQ is full + - ``rde_[i]_pkt_bdq_drop``: packets dropped because HPQ or PPQ ran out of host buffer + PCIe ~~~~ diff --git a/Documentation/networking/devlink/devlink-info.rst b/Documentation/networking/devlink/devlink-info.rst index 23073bc219d8e..dd6adc4d05596 100644 --- a/Documentation/networking/devlink/devlink-info.rst +++ b/Documentation/networking/devlink/devlink-info.rst @@ -86,6 +86,10 @@ In case software/firmware components are loaded from the disk (e.g. ``/lib/firmware``) only the running version should be reported via the kernel API. +Please note that any security versions reported via devlink are purely +informational. Devlink does not use a secure channel to communicate with +the device. + Generic Versions ================ diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst index 948c8c44e233f..8319f43b5933d 100644 --- a/Documentation/networking/devlink/index.rst +++ b/Documentation/networking/devlink/index.rst @@ -84,6 +84,7 @@ parameters, info versions, and other features it supports. i40e ionic ice + ixgbe mlx4 mlx5 mlxsw diff --git a/Documentation/networking/devlink/ixgbe.rst b/Documentation/networking/devlink/ixgbe.rst new file mode 100644 index 0000000000000..3fce291348fac --- /dev/null +++ b/Documentation/networking/devlink/ixgbe.rst @@ -0,0 +1,122 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +ixgbe devlink support +===================== + +This document describes the devlink features implemented by the ``ixgbe`` +device driver. + +Info versions +============= + +Any of the versions dealing with the security presented by ``devlink-info`` +is purely informational. Devlink does not use a secure channel to communicate +with the device. + +The ``ixgbe`` driver reports the following versions + +.. list-table:: devlink info versions implemented + :widths: 5 5 5 90 + + * - Name + - Type + - Example + - Description + * - ``board.id`` + - fixed + - H49289-000 + - The Product Board Assembly (PBA) identifier of the board. + * - ``fw.undi`` + - running + - 1.1937.0 + - Version of the Option ROM containing the UEFI driver. The version is + reported in ``major.minor.patch`` format. The major version is + incremented whenever a major breaking change occurs, or when the + minor version would overflow. The minor version is incremented for + non-breaking changes and reset to 1 when the major version is + incremented. The patch version is normally 0 but is incremented when + a fix is delivered as a patch against an older base Option ROM. + * - ``fw.undi.srev`` + - running + - 4 + - Number indicating the security revision of the Option ROM. + * - ``fw.bundle_id`` + - running + - 0x80000d0d + - Unique identifier of the firmware image file that was loaded onto + the device. Also referred to as the EETRACK identifier of the NVM. + * - ``fw.mgmt.api`` + - running + - 1.5.1 + - 3-digit version number (major.minor.patch) of the API exported over + the AdminQ by the management firmware. Used by the driver to + identify what commands are supported. Historical versions of the + kernel only displayed a 2-digit version number (major.minor). + * - ``fw.mgmt.build`` + - running + - 0x305d955f + - Unique identifier of the source for the management firmware. + * - ``fw.mgmt.srev`` + - running + - 3 + - Number indicating the security revision of the firmware. + * - ``fw.psid.api`` + - running + - 0.80 + - Version defining the format of the flash contents. + * - ``fw.netlist`` + - running + - 1.1.2000-6.7.0 + - The version of the netlist module. This module defines the device's + Ethernet capabilities and default settings, and is used by the + management firmware as part of managing link and device + connectivity. + * - ``fw.netlist.build`` + - running + - 0xee16ced7 + - The first 4 bytes of the hash of the netlist module contents. + +Flash Update +============ + +The ``ixgbe`` driver implements support for flash update using the +``devlink-flash`` interface. It supports updating the device flash using a +combined flash image that contains the ``fw.mgmt``, ``fw.undi``, and +``fw.netlist`` components. + +.. list-table:: List of supported overwrite modes + :widths: 5 95 + + * - Bits + - Behavior + * - ``DEVLINK_FLASH_OVERWRITE_SETTINGS`` + - Do not preserve settings stored in the flash components being + updated. This includes overwriting the port configuration that + determines the number of physical functions the device will + initialize with. + * - ``DEVLINK_FLASH_OVERWRITE_SETTINGS`` and ``DEVLINK_FLASH_OVERWRITE_IDENTIFIERS`` + - Do not preserve either settings or identifiers. Overwrite everything + in the flash with the contents from the provided image, without + performing any preservation. This includes overwriting device + identifying fields such as the MAC address, Vital product Data (VPD) area, + and device serial number. It is expected that this combination be used with an + image customized for the specific device. + +Reload +====== + +The ``ixgbe`` driver supports activating new firmware after a flash update +using ``DEVLINK_CMD_RELOAD`` with the ``DEVLINK_RELOAD_ACTION_FW_ACTIVATE`` +action. + +.. code:: shell + + $ devlink dev reload pci/0000:01:00.0 reload action fw_activate + +The new firmware is activated by issuing a device specific Embedded +Management Processor reset which requests the device to reset and reload the +EMP firmware image. + +The driver does not currently support reloading the driver via +``DEVLINK_RELOAD_ACTION_DRIVER_REINIT``. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c64133d309bfe..ac90b82f3ce95 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -48,7 +48,6 @@ Contents: ax25 bonding cdc_mbim - dccp dctcp devmem dns_resolver diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 5c63ab928b97d..b43222ee57cf9 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -37,8 +37,8 @@ ip_no_pmtu_disc - INTEGER Mode 3 is a hardened pmtu discover mode. The kernel will only accept fragmentation-needed errors if the underlying protocol can verify them besides a plain socket lookup. Current - protocols for which pmtu events will be honored are TCP, SCTP - and DCCP as they verify e.g. the sequence number or the + protocols for which pmtu events will be honored are TCP and + SCTP as they verify e.g. the sequence number or the association. This mode should not be enabled globally but is only intended to secure e.g. name servers in namespaces where TCP path mtu must still work but path MTU information of other diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 6327e689e8a84..ca8605eb82ffc 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -131,7 +131,7 @@ struct ref_tracker_dir refcnt_tracker struct list_head link_watch_list enum:8 reg_state bool dismantle -enum:16 rtnl_link_state +bool rtnl_link_initilizing bool needs_free_netdev void*priv_destructor struct net_device struct netpoll_info* npinfo read_mostly napi_poll/napi_poll_lock diff --git a/Documentation/networking/net_cachelines/snmp.rst b/Documentation/networking/net_cachelines/snmp.rst index bc96efc92cf5b..bd44b3eebbef7 100644 --- a/Documentation/networking/net_cachelines/snmp.rst +++ b/Documentation/networking/net_cachelines/snmp.rst @@ -37,6 +37,8 @@ unsigned_long LINUX_MIB_TIMEWAITKILLED unsigned_long LINUX_MIB_PAWSACTIVEREJECTED unsigned_long LINUX_MIB_PAWSESTABREJECTED unsigned_long LINUX_MIB_TSECR_REJECTED +unsigned_long LINUX_MIB_PAWS_OLD_ACK +unsigned_long LINUX_MIB_PAWS_TW_REJECTED unsigned_long LINUX_MIB_DELAYEDACKLOST unsigned_long LINUX_MIB_LISTENOVERFLOWS unsigned_long LINUX_MIB_LISTENDROPS diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index eab601ab2db0b..7ebb6c36482de 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -8,7 +8,7 @@ Network Devices, the Kernel, and You! Introduction ============ The following is a random collection of documentation regarding -network devices. +network devices. It is intended for driver developers. struct net_device lifetime rules ================================ @@ -314,13 +314,8 @@ napi->poll: softirq will be called with interrupts disabled by netconsole. -struct netdev_queue_mgmt_ops synchronization rules -================================================== - -All queue management ndo callbacks are holding netdev instance lock. - -RTNL and netdev instance lock -============================= +netdev instance lock +==================== Historically, all networking control operations were protected by a single global lock known as ``rtnl_lock``. There is an ongoing effort to replace this @@ -328,10 +323,13 @@ global lock with separate locks for each network namespace. Additionally, properties of individual netdev are increasingly protected by per-netdev locks. For device drivers that implement shaping or queue management APIs, all control -operations will be performed under the netdev instance lock. Currently, this -instance lock is acquired within the context of ``rtnl_lock``. The drivers -can also explicitly request instance lock to be acquired via -``request_ops_lock``. In the future, there will be an option for individual +operations will be performed under the netdev instance lock. +Drivers can also explicitly request instance lock to be held during ops +by setting ``request_ops_lock`` to true. Code comments and docs refer +to drivers which have ops called under the instance lock as "ops locked". +See also the documentation of the ``lock`` member of struct net_device. + +In the future, there will be an option for individual drivers to opt out of using ``rtnl_lock`` and instead perform their control operations directly under the netdev instance lock. @@ -344,18 +342,59 @@ functions handle acquiring the instance lock themselves, while the ``netif_xxx`` functions assume that the driver has already acquired the instance lock. +struct net_device_ops +--------------------- + +``ndos`` are called without holding the instance lock for most drivers. + +"Ops locked" drivers will have most of the ``ndos`` invoked under +the instance lock. + +struct ethtool_ops +------------------ + +Similarly to ``ndos`` the instance lock is only held for select drivers. +For "ops locked" drivers all ethtool ops without exceptions should +be called under the instance lock. + +struct netdev_stat_ops +---------------------- + +"qstat" ops are invoked under the instance lock for "ops locked" drivers, +and under rtnl_lock for all other drivers. + +struct net_shaper_ops +--------------------- + +All net shaper callbacks are invoked while holding the netdev instance +lock. ``rtnl_lock`` may or may not be held. + +Note that supporting net shapers automatically enables "ops locking". + +struct netdev_queue_mgmt_ops +---------------------------- + +All queue management callbacks are invoked while holding the netdev instance +lock. ``rtnl_lock`` may or may not be held. + +Note that supporting struct netdev_queue_mgmt_ops automatically enables +"ops locking". + Notifiers and netdev instance lock -================================== +---------------------------------- For device drivers that implement shaping or queue management APIs, some of the notifiers (``enum netdev_cmd``) are running under the netdev instance lock. +The following netdev notifiers are always run under the instance lock: +* ``NETDEV_XDP_FEAT_CHANGE`` + For devices with locked ops, currently only the following notifiers are running under the lock: +* ``NETDEV_CHANGE`` * ``NETDEV_REGISTER`` * ``NETDEV_UP`` -* ``NETDEV_CHANGE`` The following notifiers are running without the lock: * ``NETDEV_UNREGISTER`` diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index e807e18ba32ac..fe2ea73be4417 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1172,3 +1172,18 @@ adjusted through sysctls in /proc/net/rxrpc/: header plus exactly 1412 bytes of data. The terminal packet must contain a four byte header plus any amount of data. In any event, a jumbo packet may not exceed rxrpc_rx_mtu in size. + + +API Function Reference +====================== + +.. kernel-doc:: net/rxrpc/af_rxrpc.c +.. kernel-doc:: net/rxrpc/call_object.c +.. kernel-doc:: net/rxrpc/key.c +.. kernel-doc:: net/rxrpc/oob.c +.. kernel-doc:: net/rxrpc/peer_object.c +.. kernel-doc:: net/rxrpc/recvmsg.c +.. kernel-doc:: net/rxrpc/rxgk.c +.. kernel-doc:: net/rxrpc/rxkad.c +.. kernel-doc:: net/rxrpc/sendmsg.c +.. kernel-doc:: net/rxrpc/server_key.c diff --git a/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst b/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst index c3f6a83294dc4..4b3432753eb9f 100644 --- a/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst +++ b/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst @@ -188,7 +188,7 @@ objdump 编行。如果没有调试符号,您将看到所示例程的汇编程序代码,但是如果内核有调试 符号,C代码也将可见(调试符号可以在内核配置菜单的hacking项中启用)。例如:: - $ objdump -r -S -l --disassemble net/dccp/ipv4.o + $ objdump -r -S -l --disassemble net/ipv4/tcp.o .. note:: diff --git a/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst b/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst index b25ecc44d7354..80ea5677ee524 100644 --- a/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst +++ b/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst @@ -191,7 +191,7 @@ objdump 編行。如果沒有調試符號,您將看到所示例程的彙編程序代碼,但是如果內核有調試 符號,C代碼也將可見(調試符號可以在內核配置菜單的hacking項中啓用)。例如:: - $ objdump -r -S -l --disassemble net/dccp/ipv4.o + $ objdump -r -S -l --disassemble net/ipv4/tcp.o .. note:: diff --git a/Documentation/userspace-api/netlink/netlink-raw.rst b/Documentation/userspace-api/netlink/netlink-raw.rst index 1990eea772d08..31fc91020eb34 100644 --- a/Documentation/userspace-api/netlink/netlink-raw.rst +++ b/Documentation/userspace-api/netlink/netlink-raw.rst @@ -62,7 +62,7 @@ Sub-messages ------------ Several raw netlink families such as -:doc:`rt_link<../../networking/netlink_spec/rt_link>` and +:doc:`rt-link<../../networking/netlink_spec/rt-link>` and :doc:`tc<../../networking/netlink_spec/tc>` use attribute nesting as an abstraction to carry module specific information. diff --git a/MAINTAINERS b/MAINTAINERS index 8c7d796131a81..82e4b96030df5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6547,15 +6547,6 @@ S: Maintained F: Documentation/scsi/dc395x.rst F: drivers/scsi/dc395x.* -DCCP PROTOCOL -L: dccp@vger.kernel.org -S: Orphan -W: http://www.linuxfoundation.org/collaborate/workgroups/networking/dccp -F: include/linux/dccp.h -F: include/linux/tfrc.h -F: include/uapi/linux/dccp.h -F: net/dccp/ - DEBUGOBJECTS: M: Thomas Gleixner L: linux-kernel@vger.kernel.org @@ -18140,6 +18131,17 @@ F: arch/openrisc/ F: drivers/irqchip/irq-ompic.c F: drivers/irqchip/irq-or1k-* +OPENVPN DATA CHANNEL OFFLOAD +M: Antonio Quartulli +L: openvpn-devel@lists.sourceforge.net (subscribers-only) +L: netdev@vger.kernel.org +S: Supported +T: git https://github.com/OpenVPN/linux-kernel-ovpn.git +F: Documentation/netlink/specs/ovpn.yaml +F: drivers/net/ovpn/ +F: include/uapi/linux/ovpn.h +F: tools/testing/selftests/net/ovpn/ + OPENVSWITCH M: Aaron Conole M: Eelco Chaudron @@ -20624,6 +20626,14 @@ S: Maintained F: Documentation/devicetree/bindings/usb/renesas,rzn1-usbf.yaml F: drivers/usb/gadget/udc/renesas_usbf.c +RENESAS RZ/V2H(P) DWMAC GBETH GLUE LAYER DRIVER +M: Lad Prabhakar +L: netdev@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/renesas,r9a09g057-gbeth.yaml +F: drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c + RENESAS RZ/V2M I2C DRIVER M: Fabrizio Castro L: linux-i2c@vger.kernel.org diff --git a/README b/README index fd903645e6de0..e69de29bb2d1d 100644 --- a/README +++ b/README @@ -1,18 +0,0 @@ -Linux kernel -============ - -There are several guides for kernel developers and users. These guides can -be rendered in a number of formats, like HTML and PDF. Please read -Documentation/admin-guide/README.rst first. - -In order to build the documentation, use ``make htmldocs`` or -``make pdfdocs``. The formatted documentation can also be read online at: - - https://www.kernel.org/doc/html/latest/ - -There are various text files in the Documentation/ subdirectory, -several of them using the reStructuredText markup notation. - -Please read the Documentation/process/changes.rst file, as it contains the -requirements for building and running the kernel, and information about -the problems which may result by upgrading your kernel. diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 31ecb8b7b9f1e..85bd696b0e902 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -267,8 +267,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 1f57514624d51..93003ceb51bbf 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -263,8 +263,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 02db7a48e57e7..fa6ed1bb01e3c 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -270,8 +270,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index f0e673cb17eb7..80ca001d5eab5 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -260,8 +260,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index e8ca5a50b86da..5e35e4b1fe163 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -262,8 +262,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index b3a270441bb17..11bc3f3c7576f 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -261,8 +261,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index d215dba006ced..343f702210df1 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -281,8 +281,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index a888ed93ff829..6104a5a245947 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -259,8 +259,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index b481782375f68..a99dd6c7c9c88 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -260,8 +260,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 6eba743d8eb5c..0d87c119d4c46 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -261,8 +261,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 9bdbb418ffa88..841f9615b6abc 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -256,8 +256,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index e1cf20fa53431..12c5dc92ea4c8 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -257,8 +257,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/mips/configs/bigsur_defconfig b/arch/mips/configs/bigsur_defconfig index 8f7c36868204a..97d2cd9972850 100644 --- a/arch/mips/configs/bigsur_defconfig +++ b/arch/mips/configs/bigsur_defconfig @@ -81,7 +81,6 @@ CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m CONFIG_IP_VS_FTP=m -CONFIG_IP_DCCP=m CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig index 12f3eed8a9468..df55fe6df8bab 100644 --- a/arch/mips/configs/gpr_defconfig +++ b/arch/mips/configs/gpr_defconfig @@ -84,7 +84,6 @@ CONFIG_BRIDGE_EBT_MARK_T=m CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -CONFIG_IP_DCCP=m CONFIG_IP_SCTP=m CONFIG_TIPC=m CONFIG_ATM=y diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index 06b7a0b97ecae..487bd84043f7c 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -130,7 +130,6 @@ CONFIG_BRIDGE_EBT_MARK_T=m CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -CONFIG_IP_DCCP=m CONFIG_IP_SCTP=m CONFIG_TIPC=m CONFIG_ATM=y diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 1bc3466bc909e..ae45f70b29f0d 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -87,7 +87,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_IP_DCCP=m CONFIG_BT=m CONFIG_BT_RFCOMM=m CONFIG_BT_RFCOMM_TTY=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index a91a766b71a44..5e0f2cd7e62b9 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -225,7 +225,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m CONFIG_TIPC=m CONFIG_ATM=m CONFIG_ATM_CLIP=m diff --git a/ci/diffs/.keep b/ci/diffs/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch b/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch new file mode 100644 index 0000000000000..3b6139225e7bf --- /dev/null +++ b/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch @@ -0,0 +1,33 @@ +From 5440a12ac8fb2a8e051c597fcf5d85b427fe612a Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Fri, 13 Oct 2023 12:44:34 -0700 +Subject: [PATCH] Revert "bpf: Avoid unnecessary audit log for CPU security + mitigations" + +This reverts commit 236334aeec0f93217cf9235f2004e61a0a1a5985. +--- + include/linux/bpf.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index f0891ba24cb1..61bde4520f5c 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void) + + static inline bool bpf_bypass_spec_v1(void) + { +- return cpu_mitigations_off() || perfmon_capable(); ++ return perfmon_capable() || cpu_mitigations_off(); + } + + static inline bool bpf_bypass_spec_v4(void) + { +- return cpu_mitigations_off() || perfmon_capable(); ++ return perfmon_capable() || cpu_mitigations_off(); + } + + int bpf_map_new_fd(struct bpf_map *map, int flags); +-- +2.34.1 + diff --git a/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch b/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch new file mode 100644 index 0000000000000..63bdd28adedd2 --- /dev/null +++ b/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch @@ -0,0 +1,69 @@ +From c71766e8ff7a7f950522d25896fba758585500df Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Mon, 22 Apr 2024 21:14:40 -0700 +Subject: [PATCH] arch/Kconfig: Move SPECULATION_MITIGATIONS to arch/Kconfig + +SPECULATION_MITIGATIONS is currently defined only for x86. As a result, +IS_ENABLED(CONFIG_SPECULATION_MITIGATIONS) is always false for other +archs. f337a6a21e2f effectively set "mitigations=off" by default on +non-x86 archs, which is not desired behavior. Jakub observed this +change when running bpf selftests on s390 and arm64. + +Fix this by moving SPECULATION_MITIGATIONS to arch/Kconfig so that it is +available in all archs and thus can be used safely in kernel/cpu.c + +Fixes: f337a6a21e2f ("x86/cpu: Actually turn off mitigations by default for SPECULATION_MITIGATIONS=n") +Cc: stable@vger.kernel.org +Cc: Sean Christopherson +Cc: Ingo Molnar +Cc: Daniel Sneddon +Cc: Jakub Kicinski +Signed-off-by: Song Liu +--- + arch/Kconfig | 10 ++++++++++ + arch/x86/Kconfig | 10 ---------- + 2 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 9f066785bb71..8f4af75005f8 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1609,4 +1609,14 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT + # strict alignment always, even with -falign-functions. + def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG + ++menuconfig SPECULATION_MITIGATIONS ++ bool "Mitigations for speculative execution vulnerabilities" ++ default y ++ help ++ Say Y here to enable options which enable mitigations for ++ speculative execution hardware vulnerabilities. ++ ++ If you say N, all mitigations will be disabled. You really ++ should know what you are doing to say so. ++ + endmenu +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 39886bab943a..50c890fce5e0 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2486,16 +2486,6 @@ config PREFIX_SYMBOLS + def_bool y + depends on CALL_PADDING && !CFI_CLANG + +-menuconfig SPECULATION_MITIGATIONS +- bool "Mitigations for speculative execution vulnerabilities" +- default y +- help +- Say Y here to enable options which enable mitigations for +- speculative execution hardware vulnerabilities. +- +- If you say N, all mitigations will be disabled. You really +- should know what you are doing to say so. +- + if SPECULATION_MITIGATIONS + + config MITIGATION_PAGE_TABLE_ISOLATION +-- +2.43.0 + diff --git a/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch b/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch new file mode 100644 index 0000000000000..a13d767197413 --- /dev/null +++ b/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch @@ -0,0 +1,94 @@ +From fb9a697860acd8f54f2ba6647923794378eb33da Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Sun, 26 Nov 2023 21:03:42 -0800 +Subject: [PATCH] bpf: Fix a few selftest failures due to llvm18 change + +With latest upstream llvm18, the following test cases failed: + + $ ./test_progs -j + #13/2 bpf_cookie/multi_kprobe_link_api:FAIL + #13/3 bpf_cookie/multi_kprobe_attach_api:FAIL + #13 bpf_cookie:FAIL + #77 fentry_fexit:FAIL + #78/1 fentry_test/fentry:FAIL + #78 fentry_test:FAIL + #82/1 fexit_test/fexit:FAIL + #82 fexit_test:FAIL + #112/1 kprobe_multi_test/skel_api:FAIL + #112/2 kprobe_multi_test/link_api_addrs:FAIL + [...] + #112 kprobe_multi_test:FAIL + #356/17 test_global_funcs/global_func17:FAIL + #356 test_global_funcs:FAIL + +Further analysis shows llvm upstream patch [1] is responsible for the above +failures. For example, for function bpf_fentry_test7() in net/bpf/test_run.c, +without [1], the asm code is: + + 0000000000000400 : + 400: f3 0f 1e fa endbr64 + 404: e8 00 00 00 00 callq 0x409 + 409: 48 89 f8 movq %rdi, %rax + 40c: c3 retq + 40d: 0f 1f 00 nopl (%rax) + +... and with [1], the asm code is: + + 0000000000005d20 : + 5d20: e8 00 00 00 00 callq 0x5d25 + 5d25: c3 retq + +... and is called instead of +and this caused test failures for #13/#77 etc. except #356. + +For test case #356/17, with [1] (progs/test_global_func17.c)), the main prog +looks like: + + 0000000000000000 : + 0: b4 00 00 00 2a 00 00 00 w0 = 0x2a + 1: 95 00 00 00 00 00 00 00 exit + +... which passed verification while the test itself expects a verification +failure. + +Let us add 'barrier_var' style asm code in both places to prevent function +specialization which caused selftests failure. + + [1] https://github.com/llvm/llvm-project/pull/72903 + +Signed-off-by: Yonghong Song +Signed-off-by: Daniel Borkmann +Link: https://lore.kernel.org/bpf/20231127050342.1945270-1-yonghong.song@linux.dev +--- + net/bpf/test_run.c | 2 +- + tools/testing/selftests/bpf/progs/test_global_func17.c | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c +index c9fdcc5cdce1..711cf5d59816 100644 +--- a/net/bpf/test_run.c ++++ b/net/bpf/test_run.c +@@ -542,7 +542,7 @@ struct bpf_fentry_test_t { + + int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) + { +- asm volatile (""); ++ asm volatile ("": "+r"(arg)); + return (long)arg; + } + +diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c +index a32e11c7d933..5de44b09e8ec 100644 +--- a/tools/testing/selftests/bpf/progs/test_global_func17.c ++++ b/tools/testing/selftests/bpf/progs/test_global_func17.c +@@ -5,6 +5,7 @@ + + __noinline int foo(int *p) + { ++ barrier_var(p); + return p ? (*p = 42) : 0; + } + +-- +2.34.1 + diff --git a/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch b/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch new file mode 100644 index 0000000000000..5832a42664706 --- /dev/null +++ b/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch @@ -0,0 +1,67 @@ +From dfce9cb3140592b886838e06f3e0c25fea2a9cae Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Thu, 30 Nov 2023 18:46:40 -0800 +Subject: [PATCH 1/1] bpf: Fix a verifier bug due to incorrect branch offset + comparison with cpu=v4 + +Bpf cpu=v4 support is introduced in [1] and Commit 4cd58e9af8b9 +("bpf: Support new 32bit offset jmp instruction") added support for new +32bit offset jmp instruction. Unfortunately, in function +bpf_adj_delta_to_off(), for new branch insn with 32bit offset, the offset +(plus/minor a small delta) compares to 16-bit offset bound +[S16_MIN, S16_MAX], which caused the following verification failure: + $ ./test_progs-cpuv4 -t verif_scale_pyperf180 + ... + insn 10 cannot be patched due to 16-bit range + ... + libbpf: failed to load object 'pyperf180.bpf.o' + scale_test:FAIL:expect_success unexpected error: -12 (errno 12) + #405 verif_scale_pyperf180:FAIL + +Note that due to recent llvm18 development, the patch [2] (already applied +in bpf-next) needs to be applied to bpf tree for testing purpose. + +The fix is rather simple. For 32bit offset branch insn, the adjusted +offset compares to [S32_MIN, S32_MAX] and then verification succeeded. + + [1] https://lore.kernel.org/all/20230728011143.3710005-1-yonghong.song@linux.dev + [2] https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev + +Fixes: 4cd58e9af8b9 ("bpf: Support new 32bit offset jmp instruction") +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/20231201024640.3417057-1-yonghong.song@linux.dev +--- + kernel/bpf/core.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index cd3afe57ece3..fe254ae035fe 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -371,14 +371,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, + static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, s32 curr, const bool probe_pass) + { +- const s32 off_min = S16_MIN, off_max = S16_MAX; ++ s64 off_min, off_max, off; + s32 delta = end_new - end_old; +- s32 off; + +- if (insn->code == (BPF_JMP32 | BPF_JA)) ++ if (insn->code == (BPF_JMP32 | BPF_JA)) { + off = insn->imm; +- else ++ off_min = S32_MIN; ++ off_max = S32_MAX; ++ } else { + off = insn->off; ++ off_min = S16_MIN; ++ off_max = S16_MAX; ++ } + + if (curr < pos && curr + off + 1 >= end_old) + off += delta; +-- +2.34.1 + diff --git a/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch b/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch new file mode 100644 index 0000000000000..ea6b2386d0345 --- /dev/null +++ b/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch @@ -0,0 +1,40 @@ +From patchwork Fri Aug 2 18:54:34 2024 +From: Yonghong Song +Subject: [PATCH bpf-next] selftests/bpf: Fix a btf_dump selftest failure + +Jakub reported bpf selftest "btf_dump" failure after forwarding to +v6.11-rc1 with netdev. + Error: #33 btf_dump + Error: #33/15 btf_dump/btf_dump: var_data + btf_dump_data:FAIL:find type id unexpected find type id: actual -2 < expected 0 + +The reason for the failure is due to + commit 94ede2a3e913 ("profiling: remove stale percpu flip buffer variables") +where percpu static variable "cpu_profile_flip" is removed. + +Let us replace "cpu_profile_flip" with a variable in bpf subsystem +so whenever that variable gets deleted or renamed, we can detect the +failure immediately. In this case, I picked a static percpu variable +"bpf_cgrp_storage_busy" which is defined in kernel/bpf/bpf_cgrp_storage.c. + +Reported-by: Jakub Kicinski +Signed-off-by: Yonghong Song +--- + tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c +index 09a8e6f9b379..b293b8501fd6 100644 +--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c ++++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c +@@ -805,8 +805,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, + "int cpu_number = (int)100", 100); + #endif +- TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, +- "static int cpu_profile_flip = (int)2", 2); ++ TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT, ++ "static int bpf_cgrp_storage_busy = (int)2", 2); + } + + static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, diff --git a/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch b/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch new file mode 100644 index 0000000000000..bd12bd9b3fba5 --- /dev/null +++ b/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch @@ -0,0 +1,99 @@ +From c8268f8e9fa33c32e1f2f86fc7b703408a396c70 Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 27 Oct 2023 11:24:24 -0700 +Subject: [PATCH] net: bpf: Use sockopt_lock_sock() in ip_sock_set_tos() + +With latest sync from net-next tree, bpf-next has a bpf selftest failure: + [root@arch-fb-vm1 bpf]# ./test_progs -t setget_sockopt + ... + [ 76.194349] ============================================ + [ 76.194682] WARNING: possible recursive locking detected + [ 76.195039] 6.6.0-rc7-g37884503df08-dirty #67 Tainted: G W OE + [ 76.195518] -------------------------------------------- + [ 76.195852] new_name/154 is trying to acquire lock: + [ 76.196159] ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: ip_sock_set_tos+0x19/0x30 + [ 76.196669] + [ 76.196669] but task is already holding lock: + [ 76.197028] ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_listen+0x21/0x70 + [ 76.197517] + [ 76.197517] other info that might help us debug this: + [ 76.197919] Possible unsafe locking scenario: + [ 76.197919] + [ 76.198287] CPU0 + [ 76.198444] ---- + [ 76.198600] lock(sk_lock-AF_INET); + [ 76.198831] lock(sk_lock-AF_INET); + [ 76.199062] + [ 76.199062] *** DEADLOCK *** + [ 76.199062] + [ 76.199420] May be due to missing lock nesting notation + [ 76.199420] + [ 76.199879] 2 locks held by new_name/154: + [ 76.200131] #0: ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_listen+0x21/0x70 + [ 76.200644] #1: ffffffff90f96a40 (rcu_read_lock){....}-{1:2}, at: __cgroup_bpf_run_filter_sock_ops+0x55/0x290 + [ 76.201268] + [ 76.201268] stack backtrace: + [ 76.201538] CPU: 4 PID: 154 Comm: new_name Tainted: G W OE 6.6.0-rc7-g37884503df08-dirty #67 + [ 76.202134] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 + [ 76.202699] Call Trace: + [ 76.202858] + [ 76.203002] dump_stack_lvl+0x4b/0x80 + [ 76.203239] __lock_acquire+0x740/0x1ec0 + [ 76.203503] lock_acquire+0xc1/0x2a0 + [ 76.203766] ? ip_sock_set_tos+0x19/0x30 + [ 76.204050] ? sk_stream_write_space+0x12a/0x230 + [ 76.204389] ? lock_release+0xbe/0x260 + [ 76.204661] lock_sock_nested+0x32/0x80 + [ 76.204942] ? ip_sock_set_tos+0x19/0x30 + [ 76.205208] ip_sock_set_tos+0x19/0x30 + [ 76.205452] do_ip_setsockopt+0x4b3/0x1580 + [ 76.205719] __bpf_setsockopt+0x62/0xa0 + [ 76.205963] bpf_sock_ops_setsockopt+0x11/0x20 + [ 76.206247] bpf_prog_630217292049c96e_bpf_test_sockopt_int+0xbc/0x123 + [ 76.206660] bpf_prog_493685a3bae00bbd_bpf_test_ip_sockopt+0x49/0x4b + [ 76.207055] bpf_prog_b0bcd27f269aeea0_skops_sockopt+0x44c/0xec7 + [ 76.207437] __cgroup_bpf_run_filter_sock_ops+0xda/0x290 + [ 76.207829] __inet_listen_sk+0x108/0x1b0 + [ 76.208122] inet_listen+0x48/0x70 + [ 76.208373] __sys_listen+0x74/0xb0 + [ 76.208630] __x64_sys_listen+0x16/0x20 + [ 76.208911] do_syscall_64+0x3f/0x90 + [ 76.209174] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 + ... + +Both ip_sock_set_tos() and inet_listen() calls lock_sock(sk) which +caused a dead lock. + +To fix the issue, use sockopt_lock_sock() in ip_sock_set_tos() +instead. sockopt_lock_sock() will avoid lock_sock() if it is in bpf +context. + +Fixes: 878d951c6712 ("inet: lock the socket in ip_sock_set_tos()") +Suggested-by: Martin KaFai Lau +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/bpf/20231027182424.1444845-1-yonghong.song@linux.dev +--- + net/ipv4/ip_sockglue.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c +index 9c68b6b74d9f..2efc53526a38 100644 +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -602,9 +602,9 @@ void __ip_sock_set_tos(struct sock *sk, int val) + + void ip_sock_set_tos(struct sock *sk, int val) + { +- lock_sock(sk); ++ sockopt_lock_sock(sk); + __ip_sock_set_tos(sk, val); +- release_sock(sk); ++ sockopt_release_sock(sk); + } + EXPORT_SYMBOL(ip_sock_set_tos); + +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch b/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch new file mode 100644 index 0000000000000..da5bcdc455967 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch @@ -0,0 +1,51 @@ +From 41c24102af7b6236277a214428b203d51a3462df Mon Sep 17 00:00:00 2001 +From: Stanislav Fomichev +Date: Thu, 25 Jul 2024 14:40:29 -0700 +Subject: [PATCH 1/1] selftests/bpf: Filter out _GNU_SOURCE when compiling + test_cpp + +Jakub reports build failures when merging linux/master with net tree: + +CXX test_cpp +In file included from :454: +:2:9: error: '_GNU_SOURCE' macro redefined [-Werror,-Wmacro-redefined] + 2 | #define _GNU_SOURCE + | ^ +:445:9: note: previous definition is here + 445 | #define _GNU_SOURCE 1 + +The culprit is commit cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to +CFLAGS in lib.mk") which unconditionally added -D_GNU_SOUCE to CLFAGS. +Apparently clang++ also unconditionally adds it for the C++ targets [0] +which causes a conflict. Add small change in the selftests makefile +to filter it out for test_cpp. + +Not sure which tree it should go via, targeting bpf for now, but net +might be better? + +0: https://stackoverflow.com/questions/11670581/why-is-gnu-source-defined-by-default-and-how-to-turn-it-off + +Signed-off-by: Stanislav Fomichev +Signed-off-by: Andrii Nakryiko +Acked-by: Jiri Olsa +Link: https://lore.kernel.org/bpf/20240725214029.1760809-1-sdf@fomichev.me +--- + tools/testing/selftests/bpf/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index dd49c1d23a60..81d4757ecd4c 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -713,7 +713,7 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp + # Make sure we are able to include and link libbpf against c++. + $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) + $(call msg,CXX,,$@) +- $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ ++ $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + + # Benchmark runner + $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch b/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch new file mode 100644 index 0000000000000..4ebfe20b24707 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch @@ -0,0 +1,50 @@ +From f3d2080e8cf23125f79e345061149ae40f66816f Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Mon, 3 Jun 2024 23:43:17 -0700 +Subject: [PATCH bpf-next] selftests/bpf: Fix bpf_cookie and find_vma in nested + VM + +bpf_cookie and find_vma are flaky in nested VMs, which is used by some CI +systems. It turns out these failures are caused by unreliable perf event +in nested VM. Fix these by: + + 1. Use PERF_COUNT_SW_CPU_CLOCK in find_vma; + 2. Increase sample_freq in bpf_cookie. + +Signed-off-by: Song Liu +--- + tools/testing/selftests/bpf/prog_tests/bpf_cookie.c | 2 +- + tools/testing/selftests/bpf/prog_tests/find_vma.c | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +index 4407ea428e77..070c52c312e5 100644 +--- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c ++++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +@@ -451,7 +451,7 @@ static void pe_subtest(struct test_bpf_cookie *skel) + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; +- attr.sample_freq = 1000; ++ attr.sample_freq = 10000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (!ASSERT_GE(pfd, 0, "perf_fd")) + goto cleanup; +diff --git a/tools/testing/selftests/bpf/prog_tests/find_vma.c b/tools/testing/selftests/bpf/prog_tests/find_vma.c +index 5165b38f0e59..f7619e0ade10 100644 +--- a/tools/testing/selftests/bpf/prog_tests/find_vma.c ++++ b/tools/testing/selftests/bpf/prog_tests/find_vma.c +@@ -29,8 +29,8 @@ static int open_pe(void) + + /* create perf event */ + attr.size = sizeof(attr); +- attr.type = PERF_TYPE_HARDWARE; +- attr.config = PERF_COUNT_HW_CPU_CYCLES; ++ attr.type = PERF_TYPE_SOFTWARE; ++ attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 1000; + pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch b/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch new file mode 100644 index 0000000000000..d55d2e7af8651 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch @@ -0,0 +1,78 @@ +From 100888fb6d8a185866b1520031ee7e3182b173de Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 10 Nov 2023 11:36:44 -0800 +Subject: [PATCH] selftests/bpf: Fix pyperf180 compilation failure with clang18 + +With latest clang18 (main branch of llvm-project repo), when building bpf selftests, + [~/work/bpf-next (master)]$ make -C tools/testing/selftests/bpf LLVM=1 -j + +The following compilation error happens: + fatal error: error in backend: Branch target out of insn range + ... + Stack dump: + 0. Program arguments: clang -g -Wall -Werror -D__TARGET_ARCH_x86 -mlittle-endian + -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/include + -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -I/home/yhs/work/bpf-next/tools/include/uapi + -I/home/yhs/work/bpf-next/tools/testing/selftests/usr/include -idirafter + /home/yhs/work/llvm-project/llvm/build.18/install/lib/clang/18/include -idirafter /usr/local/include + -idirafter /usr/include -Wno-compare-distinct-pointer-types -DENABLE_ATOMICS_TESTS -O2 --target=bpf + -c progs/pyperf180.c -mcpu=v3 -o /home/yhs/work/bpf-next/tools/testing/selftests/bpf/pyperf180.bpf.o + 1. parser at end of file + 2. Code generation + ... + +The compilation failure only happens to cpu=v2 and cpu=v3. cpu=v4 is okay +since cpu=v4 supports 32-bit branch target offset. + +The above failure is due to upstream llvm patch [1] where some inlining behavior +are changed in clang18. + +To workaround the issue, previously all 180 loop iterations are fully unrolled. +The bpf macro __BPF_CPU_VERSION__ (implemented in clang18 recently) is used to avoid +unrolling changes if cpu=v4. If __BPF_CPU_VERSION__ is not available and the +compiler is clang18, the unrollng amount is unconditionally reduced. + + [1] https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e + +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Tested-by: Alan Maguire +Link: https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev +--- + tools/testing/selftests/bpf/progs/pyperf180.c | 22 +++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/tools/testing/selftests/bpf/progs/pyperf180.c b/tools/testing/selftests/bpf/progs/pyperf180.c +index c39f559d3100..42c4a8b62e36 100644 +--- a/tools/testing/selftests/bpf/progs/pyperf180.c ++++ b/tools/testing/selftests/bpf/progs/pyperf180.c +@@ -1,4 +1,26 @@ + // SPDX-License-Identifier: GPL-2.0 + // Copyright (c) 2019 Facebook + #define STACK_MAX_LEN 180 ++ ++/* llvm upstream commit at clang18 ++ * https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e ++ * changed inlining behavior and caused compilation failure as some branch ++ * target distance exceeded 16bit representation which is the maximum for ++ * cpu v1/v2/v3. Macro __BPF_CPU_VERSION__ is later implemented in clang18 ++ * to specify which cpu version is used for compilation. So a smaller ++ * unroll_count can be set if __BPF_CPU_VERSION__ is less than 4, which ++ * reduced some branch target distances and resolved the compilation failure. ++ * ++ * To capture the case where a developer/ci uses clang18 but the corresponding ++ * repo checkpoint does not have __BPF_CPU_VERSION__, a smaller unroll_count ++ * will be set as well to prevent potential compilation failures. ++ */ ++#ifdef __BPF_CPU_VERSION__ ++#if __BPF_CPU_VERSION__ < 4 ++#define UNROLL_COUNT 90 ++#endif ++#elif __clang_major__ == 18 ++#define UNROLL_COUNT 90 ++#endif ++ + #include "pyperf.h" +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch b/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch new file mode 100644 index 0000000000000..6497a6cc38c90 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch @@ -0,0 +1,41 @@ +From 42839864a62ee244ec280b09149b1cb439f681db Mon Sep 17 00:00:00 2001 +From: Manu Bretelle +Date: Fri, 27 Oct 2023 18:25:39 -0700 +Subject: [PATCH bpf-next] selftests/bpf: disable detection of llvm when + building bpftool + +The VMs in which we run the selftests do not have llvm installed. +We build selftests/bpftool in a host that have llvm. +bpftool currently will use llvm first and fallback to libbfd but there +is no way to disable detection from the command line. + +Removing it from the feature detection should force us to use libbfd. + +Signed-off-by: Manu Bretelle +--- + tools/bpf/bpftool/Makefile | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile +index e9154ace80ff..01314458e25e 100644 +--- a/tools/bpf/bpftool/Makefile ++++ b/tools/bpf/bpftool/Makefile +@@ -95,7 +95,6 @@ RM ?= rm -f + FEATURE_USER = .bpftool + + FEATURE_TESTS := clang-bpf-co-re +-FEATURE_TESTS += llvm + FEATURE_TESTS += libcap + FEATURE_TESTS += libbfd + FEATURE_TESTS += libbfd-liberty +@@ -104,7 +103,6 @@ FEATURE_TESTS += disassembler-four-args + FEATURE_TESTS += disassembler-init-styled + + FEATURE_DISPLAY := clang-bpf-co-re +-FEATURE_DISPLAY += llvm + FEATURE_DISPLAY += libcap + FEATURE_DISPLAY += libbfd + FEATURE_DISPLAY += libbfd-liberty +-- +2.39.3 + diff --git a/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch b/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch new file mode 100644 index 0000000000000..3fa007c51db68 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch @@ -0,0 +1,32 @@ +From 0daad0a615e687e1247230f3d0c31ae60ba32314 Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Tue, 28 May 2024 15:29:38 -0700 +Subject: [PATCH bpf-next] selftests/bpf: fix inet_csk_accept prototype in + test_sk_storage_tracing.c + +Recent kernel change ([0]) changed inet_csk_accept() prototype. Adapt +progs/test_sk_storage_tracing.c to take that into account. + + [0] 92ef0fd55ac8 ("net: change proto and proto_ops accept type") + +Signed-off-by: Andrii Nakryiko +--- + tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +index 02e718f06e0f..40531e56776e 100644 +--- a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c ++++ b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +@@ -84,7 +84,7 @@ int BPF_PROG(trace_tcp_connect, struct sock *sk) + } + + SEC("fexit/inet_csk_accept") +-int BPF_PROG(inet_csk_accept, struct sock *sk, int flags, int *err, bool kern, ++int BPF_PROG(inet_csk_accept, struct sock *sk, struct proto_accept_arg *arg, + struct sock *accepted_sk) + { + set_task_info(accepted_sk); +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch new file mode 100644 index 0000000000000..ec1e29a8ab974 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch @@ -0,0 +1,31 @@ +From d31a7125891994681503770cff46a119692fb2b9 Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Mon, 11 Dec 2023 17:09:38 -0800 +Subject: [PATCH 1/1] selftests/bpf: work around latest Clang smartness + +Work around the issue while we deal with it in the Clang itself. +See [0]. + + [0] https://github.com/llvm/llvm-project/pull/73662#issuecomment-1849281758 + +Signed-off-by: Andrii Nakryiko +--- + tools/testing/selftests/bpf/progs/iters.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c +index 3aca3dc145b5..929ba6fa2105 100644 +--- a/tools/testing/selftests/bpf/progs/iters.c ++++ b/tools/testing/selftests/bpf/progs/iters.c +@@ -1420,7 +1420,7 @@ SEC("raw_tp") + __success + int iter_arr_with_actual_elem_count(const void *ctx) + { +- int i, n = loop_data.n, sum = 0; ++ unsigned i, n = loop_data.n, sum = 0; + + if (n > ARRAY_SIZE(loop_data.data)) + return 0; +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch new file mode 100644 index 0000000000000..e631fac0cc698 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch @@ -0,0 +1,89 @@ +From fe69a1b1b6ed9ffc2c578c63f526026a8ab74f0c Mon Sep 17 00:00:00 2001 +From: Anders Roxell +Date: Thu, 9 Nov 2023 18:43:28 +0100 +Subject: [PATCH] selftests: bpf: xskxceiver: ksft_print_msg: fix format type + error + +Crossbuilding selftests/bpf for architecture arm64, format specifies +type error show up like. + +xskxceiver.c:912:34: error: format specifies type 'int' but the argument +has type '__u64' (aka 'unsigned long long') [-Werror,-Wformat] + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", + ~~ + %llu + __func__, pkt->pkt_nb, meta->count); + ^~~~~~~~~~~ +xskxceiver.c:929:55: error: format specifies type 'unsigned long long' but + the argument has type 'u64' (aka 'unsigned long') [-Werror,-Wformat] + ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); + ~~~~ ^~~~ + +Fixing the issues by casting to (unsigned long long) and changing the +specifiers to be %llu from %d and %u, since with u64s it might be %llx +or %lx, depending on architecture. + +Signed-off-by: Anders Roxell +Link: https://lore.kernel.org/r/20231109174328.1774571-1-anders.roxell@linaro.org +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/xskxceiver.c | 19 ++++++++++++------- + 1 file changed, 12 insertions(+), 7 deletions(-) + +diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c +index 591ca9637b23..b604c570309a 100644 +--- a/tools/testing/selftests/bpf/xskxceiver.c ++++ b/tools/testing/selftests/bpf/xskxceiver.c +@@ -908,8 +908,9 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) + struct xdp_info *meta = data - sizeof(struct xdp_info); + + if (meta->count != pkt->pkt_nb) { +- ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", +- __func__, pkt->pkt_nb, meta->count); ++ ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", ++ __func__, pkt->pkt_nb, ++ (unsigned long long)meta->count); + return false; + } + +@@ -926,11 +927,13 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp + + if (addr >= umem->num_frames * umem->frame_size || + addr + len > umem->num_frames * umem->frame_size) { +- ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); ++ ksft_print_msg("Frag invalid addr: %llx len: %u\n", ++ (unsigned long long)addr, len); + return false; + } + if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { +- ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len); ++ ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", ++ (unsigned long long)addr, len); + return false; + } + +@@ -1029,7 +1032,8 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) + u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); + + ksft_print_msg("[%s] Too many packets completed\n", __func__); +- ksft_print_msg("Last completion address: %llx\n", addr); ++ ksft_print_msg("Last completion address: %llx\n", ++ (unsigned long long)addr); + return TEST_FAILURE; + } + +@@ -1513,8 +1517,9 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject) + } + + if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { +- ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", +- __func__, stats.tx_invalid_descs, ++ ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", ++ __func__, ++ (unsigned long long)stats.tx_invalid_descs, + ifobject->xsk->pkt_stream->nb_pkts); + return TEST_FAILURE; + } +-- +2.34.1 + diff --git a/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch b/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch new file mode 100644 index 0000000000000..19d269de7e8ca --- /dev/null +++ b/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch @@ -0,0 +1,142 @@ +From 3772e6cdb51f21a11df2acf6aa431cc8b9137bfb Mon Sep 17 00:00:00 2001 +From: Viktor Malik +Date: Tue, 6 Feb 2024 13:46:09 +0100 +Subject: [PATCH 1/2] tools/resolve_btfids: Refactor set sorting with types + from btf_ids.h + +Instead of using magic offsets to access BTF ID set data, leverage types +from btf_ids.h (btf_id_set and btf_id_set8) which define the actual +layout of the data. Thanks to this change, set sorting should also +continue working if the layout changes. + +This requires to sync the definition of 'struct btf_id_set8' from +include/linux/btf_ids.h to tools/include/linux/btf_ids.h. We don't sync +the rest of the file at the moment, b/c that would require to also sync +multiple dependent headers and we don't need any other defs from +btf_ids.h. + +Signed-off-by: Viktor Malik +Signed-off-by: Andrii Nakryiko +Acked-by: Daniel Xu +Link: https://lore.kernel.org/bpf/ff7f062ddf6a00815fda3087957c4ce667f50532.1707223196.git.vmalik@redhat.com +--- + tools/bpf/resolve_btfids/main.c | 35 ++++++++++++++++++++------------- + tools/include/linux/btf_ids.h | 9 +++++++++ + 2 files changed, 30 insertions(+), 14 deletions(-) + +diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c +index 27a23196d58e..32634f00abba 100644 +--- a/tools/bpf/resolve_btfids/main.c ++++ b/tools/bpf/resolve_btfids/main.c +@@ -70,6 +70,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -78,7 +79,7 @@ + #include + + #define BTF_IDS_SECTION ".BTF_ids" +-#define BTF_ID "__BTF_ID__" ++#define BTF_ID_PREFIX "__BTF_ID__" + + #define BTF_STRUCT "struct" + #define BTF_UNION "union" +@@ -161,7 +162,7 @@ static int eprintf(int level, int var, const char *fmt, ...) + + static bool is_btf_id(const char *name) + { +- return name && !strncmp(name, BTF_ID, sizeof(BTF_ID) - 1); ++ return name && !strncmp(name, BTF_ID_PREFIX, sizeof(BTF_ID_PREFIX) - 1); + } + + static struct btf_id *btf_id__find(struct rb_root *root, const char *name) +@@ -441,7 +442,7 @@ static int symbols_collect(struct object *obj) + * __BTF_ID__TYPE__vfs_truncate__0 + * prefix = ^ + */ +- prefix = name + sizeof(BTF_ID) - 1; ++ prefix = name + sizeof(BTF_ID_PREFIX) - 1; + + /* struct */ + if (!strncmp(prefix, BTF_STRUCT, sizeof(BTF_STRUCT) - 1)) { +@@ -649,19 +650,18 @@ static int cmp_id(const void *pa, const void *pb) + static int sets_patch(struct object *obj) + { + Elf_Data *data = obj->efile.idlist; +- int *ptr = data->d_buf; + struct rb_node *next; + + next = rb_first(&obj->sets); + while (next) { +- unsigned long addr, idx; ++ struct btf_id_set8 *set8; ++ struct btf_id_set *set; ++ unsigned long addr, off; + struct btf_id *id; +- int *base; +- int cnt; + + id = rb_entry(next, struct btf_id, rb_node); + addr = id->addr[0]; +- idx = addr - obj->efile.idlist_addr; ++ off = addr - obj->efile.idlist_addr; + + /* sets are unique */ + if (id->addr_cnt != 1) { +@@ -670,14 +670,21 @@ static int sets_patch(struct object *obj) + return -1; + } + +- idx = idx / sizeof(int); +- base = &ptr[idx] + (id->is_set8 ? 2 : 1); +- cnt = ptr[idx]; ++ if (id->is_set) { ++ set = data->d_buf + off; ++ qsort(set->ids, set->cnt, sizeof(set->ids[0]), cmp_id); ++ } else { ++ set8 = data->d_buf + off; ++ /* ++ * Make sure id is at the beginning of the pairs ++ * struct, otherwise the below qsort would not work. ++ */ ++ BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); ++ qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); ++ } + + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", +- (idx + 1) * sizeof(int), cnt, id->name); +- +- qsort(base, cnt, id->is_set8 ? sizeof(uint64_t) : sizeof(int), cmp_id); ++ off, id->is_set ? set->cnt : set8->cnt, id->name); + + next = rb_next(next); + } +diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h +index 2f882d5cb30f..72535f00572f 100644 +--- a/tools/include/linux/btf_ids.h ++++ b/tools/include/linux/btf_ids.h +@@ -8,6 +8,15 @@ struct btf_id_set { + u32 ids[]; + }; + ++struct btf_id_set8 { ++ u32 cnt; ++ u32 flags; ++ struct { ++ u32 id; ++ u32 flags; ++ } pairs[]; ++}; ++ + #ifdef CONFIG_DEBUG_INFO_BTF + + #include /* for __PASTE */ +-- +2.39.3 + + + diff --git a/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch b/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch new file mode 100644 index 0000000000000..24ebc231056cb --- /dev/null +++ b/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch @@ -0,0 +1,65 @@ +From 08969a676d234a178ff9f8c67936a2ad98a741eb Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Fri, 27 Oct 2023 16:22:24 -0700 +Subject: [PATCH] tracing/kprobes: Fix symbol counting logic by looking at + modules as well + +Recent changes to count number of matching symbols when creating +a kprobe event failed to take into account kernel modules. As such, it +breaks kprobes on kernel module symbols, by assuming there is no match. + +Fix this my calling module_kallsyms_on_each_symbol() in addition to +kallsyms_on_each_match_symbol() to perform a proper counting. + +Cc: Francis Laniel +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Steven Rostedt +Fixes: b022f0c7e404 ("tracing/kprobes: Return EADDRNOTAVAIL when func matches several symbols") +Signed-off-by: Andrii Nakryiko +--- + kernel/trace/trace_kprobe.c | 24 ++++++++++++++++++++---- + 1 file changed, 20 insertions(+), 4 deletions(-) + +diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c +index effcaede4759..1efb27f35963 100644 +--- a/kernel/trace/trace_kprobe.c ++++ b/kernel/trace/trace_kprobe.c +@@ -714,14 +714,30 @@ static int count_symbols(void *data, unsigned long unused) + return 0; + } + ++struct sym_count_ctx { ++ unsigned int count; ++ const char *name; ++}; ++ ++static int count_mod_symbols(void *data, const char *name, unsigned long unused) ++{ ++ struct sym_count_ctx *ctx = data; ++ ++ if (strcmp(name, ctx->name) == 0) ++ ctx->count++; ++ ++ return 0; ++} ++ + static unsigned int number_of_same_symbols(char *func_name) + { +- unsigned int count; ++ struct sym_count_ctx ctx = { .count = 0, .name = func_name }; ++ ++ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + +- count = 0; +- kallsyms_on_each_match_symbol(count_symbols, func_name, &count); ++ module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx); + +- return count; ++ return ctx.count; + } + + static int __trace_kprobe_create(int argc, const char *argv[]) +-- +2.34.1 + diff --git a/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch b/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch new file mode 100644 index 0000000000000..c4d67693bd132 --- /dev/null +++ b/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch @@ -0,0 +1,117 @@ +From c3dcadfdf2bf8f01471066700c098b5185240df6 Mon Sep 17 00:00:00 2001 +From: Viktor Malik +Date: Tue, 6 Feb 2024 13:46:10 +0100 +Subject: [PATCH 2/2] tools/resolve_btfids: Fix cross-compilation to non-host + endianness + +The .BTF_ids section is pre-filled with zeroed BTF ID entries during the +build and afterwards patched by resolve_btfids with correct values. +Since resolve_btfids always writes in host-native endianness, it relies +on libelf to do the translation when the target ELF is cross-compiled to +a different endianness (this was introduced in commit 61e8aeda9398 +("bpf: Fix libelf endian handling in resolv_btfids")). + +Unfortunately, the translation will corrupt the flags fields of SET8 +entries because these were written during vmlinux compilation and are in +the correct endianness already. This will lead to numerous selftests +failures such as: + + $ sudo ./test_verifier 502 502 + #502/p sleepable fentry accept FAIL + Failed to load prog 'Invalid argument'! + bpf_fentry_test1 is not sleepable + verification time 34 usec + stack depth 0 + processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 + Summary: 0 PASSED, 0 SKIPPED, 1 FAILED + +Since it's not possible to instruct libelf to translate just certain +values, let's manually bswap the flags (both global and entry flags) in +resolve_btfids when needed, so that libelf then translates everything +correctly. + +Fixes: ef2c6f370a63 ("tools/resolve_btfids: Add support for 8-byte BTF sets") +Signed-off-by: Viktor Malik +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/7b6bff690919555574ce0f13d2a5996cacf7bf69.1707223196.git.vmalik@redhat.com +--- + tools/bpf/resolve_btfids/main.c | 35 +++++++++++++++++++++++++++++++++ + 1 file changed, 35 insertions(+) + +diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c +index 32634f00abba..d9520cb826b3 100644 +--- a/tools/bpf/resolve_btfids/main.c ++++ b/tools/bpf/resolve_btfids/main.c +@@ -90,6 +90,14 @@ + + #define ADDR_CNT 100 + ++#if __BYTE_ORDER == __LITTLE_ENDIAN ++# define ELFDATANATIVE ELFDATA2LSB ++#elif __BYTE_ORDER == __BIG_ENDIAN ++# define ELFDATANATIVE ELFDATA2MSB ++#else ++# error "Unknown machine endianness!" ++#endif ++ + struct btf_id { + struct rb_node rb_node; + char *name; +@@ -117,6 +125,7 @@ struct object { + int idlist_shndx; + size_t strtabidx; + unsigned long idlist_addr; ++ int encoding; + } efile; + + struct rb_root sets; +@@ -320,6 +329,7 @@ static int elf_collect(struct object *obj) + { + Elf_Scn *scn = NULL; + size_t shdrstrndx; ++ GElf_Ehdr ehdr; + int idx = 0; + Elf *elf; + int fd; +@@ -351,6 +361,13 @@ static int elf_collect(struct object *obj) + return -1; + } + ++ if (gelf_getehdr(obj->efile.elf, &ehdr) == NULL) { ++ pr_err("FAILED cannot get ELF header: %s\n", ++ elf_errmsg(-1)); ++ return -1; ++ } ++ obj->efile.encoding = ehdr.e_ident[EI_DATA]; ++ + /* + * Scan all the elf sections and look for save data + * from .BTF_ids section and symbols. +@@ -681,6 +698,24 @@ static int sets_patch(struct object *obj) + */ + BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); + qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); ++ ++ /* ++ * When ELF endianness does not match endianness of the ++ * host, libelf will do the translation when updating ++ * the ELF. This, however, corrupts SET8 flags which are ++ * already in the target endianness. So, let's bswap ++ * them to the host endianness and libelf will then ++ * correctly translate everything. ++ */ ++ if (obj->efile.encoding != ELFDATANATIVE) { ++ int i; ++ ++ set8->flags = bswap_32(set8->flags); ++ for (i = 0; i < set8->cnt; i++) { ++ set8->pairs[i].flags = ++ bswap_32(set8->pairs[i].flags); ++ } ++ } + } + + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", +-- +2.39.3 + diff --git a/ci/diffs/0099-s390x_nolockdep.diff b/ci/diffs/0099-s390x_nolockdep.diff new file mode 100644 index 0000000000000..44c2d1a520656 --- /dev/null +++ b/ci/diffs/0099-s390x_nolockdep.diff @@ -0,0 +1,48 @@ +From 470d0c7874ac638ea62cddc3a20ec047fa4ab539 Mon Sep 17 00:00:00 2001 +From: Manu Bretelle +Date: Wed, 14 Feb 2024 17:25:35 -0800 +Subject: [PATCH] bpf/selftests: disable lockdep on s390x + +Tests are slow to run on s390x, this should make them faster. + +Signed-off-by: Manu Bretelle +--- + tools/testing/selftests/bpf/config.s390x | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x +index 706931a8c2c69..67bfd62b0b582 100644 +--- a/tools/testing/selftests/bpf/config.s390x ++++ b/tools/testing/selftests/bpf/config.s390x +@@ -23,11 +23,11 @@ CONFIG_CPUSETS=y + CONFIG_CRASH_DUMP=y + CONFIG_CRYPTO_USER_API_RNG=y + CONFIG_CRYPTO_USER_API_SKCIPHER=y +-CONFIG_DEBUG_ATOMIC_SLEEP=y ++CONFIG_DEBUG_ATOMIC_SLEEP=n + CONFIG_DEBUG_INFO_BTF=y + CONFIG_DEBUG_INFO_DWARF4=y + CONFIG_DEBUG_LIST=y +-CONFIG_DEBUG_LOCKDEP=y ++CONFIG_DEBUG_LOCKDEP=n + CONFIG_DEBUG_NOTIFIERS=y + CONFIG_DEBUG_PAGEALLOC=y + CONFIG_DEBUG_SECTION_MISMATCH=y +@@ -71,7 +71,7 @@ CONFIG_KRETPROBES=y + CONFIG_KSM=y + CONFIG_LATENCYTOP=y + CONFIG_LIVEPATCH=y +-CONFIG_LOCK_STAT=y ++CONFIG_LOCK_STAT=n + CONFIG_MACVLAN=y + CONFIG_MACVTAP=y + CONFIG_MAGIC_SYSRQ=y +@@ -101,7 +101,7 @@ CONFIG_PCI=y + CONFIG_POSIX_MQUEUE=y + CONFIG_PROC_KCORE=y + CONFIG_PROFILING=y +-CONFIG_PROVE_LOCKING=y ++CONFIG_PROVE_LOCKING=n + CONFIG_PTDUMP_DEBUGFS=y + CONFIG_RC_DEVICES=y + CONFIG_RC_LOOPBACK=y diff --git a/ci/diffs/0099-scripts-sorttable-Fix-endianness-handling-in-build-t.patch b/ci/diffs/0099-scripts-sorttable-Fix-endianness-handling-in-build-t.patch new file mode 100644 index 0000000000000..9702960f2b4c3 --- /dev/null +++ b/ci/diffs/0099-scripts-sorttable-Fix-endianness-handling-in-build-t.patch @@ -0,0 +1,41 @@ +From 5f799e4752d441263ae6656062e2fca98f51c6cb Mon Sep 17 00:00:00 2001 +From: Vasily Gorbik +Date: Wed, 2 Apr 2025 03:15:35 +0200 +Subject: [PATCH] scripts/sorttable: Fix endianness handling in build-time + mcount sort + +Kernel cross-compilation with BUILDTIME_MCOUNT_SORT produces zeroed +mcount values if the build-host endianness does not match the ELF +file endianness. + +The mcount values array is converted from ELF file +endianness to build-host endianness during initialization in +fill_relocs()/fill_addrs(). Avoid extra conversion of these values during +weak-function zeroing; otherwise, they do not match nm-parsed addresses +and all mcount values are zeroed out. + +Fixes: ef378c3b8233 ("scripts/sorttable: Zero out weak functions in mcount_loc table") +Reported-by: Ilya Leoshkevich +Reported-by: Ihor Solodrai +Closes: https://lore.kernel.org/all/your-ad-here.call-01743522822-ext-4975@work.hours/ +Signed-off-by: Vasily Gorbik +--- + scripts/sorttable.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/scripts/sorttable.c b/scripts/sorttable.c +index 7b4b3714b1af..deed676bfe38 100644 +--- a/scripts/sorttable.c ++++ b/scripts/sorttable.c +@@ -857,7 +857,7 @@ static void *sort_mcount_loc(void *arg) + for (void *ptr = vals; ptr < vals + size; ptr += long_size) { + uint64_t key; + +- key = long_size == 4 ? r((uint32_t *)ptr) : r8((uint64_t *)ptr); ++ key = long_size == 4 ? *(uint32_t *)ptr : *(uint64_t *)ptr; + if (!find_func(key)) { + if (long_size == 4) + *(uint32_t *)ptr = 0; +-- +2.49.0 + diff --git a/ci/diffs/0099-selftest-cross-compile.diff b/ci/diffs/0099-selftest-cross-compile.diff new file mode 100644 index 0000000000000..e8732596bdb3f --- /dev/null +++ b/ci/diffs/0099-selftest-cross-compile.diff @@ -0,0 +1,13 @@ +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index a38a3001527c..af68528cc944 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -304,7 +304,7 @@ $(OUTPUT)/test_maps: $(TESTING_HELPERS) + $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) + $(OUTPUT)/xsk.o: $(BPFOBJ) + +-BPFTOOL ?= $(DEFAULT_BPFTOOL) ++BPFTOOL ?= $(TRUNNER_BPFTOOL) + $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ + $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ diff --git a/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch b/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch new file mode 100644 index 0000000000000..b81d22a35322c --- /dev/null +++ b/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch @@ -0,0 +1,46 @@ +From 0d24852bd71ec85ca0016b6d6fc997e6a3381552 Mon Sep 17 00:00:00 2001 +From: Omar Sandoval +Date: Mon, 30 Sep 2024 11:55:00 -0700 +Subject: [PATCH] iov_iter: fix advancing slot in iter_folioq_get_pages() + +iter_folioq_get_pages() decides to advance to the next folioq slot when +it has reached the end of the current folio. However, it is checking +offset, which is the beginning of the current part, instead of +iov_offset, which is adjusted to the end of the current part, so it +doesn't advance the slot when it's supposed to. As a result, on the next +iteration, we'll use the same folio with an out-of-bounds offset and +return an unrelated page. + +This manifested as various crashes and other failures in 9pfs in drgn's +VM testing setup and BPF CI. + +Fixes: db0aa2e9566f ("mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios") +Link: https://lore.kernel.org/linux-fsdevel/20240923183432.1876750-1-chantr4@gmail.com/ +Tested-by: Manu Bretelle +Signed-off-by: Omar Sandoval +Link: https://lore.kernel.org/r/cbaf141ba6c0e2e209717d02746584072844841a.1727722269.git.osandov@fb.com +Tested-by: Eduard Zingerman +Tested-by: Leon Romanovsky +Tested-by: Joey Gouly +Acked-by: David Howells +Signed-off-by: Christian Brauner +--- + lib/iov_iter.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/iov_iter.c b/lib/iov_iter.c +index 97003155b..1abb32c0d 100644 +--- a/lib/iov_iter.c ++++ b/lib/iov_iter.c +@@ -1033,7 +1033,7 @@ static ssize_t iter_folioq_get_pages(struct iov_iter *iter, + if (maxpages == 0 || extracted >= maxsize) + break; + +- if (offset >= fsize) { ++ if (iov_offset >= fsize) { + iov_offset = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { +-- +2.34.1 + diff --git a/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch b/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch new file mode 100644 index 0000000000000..11aa3626f5a81 --- /dev/null +++ b/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch @@ -0,0 +1,58 @@ +From affb32e4f056883f285f8535b766293b85752fb4 Mon Sep 17 00:00:00 2001 +From: Jiri Olsa +Date: Tue, 24 Sep 2024 13:07:30 +0200 +Subject: [PATCH] selftests/bpf: Fix uprobe consumer test + +With newly merged code the uprobe behaviour is slightly different +and affects uprobe consumer test. + +We no longer need to check if the uprobe object is still preserved +after removing last uretprobe, because it stays as long as there's +pending/installed uretprobe instance. + +This allows to run uretprobe consumers registered 'after' uprobe was +hit even if previous uretprobe got unregistered before being hit. + +The uprobe object will be now removed after the last uprobe ref is +released and in such case it's held by ri->uprobe (return instance) +which is released after the uretprobe is hit. + +Reported-by: Ihor Solodrai +Signed-off-by: Jiri Olsa +Signed-off-by: Daniel Borkmann +Tested-by: Ihor Solodrai +Closes: https://lore.kernel.org/bpf/w6U8Z9fdhjnkSp2UaFaV1fGqJXvfLEtDKEUyGDkwmoruDJ_AgF_c0FFhrkeKW18OqiP-05s9yDKiT6X-Ns-avN_ABf0dcUkXqbSJN1TQSXo=@pm.me/ +--- + .../testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 9 +-------- + 1 file changed, 1 insertion(+), 8 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +index 844f6fc8487b..c1ac813ff9ba 100644 +--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c ++++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +@@ -869,21 +869,14 @@ static void consumer_test(struct uprobe_multi_consumers *skel, + fmt = "prog 0/1: uprobe"; + } else { + /* +- * uprobe return is tricky ;-) +- * + * to trigger uretprobe consumer, the uretprobe needs to be installed, + * which means one of the 'return' uprobes was alive when probe was hit: + * + * idxs: 2/3 uprobe return in 'installed' mask +- * +- * in addition if 'after' state removes everything that was installed in +- * 'before' state, then uprobe kernel object goes away and return uprobe +- * is not installed and we won't hit it even if it's in 'after' state. + */ + unsigned long had_uretprobes = before & 0b1100; /* is uretprobe installed */ +- unsigned long probe_preserved = before & after; /* did uprobe go away */ + +- if (had_uretprobes && probe_preserved && test_bit(idx, after)) ++ if (had_uretprobes && test_bit(idx, after)) + val++; + fmt = "idx 2/3: uretprobe"; + } +-- +2.34.1 + diff --git a/ci/diffs/0399-selftests-sched_ext-fix-build-after-renames-in-sched.patch b/ci/diffs/0399-selftests-sched_ext-fix-build-after-renames-in-sched.patch new file mode 100644 index 0000000000000..ba37429396236 --- /dev/null +++ b/ci/diffs/0399-selftests-sched_ext-fix-build-after-renames-in-sched.patch @@ -0,0 +1,231 @@ +From 5565144e82b97c5d2082ab19866836dfe5b2e592 Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Thu, 21 Nov 2024 13:20:46 -0800 +Subject: [PATCH] selftests/sched_ext: fix build after renames in sched_ext API + +The selftests are falining to build on current tip of bpf-next and +sched_ext [1]. This has broken BPF CI [2] after merge from upstream. + +Use appropriate function names in the selftests according to the +recent changes in the sched_ext API [3]. + +[1] +https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=fc39fb56917bb3cb53e99560ca3612a84456ada2 +[2] https://github.com/kernel-patches/bpf/actions/runs/11959327258/job/33340923745 +[3] https://lore.kernel.org/all/20241109194853.580310-1-tj@kernel.org/ + +Signed-off-by: Ihor Solodrai +--- + .../testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c | 2 +- + .../selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- + .../selftests/sched_ext/enq_select_cpu_fails.bpf.c | 2 +- + tools/testing/selftests/sched_ext/exit.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/maximal.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c | 2 +- + .../selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c | 2 +- + .../testing/selftests/sched_ext/select_cpu_dispatch.bpf.c | 2 +- + .../selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c | 2 +- + .../selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c | 8 ++++---- + 12 files changed, 19 insertions(+), 19 deletions(-) + +diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +index 37d9bf6fb745..6f4c3f5a1c5d 100644 +--- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c ++++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +@@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, + * If we dispatch to a bogus DSQ that will fall back to the + * builtin global DSQ, we fail gracefully. + */ +- scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, ++ scx_bpf_dsq_insert_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } +diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +index dffc97d9cdf1..e4a55027778f 100644 +--- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c ++++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +@@ -17,8 +17,8 @@ s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, + + if (cpu >= 0) { + /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */ +- scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, +- p->scx.dsq_vtime, 0); ++ scx_bpf_dsq_insert_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, ++ p->scx.dsq_vtime, 0); + return cpu; + } + +diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +index 6a7db1502c29..6325bf76f47e 100644 +--- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c ++++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +@@ -45,7 +45,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) + + target = bpf_get_prandom_u32() % nr_cpus; + +- scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } + +diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +index 1efb50d61040..a7cf868d5e31 100644 +--- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c ++++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +@@ -31,7 +31,7 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, + /* Can only call from ops.select_cpu() */ + scx_bpf_select_cpu_dfl(p, 0, 0, &found); + +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + + SEC(".struct_ops.link") +diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c +index d75d4faf07f6..4bc36182d3ff 100644 +--- a/tools/testing/selftests/sched_ext/exit.bpf.c ++++ b/tools/testing/selftests/sched_ext/exit.bpf.c +@@ -33,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) + if (exit_point == EXIT_ENQUEUE) + EXIT_CLEANLY(); + +- scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); + } + + void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) +@@ -41,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) + if (exit_point == EXIT_DISPATCH) + EXIT_CLEANLY(); + +- scx_bpf_consume(DSQ_ID); ++ scx_bpf_dsq_move_to_local(DSQ_ID); + } + + void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) +diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c +index 4d4cd8d966db..4c005fa71810 100644 +--- a/tools/testing/selftests/sched_ext/maximal.bpf.c ++++ b/tools/testing/selftests/sched_ext/maximal.bpf.c +@@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + + void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) + { +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + + void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) +@@ -28,7 +28,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) + + void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) + { +- scx_bpf_consume(SCX_DSQ_GLOBAL); ++ scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); + } + + void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c +index f171ac470970..13d0f5be788d 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c +@@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, + } + scx_bpf_put_idle_cpumask(idle_mask); + +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + + SEC(".struct_ops.link") +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +index 9efdbb7da928..815f1d5d61ac 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +@@ -67,7 +67,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, + saw_local = true; + } + +- scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); + } + + s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +index 59bfc4f36167..4bb99699e920 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +@@ -29,7 +29,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, + cpu = prev_cpu; + + dispatch: +- scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, 0); + return cpu; + } + +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +index 3bbd5fcdfb18..2a75de11b2cf 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +@@ -18,7 +18,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p + s32 prev_cpu, u64 wake_flags) + { + /* Dispatching to a random DSQ should fail. */ +- scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, 0xcafef00d, SCX_SLICE_DFL, 0); + + return prev_cpu; + } +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +index 0fda57fe0ecf..99d075695c97 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +@@ -18,8 +18,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p + s32 prev_cpu, u64 wake_flags) + { + /* Dispatching twice in a row is disallowed. */ +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + return prev_cpu; + } +diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +index e6c67bcf5e6e..bfcb96cd4954 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +@@ -2,8 +2,8 @@ + /* + * A scheduler that validates that enqueue flags are properly stored and + * applied at dispatch time when a task is directly dispatched from +- * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and +- * making the test a very basic vtime scheduler. ++ * ops.select_cpu(). We validate this by using scx_bpf_dsq_insert_vtime(), ++ * and making the test a very basic vtime scheduler. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet +@@ -47,13 +47,13 @@ s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, + cpu = prev_cpu; + scx_bpf_test_and_clear_cpu_idle(cpu); + ddsp: +- scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); ++ scx_bpf_dsq_insert_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + return cpu; + } + + void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) + { +- if (scx_bpf_consume(VTIME_DSQ)) ++ if (scx_bpf_dsq_move_to_local(VTIME_DSQ)) + consumed = true; + } + +-- +2.47.0 + diff --git a/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch b/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch new file mode 100644 index 0000000000000..1f95c3c237cb9 --- /dev/null +++ b/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch @@ -0,0 +1,61 @@ +From 80a5958a52b86e39c1a1bf5f4702011c0cf6ab4f Mon Sep 17 00:00:00 2001 +From: Eduard Zingerman +Date: Mon, 2 Dec 2024 12:14:46 -0800 +Subject: [PATCH] samples/bpf: fix samples compilation + +Commit [0] breaks samples build. + +TODO: moar details here + +[0] 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") + +Signed-off-by: Eduard Zingerman +Signed-off-by: Ihor Solodrai +--- + samples/bpf/Makefile | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile +index bcf103a4c14f..ee10dbf1b471 100644 +--- a/samples/bpf/Makefile ++++ b/samples/bpf/Makefile +@@ -146,13 +146,14 @@ ifeq ($(ARCH), x86) + BPF_EXTRA_CFLAGS += -fcf-protection + endif + +-TPROGS_CFLAGS += -Wall -O2 +-TPROGS_CFLAGS += -Wmissing-prototypes +-TPROGS_CFLAGS += -Wstrict-prototypes +-TPROGS_CFLAGS += $(call try-run,\ ++COMMON_CFLAGS += -Wall -O2 ++COMMON_CFLAGS += -Wmissing-prototypes ++COMMON_CFLAGS += -Wstrict-prototypes ++COMMON_CFLAGS += $(call try-run,\ + printf "int main() { return 0; }" |\ + $(CC) -Werror -fsanitize=bounds -x c - -o "$$TMP",-fsanitize=bounds,) + ++TPROGS_CFLAGS += $(COMMON_CFLAGS) + TPROGS_CFLAGS += -I$(objtree)/usr/include + TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ + TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE) +@@ -229,7 +230,7 @@ clean: + + $(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT) + # Fix up variables inherited from Kbuild that tools/ build system won't like +- $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ ++ $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \ + LDFLAGS="$(TPROGS_LDFLAGS)" srctree=$(BPF_SAMPLES_PATH)/../../ \ + O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \ + $@ install_headers +@@ -305,7 +306,7 @@ $(obj)/$(TRACE_HELPERS): TPROGS_CFLAGS := $(TPROGS_CFLAGS) -D__must_check= + -include $(BPF_SAMPLES_PATH)/Makefile.target + + VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ +- $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ ++ $(abspath $(if $(objtree),$(objtree)/vmlinux)) \ + $(abspath ./vmlinux) + VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + +-- +2.47.0 + diff --git a/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch b/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch new file mode 100644 index 0000000000000..7e38b1a151a10 --- /dev/null +++ b/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch @@ -0,0 +1,46 @@ +From faf291ff4beaef8dedebd166f11f815cdee257dc Mon Sep 17 00:00:00 2001 +From: "Masami Hiramatsu (Google)" +Date: Wed, 29 Jan 2025 00:29:37 +0900 +Subject: [PATCH 2000/2001] s390: fgraph: Fix to remove + ftrace_test_recursion_trylock() + +Fix to remove ftrace_test_recursion_trylock() from ftrace_graph_func() +because commit d576aec24df9 ("fgraph: Get ftrace recursion lock in +function_graph_enter") has been moved it to function_graph_enter_regs() +already. + +Reported-by: Jiri Olsa +Closes: https://lore.kernel.org/all/Z5O0shrdgeExZ2kF@krava/ +Fixes: d576aec24df9 ("fgraph: Get ftrace recursion lock in function_graph_enter") +Signed-off-by: Masami Hiramatsu (Google) +Tested-by: Jiri Olsa +--- + arch/s390/kernel/ftrace.c | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c +index c0b2c97efefb..63ba6306632e 100644 +--- a/arch/s390/kernel/ftrace.c ++++ b/arch/s390/kernel/ftrace.c +@@ -266,18 +266,13 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) + { + unsigned long *parent = &arch_ftrace_regs(fregs)->regs.gprs[14]; +- int bit; + + if (unlikely(ftrace_graph_is_dead())) + return; + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; +- bit = ftrace_test_recursion_trylock(ip, *parent); +- if (bit < 0) +- return; + if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs)) + *parent = (unsigned long)&return_to_handler; +- ftrace_test_recursion_unlock(bit); + } + + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +-- +2.48.1 + diff --git a/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch b/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch new file mode 100644 index 0000000000000..66483206f448f --- /dev/null +++ b/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch @@ -0,0 +1,28 @@ +From 04fce0d606f59a62105729094013c4784492ec7b Mon Sep 17 00:00:00 2001 +From: "Masami Hiramatsu (Google)" +Date: Wed, 29 Jan 2025 00:29:48 +0900 +Subject: [PATCH 2001/2001] s390: tracing: Define ftrace_get_symaddr() for s390 + +Add ftrace_get_symaddr() for s390, which returns the symbol address +from ftrace's 'ip' parameter. + +Signed-off-by: Masami Hiramatsu (Google) +--- + arch/s390/include/asm/ftrace.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h +index a3b73a4f626e..185331e91f83 100644 +--- a/arch/s390/include/asm/ftrace.h ++++ b/arch/s390/include/asm/ftrace.h +@@ -51,6 +51,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) + { + return addr; + } ++#define ftrace_get_symaddr(fentry_ip) ((unsigned long)(fentry_ip)) + + #include + +-- +2.48.1 + diff --git a/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch b/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch new file mode 100644 index 0000000000000..9b24de70e2846 --- /dev/null +++ b/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch @@ -0,0 +1,75 @@ +From f44275e7155dc310d36516fc25be503da099781c Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Mon, 6 Jan 2025 20:17:31 +0000 +Subject: [PATCH] selftests/bpf: add -fno-strict-aliasing to BPF_CFLAGS + +Following the discussion at [1], set -fno-strict-aliasing flag for all +BPF object build rules. Remove now unnecessary -CFLAGS variables. + +[1] https://lore.kernel.org/bpf/20250106185447.951609-1-ihor.solodrai@pm.me/ + +CC: Jose E. Marchesi +Signed-off-by: Ihor Solodrai +Acked-by: Eduard Zingerman +Link: https://lore.kernel.org/r/20250106201728.1219791-1-ihor.solodrai@pm.me +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/Makefile | 28 +--------------------------- + 1 file changed, 1 insertion(+), 27 deletions(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index eb4d21651aa7..d5be2f94deef 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -54,21 +54,6 @@ PCAP_LIBS := $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) + LDLIBS += $(PCAP_LIBS) + CFLAGS += $(PCAP_CFLAGS) + +-# The following tests perform type punning and they may break strict +-# aliasing rules, which are exploited by both GCC and clang by default +-# while optimizing. This can lead to broken programs. +-progs/bind4_prog.c-CFLAGS := -fno-strict-aliasing +-progs/bind6_prog.c-CFLAGS := -fno-strict-aliasing +-progs/dynptr_fail.c-CFLAGS := -fno-strict-aliasing +-progs/linked_list_fail.c-CFLAGS := -fno-strict-aliasing +-progs/map_kptr_fail.c-CFLAGS := -fno-strict-aliasing +-progs/syscall.c-CFLAGS := -fno-strict-aliasing +-progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing +-progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing +-progs/timer_crash.c-CFLAGS := -fno-strict-aliasing +-progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing +-progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing +- + # Some utility functions use LLVM libraries + jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS) + +@@ -103,18 +88,6 @@ progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error + progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error + progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error + +-# The following tests do type-punning, via the __imm_insn macro, from +-# `struct bpf_insn' to long and then uses the value. This triggers an +-# "is used uninitialized" warning in GCC due to strict-aliasing +-# rules. +-progs/verifier_ref_tracking.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_unpriv.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_cgroup_storage.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_ld_ind.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_map_ret_val.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_spill_fill.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_subprog_precision.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_uninit.c-bpf_gcc-CFLAGS := -fno-strict-aliasing + endif + + ifneq ($(CLANG_CPUV4),) +@@ -474,6 +447,7 @@ CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) + BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) \ ++ -fno-strict-aliasing \ + -Wno-compare-distinct-pointer-types + # TODO: enable me -Wsign-compare + +-- +2.47.1 + diff --git a/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch b/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch new file mode 100644 index 0000000000000..127b2641dc223 --- /dev/null +++ b/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch @@ -0,0 +1,63 @@ +From bab18c7db44d3aa6c84450095451580922359c7a Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Tue, 7 Jan 2025 23:58:18 +0000 +Subject: [PATCH] selftests/bpf: add -std=gnu11 to BPF_CFLAGS and CFLAGS + +Latest versions of GCC BPF use C23 standard by default. This causes +compilation errors in vmlinux.h due to bool types declarations. + +Add -std=gnu11 to BPF_CFLAGS and CFLAGS. This aligns with the version +of the standard used when building the kernel currently [1]. + +For more details see the discussions at [2] and [3]. + +[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Makefile#n465 +[2] https://lore.kernel.org/bpf/EYcXjcKDCJY7Yb0GGtAAb7nLKPEvrgWdvWpuNzXm2qi6rYMZDixKv5KwfVVMBq17V55xyC-A1wIjrqG3aw-Imqudo9q9X7D7nLU2gWgbN0w=@pm.me/ +[3] https://lore.kernel.org/bpf/20250106202715.1232864-1-ihor.solodrai@pm.me/ + +CC: Jose E. Marchesi +Signed-off-by: Ihor Solodrai +Link: https://lore.kernel.org/r/20250107235813.2964472-1-ihor.solodrai@pm.me +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/Makefile | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index d5be2f94deef..ea9cee5de0f8 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -41,7 +41,7 @@ srctree := $(patsubst %/,%,$(dir $(srctree))) + srctree := $(patsubst %/,%,$(dir $(srctree))) + endif + +-CFLAGS += -g $(OPT_FLAGS) -rdynamic \ ++CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ + -Wall -Werror -fno-omit-frame-pointer \ + $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ + -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ +@@ -447,6 +447,7 @@ CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) + BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) \ ++ -std=gnu11 \ + -fno-strict-aliasing \ + -Wno-compare-distinct-pointer-types + # TODO: enable me -Wsign-compare +@@ -787,9 +788,12 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp + $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ + + # Make sure we are able to include and link libbpf against c++. ++CXXFLAGS += $(CFLAGS) ++CXXFLAGS := $(subst -D_GNU_SOURCE=,,$(CXXFLAGS)) ++CXXFLAGS := $(subst -std=gnu11,-std=gnu++11,$(CXXFLAGS)) + $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) + $(call msg,CXX,,$@) +- $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ ++ $(Q)$(CXX) $(CXXFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + + # Benchmark runner + $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) +-- +2.47.1 + diff --git a/ci/diffs/3001-selftests-bpf-Fix-stdout-race-condition-in-traffic-m.patch b/ci/diffs/3001-selftests-bpf-Fix-stdout-race-condition-in-traffic-m.patch new file mode 100644 index 0000000000000..1c078304d58b3 --- /dev/null +++ b/ci/diffs/3001-selftests-bpf-Fix-stdout-race-condition-in-traffic-m.patch @@ -0,0 +1,112 @@ +From b99f27e90268b1a814c13f8bd72ea1db448ea257 Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Thu, 13 Feb 2025 15:32:17 -0800 +Subject: [PATCH] selftests/bpf: Fix stdout race condition in traffic monitor + +Fix a race condition between the main test_progs thread and the traffic +monitoring thread. The traffic monitor thread tries to print a line +using multiple printf and use flockfile() to prevent the line from being +torn apart. Meanwhile, the main thread doing io redirection can reassign +or close stdout when going through tests. A deadlock as shown below can +happen. + + main traffic_monitor_thread + ==== ====================== + show_transport() + -> flockfile(stdout) + +stdio_hijack_init() +-> stdout = open_memstream(log_buf, log_cnt); + ... + env.subtest_state->stdout_saved = stdout; + + ... + funlockfile(stdout) +stdio_restore_cleanup() +-> fclose(env.subtest_state->stdout_saved); + +After the traffic monitor thread lock stdout, A new memstream can be +assigned to stdout by the main thread. Therefore, the traffic monitor +thread later will not be able to unlock the original stdout. As the +main thread tries to access the old stdout, it will hang indefinitely +as it is still locked by the traffic monitor thread. + +The deadlock can be reproduced by running test_progs repeatedly with +traffic monitor enabled: + +for ((i=1;i<=100;i++)); do + ./test_progs -a flow_dissector_skb* -m '*' +done + +Fix this by only calling printf once and remove flockfile()/funlockfile(). + +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Link: https://patch.msgid.link/20250213233217.553258-1-ameryhung@gmail.com +--- + tools/testing/selftests/bpf/network_helpers.c | 33 ++++++++----------- + 1 file changed, 13 insertions(+), 20 deletions(-) + +diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c +index a4252e000428..737a952dcf80 100644 +--- a/tools/testing/selftests/bpf/network_helpers.c ++++ b/tools/testing/selftests/bpf/network_helpers.c +@@ -788,12 +788,13 @@ static const char *pkt_type_str(u16 pkt_type) + return "Unknown"; + } + ++#define MAX_FLAGS_STRLEN 21 + /* Show the information of the transport layer in the packet */ + static void show_transport(const u_char *packet, u16 len, u32 ifindex, + const char *src_addr, const char *dst_addr, + u16 proto, bool ipv6, u8 pkt_type) + { +- char *ifname, _ifname[IF_NAMESIZE]; ++ char *ifname, _ifname[IF_NAMESIZE], flags[MAX_FLAGS_STRLEN] = ""; + const char *transport_str; + u16 src_port, dst_port; + struct udphdr *udp; +@@ -834,29 +835,21 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, + + /* TCP or UDP*/ + +- flockfile(stdout); ++ if (proto == IPPROTO_TCP) ++ snprintf(flags, MAX_FLAGS_STRLEN, "%s%s%s%s", ++ tcp->fin ? ", FIN" : "", ++ tcp->syn ? ", SYN" : "", ++ tcp->rst ? ", RST" : "", ++ tcp->ack ? ", ACK" : ""); ++ + if (ipv6) +- printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d", ++ printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", + ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len); ++ dst_addr, dst_port, transport_str, len, flags); + else +- printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d", ++ printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", + ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len); +- +- if (proto == IPPROTO_TCP) { +- if (tcp->fin) +- printf(", FIN"); +- if (tcp->syn) +- printf(", SYN"); +- if (tcp->rst) +- printf(", RST"); +- if (tcp->ack) +- printf(", ACK"); +- } +- +- printf("\n"); +- funlockfile(stdout); ++ dst_addr, dst_port, transport_str, len, flags); + } + + static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +-- +2.48.1 + diff --git a/ci/diffs/3002-selftests-bpf-Clean-up-call-sites-of-stdio_restore.patch b/ci/diffs/3002-selftests-bpf-Clean-up-call-sites-of-stdio_restore.patch new file mode 100644 index 0000000000000..3c0ad2dee6c0e --- /dev/null +++ b/ci/diffs/3002-selftests-bpf-Clean-up-call-sites-of-stdio_restore.patch @@ -0,0 +1,106 @@ +From aeb5bbb0253892f601702aee9fc00038824a1c59 Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Wed, 5 Mar 2025 10:20:55 -0800 +Subject: [PATCH] selftests/bpf: Clean up call sites of stdio_restore() + +reset_affinity() and save_ns() are only called in run_one_test(). There is +no need to call stdio_restore() in reset_affinity() and save_ns() if +stdio_restore() is moved right after a test finishes in run_one_test(). + +Also remove an unnecessary check of env.stdout_saved in crash_handler() +by moving env.stdout_saved assignment to the beginning of main(). + +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Acked-by: Eduard Zingerman +Link: https://patch.msgid.link/20250305182057.2802606-1-ameryhung@gmail.com +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/test_progs.c | 17 ++++++----------- + 1 file changed, 6 insertions(+), 11 deletions(-) + +diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c +index 0cb759632225..ab0f2fed3c58 100644 +--- a/tools/testing/selftests/bpf/test_progs.c ++++ b/tools/testing/selftests/bpf/test_progs.c +@@ -474,8 +474,6 @@ static void dump_test_log(const struct prog_test_def *test, + print_test_result(test, test_state); + } + +-static void stdio_restore(void); +- + /* A bunch of tests set custom affinity per-thread and/or per-process. Reset + * it after each test/sub-test. + */ +@@ -490,13 +488,11 @@ static void reset_affinity(void) + + err = sched_setaffinity(0, sizeof(cpuset), &cpuset); + if (err < 0) { +- stdio_restore(); + fprintf(stderr, "Failed to reset process affinity: %d!\n", err); + exit(EXIT_ERR_SETUP_INFRA); + } + err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + if (err < 0) { +- stdio_restore(); + fprintf(stderr, "Failed to reset thread affinity: %d!\n", err); + exit(EXIT_ERR_SETUP_INFRA); + } +@@ -514,7 +510,6 @@ static void save_netns(void) + static void restore_netns(void) + { + if (setns(env.saved_netns_fd, CLONE_NEWNET) == -1) { +- stdio_restore(); + perror("setns(CLONE_NEWNS)"); + exit(EXIT_ERR_SETUP_INFRA); + } +@@ -1270,8 +1265,7 @@ void crash_handler(int signum) + + sz = backtrace(bt, ARRAY_SIZE(bt)); + +- if (env.stdout_saved) +- stdio_restore(); ++ stdio_restore(); + if (env.test) { + env.test_state->error_cnt++; + dump_test_log(env.test, env.test_state, true, false, NULL); +@@ -1400,6 +1394,8 @@ static void run_one_test(int test_num) + + state->tested = true; + ++ stdio_restore(); ++ + if (verbose() && env.worker_id == -1) + print_test_result(test, state); + +@@ -1408,7 +1404,6 @@ static void run_one_test(int test_num) + if (test->need_cgroup_cleanup) + cleanup_cgroup_environment(); + +- stdio_restore(); + free(stop_libbpf_log_capture()); + + dump_test_log(test, state, false, false, NULL); +@@ -1943,6 +1938,9 @@ int main(int argc, char **argv) + + sigaction(SIGSEGV, &sigact, NULL); + ++ env.stdout_saved = stdout; ++ env.stderr_saved = stderr; ++ + env.secs_till_notify = 10; + env.secs_till_kill = 120; + err = argp_parse(&argp, argc, argv, 0, NULL, &env); +@@ -1969,9 +1967,6 @@ int main(int argc, char **argv) + return -1; + } + +- env.stdout_saved = stdout; +- env.stderr_saved = stderr; +- + env.has_testmod = true; + if (!env.list_test_names) { + /* ensure previous instance of the module is unloaded */ +-- +2.48.1 + diff --git a/ci/diffs/3003-selftests-bpf-Allow-assigning-traffic-monitor-print-.patch b/ci/diffs/3003-selftests-bpf-Allow-assigning-traffic-monitor-print-.patch new file mode 100644 index 0000000000000..746d1d6210b1f --- /dev/null +++ b/ci/diffs/3003-selftests-bpf-Allow-assigning-traffic-monitor-print-.patch @@ -0,0 +1,179 @@ +From 34a25aabcdea5c6e42a283381f8354d70592744a Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Wed, 5 Mar 2025 10:20:56 -0800 +Subject: [PATCH] selftests/bpf: Allow assigning traffic monitor print function + +Allow users to change traffic monitor's print function. If not provided, +traffic monitor will print to stdout by default. + +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Link: https://patch.msgid.link/20250305182057.2802606-2-ameryhung@gmail.com +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/network_helpers.c | 69 ++++++++++++++----- + tools/testing/selftests/bpf/network_helpers.h | 9 +++ + 2 files changed, 59 insertions(+), 19 deletions(-) + +diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c +index 737a952dcf80..97fb7caf95f4 100644 +--- a/tools/testing/selftests/bpf/network_helpers.c ++++ b/tools/testing/selftests/bpf/network_helpers.c +@@ -750,6 +750,36 @@ struct tmonitor_ctx { + int pcap_fd; + }; + ++static int __base_pr(const char *format, va_list args) ++{ ++ return vfprintf(stdout, format, args); ++} ++ ++static tm_print_fn_t __tm_pr = __base_pr; ++ ++tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn) ++{ ++ tm_print_fn_t old_print_fn; ++ ++ old_print_fn = __atomic_exchange_n(&__tm_pr, fn, __ATOMIC_RELAXED); ++ ++ return old_print_fn; ++} ++ ++void tm_print(const char *format, ...) ++{ ++ tm_print_fn_t print_fn; ++ va_list args; ++ ++ print_fn = __atomic_load_n(&__tm_pr, __ATOMIC_RELAXED); ++ if (!print_fn) ++ return; ++ ++ va_start(args, format); ++ print_fn(format, args); ++ va_end(args); ++} ++ + /* Is this packet captured with a Ethernet protocol type? */ + static bool is_ethernet(const u_char *packet) + { +@@ -767,7 +797,7 @@ static bool is_ethernet(const u_char *packet) + case 770: /* ARPHRD_FRAD */ + case 778: /* ARPHDR_IPGRE */ + case 803: /* ARPHRD_IEEE80211_RADIOTAP */ +- printf("Packet captured: arphdr_type=%d\n", arphdr_type); ++ tm_print("Packet captured: arphdr_type=%d\n", arphdr_type); + return false; + } + return true; +@@ -817,19 +847,19 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, + dst_port = ntohs(tcp->dest); + transport_str = "TCP"; + } else if (proto == IPPROTO_ICMP) { +- printf("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", +- ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, +- packet[0], packet[1]); ++ tm_print("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", ++ ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, ++ packet[0], packet[1]); + return; + } else if (proto == IPPROTO_ICMPV6) { +- printf("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", +- ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, +- packet[0], packet[1]); ++ tm_print("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", ++ ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, ++ packet[0], packet[1]); + return; + } else { +- printf("%-7s %-3s %s %s > %s: protocol %d\n", +- ifname, pkt_type_str(pkt_type), ipv6 ? "IPv6" : "IPv4", +- src_addr, dst_addr, proto); ++ tm_print("%-7s %-3s %s %s > %s: protocol %d\n", ++ ifname, pkt_type_str(pkt_type), ipv6 ? "IPv6" : "IPv4", ++ src_addr, dst_addr, proto); + return; + } + +@@ -843,13 +873,13 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, + tcp->ack ? ", ACK" : ""); + + if (ipv6) +- printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", +- ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len, flags); ++ tm_print("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", ++ ifname, pkt_type_str(pkt_type), src_addr, src_port, ++ dst_addr, dst_port, transport_str, len, flags); + else +- printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", +- ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len, flags); ++ tm_print("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", ++ ifname, pkt_type_str(pkt_type), src_addr, src_port, ++ dst_addr, dst_port, transport_str, len, flags); + } + + static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +@@ -964,8 +994,8 @@ static void *traffic_monitor_thread(void *arg) + ifname = _ifname; + } + +- printf("%-7s %-3s Unknown network protocol type 0x%x\n", +- ifname, pkt_type_str(ptype), proto); ++ tm_print("%-7s %-3s Unknown network protocol type 0x%x\n", ++ ifname, pkt_type_str(ptype), proto); + } + } + +@@ -1165,8 +1195,9 @@ void traffic_monitor_stop(struct tmonitor_ctx *ctx) + write(ctx->wake_fd, &w, sizeof(w)); + pthread_join(ctx->thread, NULL); + +- printf("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); ++ tm_print("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); + + traffic_monitor_release(ctx); + } ++ + #endif /* TRAFFIC_MONITOR */ +diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h +index 9f6e05d886c5..a4e20c12c57e 100644 +--- a/tools/testing/selftests/bpf/network_helpers.h ++++ b/tools/testing/selftests/bpf/network_helpers.h +@@ -17,6 +17,7 @@ typedef __u16 __sum16; + #include + #include + #include ++#include + + #define MAGIC_VAL 0x1234 + #define NUM_ITER 100000 +@@ -249,10 +250,13 @@ static inline __sum16 build_udp_v6_csum(const struct ipv6hdr *ip6h, + + struct tmonitor_ctx; + ++typedef int (*tm_print_fn_t)(const char *format, va_list args); ++ + #ifdef TRAFFIC_MONITOR + struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name); + void traffic_monitor_stop(struct tmonitor_ctx *ctx); ++tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn); + #else + static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name) +@@ -263,6 +267,11 @@ static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, cons + static inline void traffic_monitor_stop(struct tmonitor_ctx *ctx) + { + } ++ ++static inline tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn) ++{ ++ return NULL; ++} + #endif + + #endif +-- +2.48.1 + diff --git a/ci/diffs/3004-selftests-bpf-Fix-dangling-stdout-seen-by-traffic-mo.patch b/ci/diffs/3004-selftests-bpf-Fix-dangling-stdout-seen-by-traffic-mo.patch new file mode 100644 index 0000000000000..ad9f297ed14e9 --- /dev/null +++ b/ci/diffs/3004-selftests-bpf-Fix-dangling-stdout-seen-by-traffic-mo.patch @@ -0,0 +1,131 @@ +From 8bda5b787dea43a7102d5797756c60d0a905932a Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Wed, 5 Mar 2025 10:20:57 -0800 +Subject: [PATCH] selftests/bpf: Fix dangling stdout seen by traffic monitor + thread + +Traffic monitor thread may see dangling stdout as the main thread closes +and reassigns stdout without protection. This happens when the main thread +finishes one subtest and moves to another one in the same netns_new() +scope. + +The issue can be reproduced by running test_progs repeatedly with traffic +monitor enabled: + +for ((i=1;i<=100;i++)); do + ./test_progs -a flow_dissector_skb* -m '*' +done + +For restoring stdout in crash_handler(), since it does not really care +about closing stdout, simlpy flush stdout and restore it to the original +one. + +Then, Fix the issue by consolidating stdio_restore_cleanup() and +stdio_restore(), and protecting the use/close/assignment of stdout with +a lock. The locking in the main thread is always performed regradless of +whether traffic monitor is running or not for simplicity. It won't have +any side-effect. + +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Acked-by: Eduard Zingerman +Link: https://patch.msgid.link/20250305182057.2802606-3-ameryhung@gmail.com +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/test_progs.c | 39 +++++++++++++----------- + 1 file changed, 22 insertions(+), 17 deletions(-) + +diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c +index ab0f2fed3c58..d4ec9586b98c 100644 +--- a/tools/testing/selftests/bpf/test_progs.c ++++ b/tools/testing/selftests/bpf/test_progs.c +@@ -88,7 +88,9 @@ static void stdio_hijack(char **log_buf, size_t *log_cnt) + #endif + } + +-static void stdio_restore_cleanup(void) ++static pthread_mutex_t stdout_lock = PTHREAD_MUTEX_INITIALIZER; ++ ++static void stdio_restore(void) + { + #ifdef __GLIBC__ + if (verbose() && env.worker_id == -1) { +@@ -98,6 +100,8 @@ static void stdio_restore_cleanup(void) + + fflush(stdout); + ++ pthread_mutex_lock(&stdout_lock); ++ + if (env.subtest_state) { + fclose(env.subtest_state->stdout_saved); + env.subtest_state->stdout_saved = NULL; +@@ -106,26 +110,21 @@ static void stdio_restore_cleanup(void) + } else { + fclose(env.test_state->stdout_saved); + env.test_state->stdout_saved = NULL; ++ stdout = env.stdout_saved; ++ stderr = env.stderr_saved; + } ++ ++ pthread_mutex_unlock(&stdout_lock); + #endif + } + +-static void stdio_restore(void) ++static int traffic_monitor_print_fn(const char *format, va_list args) + { +-#ifdef __GLIBC__ +- if (verbose() && env.worker_id == -1) { +- /* nothing to do, output to stdout by default */ +- return; +- } +- +- if (stdout == env.stdout_saved) +- return; +- +- stdio_restore_cleanup(); ++ pthread_mutex_lock(&stdout_lock); ++ vfprintf(stdout, format, args); ++ pthread_mutex_unlock(&stdout_lock); + +- stdout = env.stdout_saved; +- stderr = env.stderr_saved; +-#endif ++ return 0; + } + + /* Adapted from perf/util/string.c */ +@@ -536,7 +535,8 @@ void test__end_subtest(void) + test_result(subtest_state->error_cnt, + subtest_state->skipped)); + +- stdio_restore_cleanup(); ++ stdio_restore(); ++ + env.subtest_state = NULL; + } + +@@ -1265,7 +1265,10 @@ void crash_handler(int signum) + + sz = backtrace(bt, ARRAY_SIZE(bt)); + +- stdio_restore(); ++ fflush(stdout); ++ stdout = env.stdout_saved; ++ stderr = env.stderr_saved; ++ + if (env.test) { + env.test_state->error_cnt++; + dump_test_log(env.test, env.test_state, true, false, NULL); +@@ -1957,6 +1960,8 @@ int main(int argc, char **argv) + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + libbpf_set_print(libbpf_print_fn); + ++ traffic_monitor_set_print(traffic_monitor_print_fn); ++ + srand(time(NULL)); + + env.jit_enabled = is_jit_enabled(); +-- +2.48.1 + diff --git a/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch b/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch new file mode 100644 index 0000000000000..31bc3fb4717ec --- /dev/null +++ b/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch @@ -0,0 +1,100 @@ +From a99b3ef16d4473f8f9b7547f89791d037112a7e6 Mon Sep 17 00:00:00 2001 +From: David Howells +Date: Wed, 12 Feb 2025 22:24:01 +0000 +Subject: [PATCH] netfs: Fix setting NETFS_RREQ_ALL_QUEUED to be after all + subreqs queued + +Due to the code that queues a subreq on the active subrequest list getting +moved to netfs_issue_read(), the NETFS_RREQ_ALL_QUEUED flag may now get set +before the list-add actually happens. This is not a problem if the +collection worker happens after the list-add, but it's a race - and, for +9P, where the read from the server is synchronous and done in the +submitting thread, this is a lot more likely. + +The result is that, if the timing is wrong, a ref gets leaked because the +collector thinks that all the subreqs have completed (because it can't see +the last one yet) and clears NETFS_RREQ_IN_PROGRESS - at which point, the +collection worker no longer goes into the collector. + +This can be provoked with AFS by injecting an msleep() right before the +final subreq is queued. + +Fix this by splitting the queuing part out of netfs_issue_read() into a new +function, netfs_queue_read(), and calling it separately. The setting of +NETFS_RREQ_ALL_QUEUED is then done by netfs_queue_read() whilst it is +holding the spinlock (that's probably unnecessary, but shouldn't hurt). + +It might be better to set a flag on the final subreq, but this could be a +problem if an error occurs and we can't queue it. + +Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") +Reported-by: Ihor Solodrai +Closes: https://lore.kernel.org/r/a7x33d4dnMdGTtRivptq6S1i8btK70SNBP2XyX_xwDAhLvgQoPox6FVBOkifq4eBinfFfbZlIkMZBe3QarlWTxoEtHZwJCZbNKtaqrR7PvI=@pm.me/ +Signed-off-by: David Howells +Tested-by: Ihor Solodrai +cc: Eric Van Hensbergen +cc: Latchesar Ionkov +cc: Dominique Martinet +cc: Christian Schoenebeck +cc: Marc Dionne +cc: Steve French +cc: Paulo Alcantara +cc: Jeff Layton +cc: v9fs@lists.linux.dev +cc: linux-cifs@vger.kernel.org +cc: netfs@lists.linux.dev +cc: linux-fsdevel@vger.kernel.org +--- + fs/netfs/buffered_read.c | 19 +++++++++++++------ + 1 file changed, 13 insertions(+), 6 deletions(-) + +diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c +index f761d44b3436..0d1b6d35ff3b 100644 +--- a/fs/netfs/buffered_read.c ++++ b/fs/netfs/buffered_read.c +@@ -155,8 +155,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, + netfs_cache_read_terminated, subreq); + } + +-static void netfs_issue_read(struct netfs_io_request *rreq, +- struct netfs_io_subrequest *subreq) ++static void netfs_queue_read(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq, ++ bool last_subreq) + { + struct netfs_io_stream *stream = &rreq->io_streams[0]; + +@@ -177,8 +178,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq, + } + } + ++ if (last_subreq) { ++ smp_wmb(); /* Write lists before ALL_QUEUED. */ ++ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); ++ } ++ + spin_unlock(&rreq->lock); ++} + ++static void netfs_issue_read(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq) ++{ + switch (subreq->source) { + case NETFS_DOWNLOAD_FROM_SERVER: + rreq->netfs_ops->issue_read(subreq); +@@ -293,11 +303,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) + } + size -= slice; + start += slice; +- if (size <= 0) { +- smp_wmb(); /* Write lists before ALL_QUEUED. */ +- set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); +- } + ++ netfs_queue_read(rreq, subreq, size <= 0); + netfs_issue_read(rreq, subreq); + cond_resched(); + } while (size > 0); +-- +2.48.1 + diff --git a/ci/diffs/9998-sched_ext-Fix-invalid-irq-restore-in-scx_ops_bypass.patch b/ci/diffs/9998-sched_ext-Fix-invalid-irq-restore-in-scx_ops_bypass.patch new file mode 100644 index 0000000000000..0f65cb47ce2e6 --- /dev/null +++ b/ci/diffs/9998-sched_ext-Fix-invalid-irq-restore-in-scx_ops_bypass.patch @@ -0,0 +1,56 @@ +From 10e1d78546b3dd4ea9d773c0b0257064a99211e9 Mon Sep 17 00:00:00 2001 +From: Tejun Heo +Date: Wed, 11 Dec 2024 11:01:51 -1000 +Subject: [PATCH] sched_ext: Fix invalid irq restore in scx_ops_bypass() + +While adding outer irqsave/restore locking, 0e7ffff1b811 ("scx: Fix raciness +in scx_ops_bypass()") forgot to convert an inner rq_unlock_irqrestore() to +rq_unlock() which could re-enable IRQ prematurely leading to the following +warning: + + raw_local_irq_restore() called with IRQs enabled + WARNING: CPU: 1 PID: 96 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x30/0x40 + ... + Sched_ext: create_dsq (enabling) + pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) + pc : warn_bogus_irq_restore+0x30/0x40 + lr : warn_bogus_irq_restore+0x30/0x40 + ... + Call trace: + warn_bogus_irq_restore+0x30/0x40 (P) + warn_bogus_irq_restore+0x30/0x40 (L) + scx_ops_bypass+0x224/0x3b8 + scx_ops_enable.isra.0+0x2c8/0xaa8 + bpf_scx_reg+0x18/0x30 + ... + irq event stamp: 33739 + hardirqs last enabled at (33739): [] scx_ops_bypass+0x174/0x3b8 + hardirqs last disabled at (33738): [] _raw_spin_lock_irqsave+0xb4/0xd8 + +Drop the stray _irqrestore(). + +Signed-off-by: Tejun Heo +Reported-by: Ihor Solodrai +Link: http://lkml.kernel.org/r/qC39k3UsonrBYD_SmuxHnZIQLsuuccoCrkiqb_BT7DvH945A1_LZwE4g-5Pu9FcCtqZt4lY1HhIPi0homRuNWxkgo1rgP3bkxa0donw8kV4=@pm.me +Fixes: 0e7ffff1b811 ("scx: Fix raciness in scx_ops_bypass()") +Cc: stable@vger.kernel.org # v6.12 +--- + kernel/sched/ext.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 7fff1d045477..98519e6d0dcd 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -4763,7 +4763,7 @@ static void scx_ops_bypass(bool bypass) + * sees scx_rq_bypassing() before moving tasks to SCX. + */ + if (!scx_enabled()) { +- rq_unlock_irqrestore(rq, &rf); ++ rq_unlock(rq, &rf); + continue; + } + +-- +2.47.1 + diff --git a/ci/diffs/9999-scx-Fix-maximal-BPF-selftest-prog.patch b/ci/diffs/9999-scx-Fix-maximal-BPF-selftest-prog.patch new file mode 100644 index 0000000000000..9b5e6d50778e1 --- /dev/null +++ b/ci/diffs/9999-scx-Fix-maximal-BPF-selftest-prog.patch @@ -0,0 +1,56 @@ +From 70414cacbe536a197d56f58a42f9563e5a01b8ec Mon Sep 17 00:00:00 2001 +From: David Vernet +Date: Mon, 9 Dec 2024 09:29:24 -0600 +Subject: [PATCH] scx: Fix maximal BPF selftest prog + +maximal.bpf.c is still dispatching to and consuming from SCX_DSQ_GLOBAL. +Let's have it use its own DSQ to avoid any runtime errors. + +Signed-off-by: David Vernet +--- + tools/testing/selftests/sched_ext/maximal.bpf.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c +index 4c005fa71810..430f5e13bf55 100644 +--- a/tools/testing/selftests/sched_ext/maximal.bpf.c ++++ b/tools/testing/selftests/sched_ext/maximal.bpf.c +@@ -12,6 +12,8 @@ + + char _license[] SEC("license") = "GPL"; + ++#define DSQ_ID 0 ++ + s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) + { +@@ -20,7 +22,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + + void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) + { +- scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); + } + + void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) +@@ -28,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) + + void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) + { +- scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); ++ scx_bpf_dsq_move_to_local(DSQ_ID); + } + + void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) +@@ -123,7 +125,7 @@ void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) + + s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) + { +- return 0; ++ return scx_bpf_create_dsq(DSQ_ID, -1); + } + + void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) +-- +2.47.1 + diff --git a/ci/vmtest/configs/DENYLIST b/ci/vmtest/configs/DENYLIST new file mode 100644 index 0000000000000..20a090295a607 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST @@ -0,0 +1,17 @@ +# TEMPORARY +btf_dump/btf_dump: syntax +kprobe_multi_bench_attach +core_reloc/enum64val +core_reloc/size___diff_sz +core_reloc/type_based___diff_sz +test_ima # All of CI is broken on it following 6.3-rc1 merge +lwt_reroute # crashes kernel after netnext merge from 2ab1efad60ad "net/sched: cls_api: complement tcf_tfilter_dump_policy" +tc_links_ingress # started failing after net-next merge from 2ab1efad60ad "net/sched: cls_api: complement tcf_tfilter_dump_policy" +xdp_bonding/xdp_bonding_features # started failing after net merge from 359e54a93ab4 "l2tp: pass correct message length to ip6_append_data" +tc_redirect/tc_redirect_dtime # uapi breakage after net-next commit 885c36e59f46 ("net: Re-use and set mono_delivery_time bit for userspace tstamp packets") +migrate_reuseport/IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation +migrate_reuseport/IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation +connect_force_port # unreliably fails +sockmap_ktls/sockmap_ktls disconnect_after_delete* # https://lore.kernel.org/bpf/20250415163332.1836826-1-ihor.solodrai@linux.dev/ +verif_scale_pyperf600 # llvm 20 generates code that fails verification +arena_spin_lock # llvm 20 generates code that fails verification diff --git a/ci/vmtest/configs/DENYLIST.aarch64 b/ci/vmtest/configs/DENYLIST.aarch64 new file mode 100644 index 0000000000000..bdce99f3855ec --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.aarch64 @@ -0,0 +1,5 @@ +cgrp_local_storage # libbpf: prog 'update_cookie_tracing': failed to attach: ERROR: strerror_r(-524)=22 +core_reloc_btfgen # run_core_reloc_tests:FAIL:run_btfgen unexpected error: 32512 (errno 22) +usdt/multispec # usdt_300_bad_attach unexpected pointer: 0x558c63d8f0 +xdp_bonding # whole test suite is very unstable on aarch64 +res_spin_lock_success # flaky diff --git a/ci/vmtest/configs/DENYLIST.rc b/ci/vmtest/configs/DENYLIST.rc new file mode 100644 index 0000000000000..8aa33e6b71443 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.rc @@ -0,0 +1,3 @@ +send_signal/send_signal_nmi # PMU events configure correctly but don't trigger NMI's for some reason (AMD nested virt) +send_signal/send_signal_nmi_thread # Same as above +token/obj_priv_implicit_token_envvar # Unknown root cause, but reliably fails diff --git a/ci/vmtest/configs/DENYLIST.s390x b/ci/vmtest/configs/DENYLIST.s390x new file mode 100644 index 0000000000000..9b90b615aea55 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.s390x @@ -0,0 +1,11 @@ +deny_namespace # not yet in bpf denylist +tc_redirect/tc_redirect_dtime # very flaky +lru_bug # not yet in bpf-next denylist +# Disabled temporarily for a crash. +# https://lore.kernel.org/bpf/c9923c1d-971d-4022-8dc8-1364e929d34c@gmail.com/ +dummy_st_ops/dummy_init_ptr_arg +fexit_bpf2bpf +tailcalls +trace_ext +xdp_bpf2bpf +xdp_metadata diff --git a/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc b/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc new file mode 100644 index 0000000000000..a3c745d1f5b52 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc @@ -0,0 +1,904 @@ +arena_htab +async_stack_depth +bad_struct_ops/invalid_prog_reuse +bpf_cookie +bpf_iter/bpf_hash_map +bpf_iter/ksym +bpf_iter/tcp4 +bpf_iter/tcp6 +bpf_iter/udp4 +bpf_iter/udp6 +bpf_iter/unix +bpf_iter_setsockopt +bpf_iter_setsockopt_unix +bpf_mod_race +bpf_nf/tc-bpf-ct +bpf_nf/xdp-ct +bpf_tcp_ca/cubic +btf_dump/btf_dump: bitfields +btf_dump/btf_dump: packing +btf_dump/btf_dump: padding +btf_dump/btf_dump: syntax +btf_map_in_map +cb_refs +cgroup_get_current_cgroup_id +cgroup_iter/cgroup_iter__self_only_css_task +cgroup_tcp_skb +cgrp_kfunc +cls_redirect/cls_redirect_dynptr +connect_force_port +core_autosize +core_read_macros +core_reloc/type_id +core_reloc/type_id___missing_targets +core_reloc_btfgen/type_id +core_reloc_btfgen/type_id___missing_targets +cpumask/test_acquire_wrong_cpumask +cpumask/test_alloc_double_release +cpumask/test_alloc_free_cpumask +cpumask/test_alloc_no_release +cpumask/test_and_or_xor +cpumask/test_copy_any_anyand +cpumask/test_cpumask_null +cpumask/test_cpumask_weight +cpumask/test_first_firstzero_cpu +cpumask/test_firstand_nocpu +cpumask/test_global_mask_array_l2_rcu +cpumask/test_global_mask_array_one_rcu +cpumask/test_global_mask_array_rcu +cpumask/test_global_mask_nested_deep_array_rcu +cpumask/test_global_mask_nested_deep_rcu +cpumask/test_global_mask_nested_rcu +cpumask/test_global_mask_no_null_check +cpumask/test_global_mask_out_of_rcu +cpumask/test_global_mask_rcu +cpumask/test_global_mask_rcu_no_null_check +cpumask/test_insert_leave +cpumask/test_insert_remove_no_release +cpumask/test_insert_remove_release +cpumask/test_intersects_subset +cpumask/test_invalid_nested_array +cpumask/test_mutate_cpumask +cpumask/test_set_clear_cpu +cpumask/test_setall_clear_cpu +cpumask/test_test_and_set_clear +crypto_basic/crypto_acquire +crypto_sanity +deny_namespace +dummy_st_ops/test_unsupported_field_sleepable +dynptr/add_dynptr_to_map1 +dynptr/add_dynptr_to_map2 +dynptr/clone_invalid1 +dynptr/clone_invalid2 +dynptr/clone_invalidate1 +dynptr/clone_invalidate2 +dynptr/clone_invalidate3 +dynptr/clone_invalidate4 +dynptr/clone_invalidate5 +dynptr/clone_invalidate6 +dynptr/clone_skb_packet_data +dynptr/clone_xdp_packet_data +dynptr/data_slice_missing_null_check1 +dynptr/data_slice_missing_null_check2 +dynptr/data_slice_out_of_bounds_map_value +dynptr/data_slice_out_of_bounds_ringbuf +dynptr/data_slice_out_of_bounds_skb +dynptr/data_slice_use_after_release1 +dynptr/data_slice_use_after_release2 +dynptr/dynptr_adjust_invalid +dynptr/dynptr_from_mem_invalid_api +dynptr/dynptr_invalidate_slice_failure +dynptr/dynptr_invalidate_slice_or_null +dynptr/dynptr_invalidate_slice_reinit +dynptr/dynptr_is_null_invalid +dynptr/dynptr_is_rdonly_invalid +dynptr/dynptr_overwrite_ref +dynptr/dynptr_partial_slot_invalidate +dynptr/dynptr_pruning_overwrite +dynptr/dynptr_pruning_type_confusion +dynptr/dynptr_read_into_slot +dynptr/dynptr_size_invalid +dynptr/dynptr_slice_var_len1 +dynptr/dynptr_slice_var_len2 +dynptr/dynptr_var_off_overwrite +dynptr/global +dynptr/invalid_data_slices +dynptr/invalid_helper1 +dynptr/invalid_helper2 +dynptr/invalid_offset +dynptr/invalid_read1 +dynptr/invalid_read2 +dynptr/invalid_read3 +dynptr/invalid_read4 +dynptr/invalid_slice_rdwr_rdonly +dynptr/invalid_write1 +dynptr/invalid_write2 +dynptr/invalid_write3 +dynptr/invalid_write4 +dynptr/release_twice +dynptr/release_twice_callback +dynptr/ringbuf_invalid_api +dynptr/ringbuf_missing_release1 +dynptr/ringbuf_missing_release2 +dynptr/ringbuf_missing_release_callback +dynptr/ringbuf_release_uninit_dynptr +dynptr/skb_invalid_ctx +dynptr/skb_invalid_ctx_fentry +dynptr/skb_invalid_ctx_fexit +dynptr/skb_invalid_data_slice1 +dynptr/skb_invalid_data_slice2 +dynptr/skb_invalid_data_slice3 +dynptr/skb_invalid_data_slice4 +dynptr/skb_invalid_slice_write +dynptr/test_dynptr_reg_type +dynptr/test_dynptr_skb_no_buff +dynptr/test_dynptr_skb_small_buff +dynptr/test_dynptr_skb_tp_btf +dynptr/test_read_write +dynptr/uninit_write_into_slot +dynptr/use_after_invalid +dynptr/xdp_invalid_ctx +dynptr/xdp_invalid_data_slice1 +dynptr/xdp_invalid_data_slice2 +exceptions/check_assert_eq_int_max +exceptions/check_assert_eq_int_min +exceptions/check_assert_eq_llong_max +exceptions/check_assert_eq_llong_min +exceptions/check_assert_eq_zero +exceptions/check_assert_ge_neg +exceptions/check_assert_ge_pos +exceptions/check_assert_ge_zero +exceptions/check_assert_generic +exceptions/check_assert_gt_neg +exceptions/check_assert_gt_pos +exceptions/check_assert_gt_zero +exceptions/check_assert_le_neg +exceptions/check_assert_le_pos +exceptions/check_assert_le_zero +exceptions/check_assert_lt_neg +exceptions/check_assert_lt_pos +exceptions/check_assert_lt_zero +exceptions/check_assert_range_s64 +exceptions/check_assert_range_u64 +exceptions/check_assert_single_range_s64 +exceptions/check_assert_single_range_u64 +exceptions/check_assert_with_return +exceptions/exception_ext +exceptions/exception_ext_mod_cb_runtime +exceptions/non-throwing extension -> non-throwing subprog +exceptions/non-throwing extension -> throwing global subprog +exceptions/non-throwing fentry -> exception_cb +exceptions/non-throwing fexit -> exception_cb +exceptions/non-throwing fmod_ret -> non-throwing global subprog +exceptions/reject_async_callback_throw +exceptions/reject_exception_throw_cb +exceptions/reject_exception_throw_cb_diff +exceptions/reject_set_exception_cb_bad_ret2 +exceptions/reject_subprog_with_lock +exceptions/reject_subprog_with_rcu_read_lock +exceptions/reject_with_cb +exceptions/reject_with_cb_reference +exceptions/reject_with_lock +exceptions/reject_with_rbtree_add_throw +exceptions/reject_with_rcu_read_lock +exceptions/reject_with_reference +exceptions/reject_with_subprog_reference +exceptions/throwing extension (with custom cb) -> exception_cb +exceptions/throwing extension -> global func in exception_cb +exceptions/throwing extension -> non-throwing global subprog +exceptions/throwing extension -> throwing global subprog +exceptions/throwing fentry -> exception_cb +exceptions/throwing fexit -> exception_cb +failures_wq +fexit_bpf2bpf/fmod_ret_freplace +fexit_bpf2bpf/func_replace +fexit_bpf2bpf/func_replace_global_func +fexit_bpf2bpf/func_replace_multi +fexit_bpf2bpf/func_sockmap_update +fexit_bpf2bpf/target_yes_callees +global_func_dead_code +global_map_resize +inner_array_lookup +irq/irq_flag_overwrite +irq/irq_flag_overwrite_partial +irq/irq_global_subprog +irq/irq_ooo_refs_array +irq/irq_restore_4_subprog +irq/irq_restore_bad_arg +irq/irq_restore_invalid +irq/irq_restore_iter +irq/irq_restore_missing_1_subprog +irq/irq_restore_missing_2 +irq/irq_restore_missing_2_subprog +irq/irq_restore_missing_3 +irq/irq_restore_missing_3_minus_2 +irq/irq_restore_missing_3_minus_2_subprog +irq/irq_restore_missing_3_subprog +irq/irq_restore_ooo +irq/irq_restore_ooo_3 +irq/irq_restore_ooo_3_subprog +irq/irq_save_bad_arg +irq/irq_save_invalid +irq/irq_save_iter +irq/irq_sleepable_helper +irq/irq_sleepable_kfunc +iters/compromise_iter_w_direct_write_and_skip_destroy_fail +iters/compromise_iter_w_direct_write_fail +iters/compromise_iter_w_helper_write_fail +iters/create_and_forget_to_destroy_fail +iters/css_task +iters/delayed_precision_mark +iters/delayed_read_mark +iters/destroy_without_creating_fail +iters/double_create_fail +iters/double_destroy_fail +iters/iter_css_lock_and_unlock +iters/iter_css_task_for_each +iters/iter_css_without_lock +iters/iter_destroy_bad_arg +iters/iter_err_too_permissive1 +iters/iter_err_too_permissive2 +iters/iter_err_too_permissive3 +iters/iter_err_unsafe_asm_loop +iters/iter_err_unsafe_c_loop +iters/iter_nested_iters +iters/iter_new_bad_arg +iters/iter_next_bad_arg +iters/iter_next_ptr_mem_not_trusted +iters/iter_next_rcu_not_trusted +iters/iter_next_rcu_or_null +iters/iter_next_trusted_or_null +iters/iter_obfuscate_counter +iters/iter_subprog_iters +iters/iter_tasks_lock_and_unlock +iters/iter_tasks_without_lock +iters/leak_iter_from_subprog_fail +iters/loop_state_deps1 +iters/loop_state_deps2 +iters/missing_null_check_fail +iters/next_after_destroy_fail +iters/next_without_new_fail +iters/read_from_iter_slot_fail +iters/stacksafe_should_not_conflate_stack_spill_and_iter +iters/testmod_seq_getter_after_bad +iters/testmod_seq_getter_before_bad +iters/wrong_sized_read_fail +jeq_infer_not_null +jit_probe_mem +kfree_skb +kfunc_call/kfunc_call_ctx +kfunc_call/kfunc_call_test1 +kfunc_call/kfunc_call_test2 +kfunc_call/kfunc_call_test4 +kfunc_call/kfunc_call_test_get_mem +kfunc_call/kfunc_call_test_ref_btf_id +kfunc_call/kfunc_call_test_static_unused_arg +kfunc_call/kfunc_syscall_test +kfunc_call/kfunc_syscall_test_null +kfunc_dynptr_param/not_ptr_to_stack +kfunc_dynptr_param/not_valid_dynptr +kfunc_param_nullable/kfunc_dynptr_nullable_test3 +kprobe_multi_test/kprobe_session_return_2 +kptr_xchg_inline +l4lb_all/l4lb_noinline +l4lb_all/l4lb_noinline_dynptr +linked_list +local_kptr_stash/drop_rb_node_off +local_kptr_stash/local_kptr_stash_local_with_root +local_kptr_stash/local_kptr_stash_plain +local_kptr_stash/local_kptr_stash_simple +local_kptr_stash/local_kptr_stash_unstash +local_kptr_stash/refcount_acquire_without_unstash +local_kptr_stash/stash_rb_nodes +log_buf/obj_load_log_buf +log_fixup/bad_core_relo_subprog +log_fixup/bad_core_relo_trunc_full +lru_bug +map_btf +map_in_map/acc_map_in_array +map_in_map/acc_map_in_htab +map_in_map/sleepable_acc_map_in_array +map_in_map/sleepable_acc_map_in_htab +map_kptr/correct_btf_id_check_size +map_kptr/inherit_untrusted_on_walk +map_kptr/kptr_xchg_possibly_null +map_kptr/kptr_xchg_ref_state +map_kptr/mark_ref_as_untrusted_or_null +map_kptr/marked_as_untrusted_or_null +map_kptr/non_const_var_off +map_kptr/non_const_var_off_kptr_xchg +map_kptr/reject_bad_type_xchg +map_kptr/reject_kptr_xchg_on_unref +map_kptr/reject_member_of_ref_xchg +map_kptr/reject_untrusted_xchg +map_kptr/success-map +map_ptr +nested_trust/test_invalid_nested_user_cpus +nested_trust/test_invalid_skb_field +percpu_alloc/array +percpu_alloc/array_sleepable +percpu_alloc/cgrp_local_storage +percpu_alloc/test_array_map_1 +percpu_alloc/test_array_map_2 +percpu_alloc/test_array_map_3 +percpu_alloc/test_array_map_4 +percpu_alloc/test_array_map_5 +percpu_alloc/test_array_map_6 +percpu_alloc/test_array_map_7 +percpu_alloc/test_array_map_8 +perf_branches/perf_branches_no_hw +pkt_access +preempt_lock/preempt_global_subprog_test +preempt_lock/preempt_lock_missing_1 +preempt_lock/preempt_lock_missing_1_subprog +preempt_lock/preempt_lock_missing_2 +preempt_lock/preempt_lock_missing_2_minus_1_subprog +preempt_lock/preempt_lock_missing_2_subprog +preempt_lock/preempt_lock_missing_3 +preempt_lock/preempt_lock_missing_3_minus_2 +preempt_lock/preempt_sleepable_helper +preempt_lock/preempt_sleepable_kfunc +preempted_bpf_ma_op +prog_run_opts +prog_tests_framework +raw_tp_null +rbtree_fail +rbtree_success +recursion +refcounted_kptr +refcounted_kptr_fail +refcounted_kptr_wrong_owner +reference_tracking/sk_lookup_success +ringbuf_multi +setget_sockopt +sk_lookup +skc_to_unix_sock +sock_addr/recvmsg4: attach prog with wrong attach type +sock_addr/recvmsg4: recvfrom (dgram) +sock_addr/recvmsg6: attach prog with wrong attach type +sock_addr/recvmsg6: recvfrom (dgram) +sock_addr/sendmsg4: attach prog with wrong attach type +sock_addr/sendmsg4: kernel_sendmsg (dgram) +sock_addr/sendmsg4: kernel_sendmsg deny (dgram) +sock_addr/sendmsg4: sendmsg (dgram) +sock_addr/sendmsg4: sendmsg deny (dgram) +sock_addr/sendmsg4: sock_sendmsg (dgram) +sock_addr/sendmsg4: sock_sendmsg deny (dgram) +sock_addr/sendmsg6: attach prog with wrong attach type +sock_addr/sendmsg6: kernel_sendmsg (dgram) +sock_addr/sendmsg6: kernel_sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: kernel_sendmsg deny (dgram) +sock_addr/sendmsg6: sendmsg (dgram) +sock_addr/sendmsg6: sendmsg IPv4-mapped IPv6 (dgram) +sock_addr/sendmsg6: sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sendmsg deny (dgram) +sock_addr/sendmsg6: sendmsg dst IP = [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sock_sendmsg (dgram) +sock_addr/sendmsg6: sock_sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sock_sendmsg deny (dgram) +sock_destroy/trace_tcp_destroy_sock +sock_fields +sockmap_listen/sockhash IPv4 TCP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv4 TCP test_reuseport_select_connected +sockmap_listen/sockhash IPv4 UDP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv4 UDP test_reuseport_select_connected +sockmap_listen/sockhash IPv6 TCP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv6 TCP test_reuseport_select_connected +sockmap_listen/sockhash IPv6 UDP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv6 UDP test_reuseport_select_connected +sockmap_listen/sockmap IPv4 TCP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv4 TCP test_reuseport_select_connected +sockmap_listen/sockmap IPv4 UDP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv4 UDP test_reuseport_select_connected +sockmap_listen/sockmap IPv6 TCP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv6 TCP test_reuseport_select_connected +sockmap_listen/sockmap IPv6 UDP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv6 UDP test_reuseport_select_connected +spin_lock +struct_ops_module/unsupported_ops +syscall +tailcalls/classifier_0 +tailcalls/classifier_1 +tailcalls/reject_tail_call_preempt_lock +tailcalls/reject_tail_call_rcu_lock +tailcalls/reject_tail_call_ref +tailcalls/reject_tail_call_spin_lock +tailcalls/tailcall_6 +tailcalls/tailcall_bpf2bpf_2 +tailcalls/tailcall_bpf2bpf_3 +tailcalls/tailcall_bpf2bpf_fentry +tailcalls/tailcall_bpf2bpf_fentry_entry +tailcalls/tailcall_bpf2bpf_fentry_fexit +tailcalls/tailcall_bpf2bpf_fexit +tailcalls/tailcall_bpf2bpf_hierarchy_2 +tailcalls/tailcall_bpf2bpf_hierarchy_3 +task_kfunc +task_local_storage/uptr_across_pages +task_local_storage/uptr_basic +task_local_storage/uptr_kptr_xchg +task_local_storage/uptr_map_failure_e2big +task_local_storage/uptr_map_failure_kstruct +task_local_storage/uptr_map_failure_size0 +task_local_storage/uptr_no_null_check +task_local_storage/uptr_obj_new +task_local_storage/uptr_update_failure +tc_bpf/tc_bpf_non_root +tc_redirect/tc_redirect_dtime +tcp_custom_syncookie +tcp_hdr_options +test_bpf_ma +test_global_funcs/arg_tag_ctx_kprobe +test_global_funcs/arg_tag_ctx_perf +test_global_funcs/arg_tag_ctx_raw_tp +test_global_funcs/global_func1 +test_global_funcs/global_func10 +test_global_funcs/global_func11 +test_global_funcs/global_func12 +test_global_funcs/global_func13 +test_global_funcs/global_func14 +test_global_funcs/global_func15 +test_global_funcs/global_func15_tricky_pruning +test_global_funcs/global_func17 +test_global_funcs/global_func3 +test_global_funcs/global_func5 +test_global_funcs/global_func6 +test_global_funcs/global_func7 +test_lsm/lsm_basic +test_profiler +test_strncmp/strncmp_bad_not_null_term_target +timer +timer_mim +token +tp_btf_nullable/handle_tp_btf_nullable_bare1 +tunnel +uprobe_multi_test/uprobe_sesison_return_2 +user_ringbuf/user_ringbuf_callback_bad_access1 +user_ringbuf/user_ringbuf_callback_bad_access2 +user_ringbuf/user_ringbuf_callback_const_ptr_to_dynptr_reg_off +user_ringbuf/user_ringbuf_callback_discard_dynptr +user_ringbuf/user_ringbuf_callback_invalid_return +user_ringbuf/user_ringbuf_callback_null_context_read +user_ringbuf/user_ringbuf_callback_null_context_write +user_ringbuf/user_ringbuf_callback_reinit_dynptr_mem +user_ringbuf/user_ringbuf_callback_reinit_dynptr_ringbuf +user_ringbuf/user_ringbuf_callback_submit_dynptr +user_ringbuf/user_ringbuf_callback_write_forbidden +verif_scale_pyperf100 +verif_scale_pyperf180 +verif_scale_pyperf600 +verif_scale_pyperf600_nounroll +verif_scale_seg6_loop +verif_scale_strobemeta +verif_scale_strobemeta_nounroll1 +verif_scale_strobemeta_nounroll2 +verif_scale_strobemeta_subprogs +verif_scale_sysctl_loop1 +verif_scale_sysctl_loop2 +verif_scale_xdp_loop +verifier_and/invalid_and_of_negative_number +verifier_and/invalid_range_check +verifier_arena/iter_maps2 +verifier_arena/iter_maps3 +verifier_array_access/a_read_only_array_1_2 +verifier_array_access/a_read_only_array_2_2 +verifier_array_access/a_write_only_array_1_2 +verifier_array_access/a_write_only_array_2_2 +verifier_array_access/an_array_with_a_constant_2 +verifier_array_access/an_array_with_a_register_2 +verifier_array_access/an_array_with_a_variable_2 +verifier_array_access/array_with_no_floor_check +verifier_array_access/with_a_invalid_max_check_1 +verifier_array_access/with_a_invalid_max_check_2 +verifier_basic_stack/invalid_fp_arithmetic +verifier_basic_stack/misaligned_read_from_stack +verifier_basic_stack/stack_out_of_bounds +verifier_bitfield_write +verifier_bits_iter/destroy_uninit +verifier_bits_iter/next_uninit +verifier_bits_iter/no_destroy +verifier_bounds/bounds_map_value_variant_1 +verifier_bounds/bounds_map_value_variant_2 +verifier_bounds/of_boundary_crossing_range_1 +verifier_bounds/of_boundary_crossing_range_2 +verifier_bounds/on_sign_extended_mov_test1 +verifier_bounds/on_sign_extended_mov_test2 +verifier_bounds/reg32_any_reg32_xor_3 +verifier_bounds/reg_any_reg_xor_3 +verifier_bounds/shift_of_maybe_negative_number +verifier_bounds/shift_with_64_bit_input +verifier_bounds/shift_with_oversized_count_operand +verifier_bounds/size_signed_32bit_overflow_test1 +verifier_bounds/size_signed_32bit_overflow_test2 +verifier_bounds/size_signed_32bit_overflow_test3 +verifier_bounds/size_signed_32bit_overflow_test4 +verifier_bounds/var_off_insn_off_test1 +verifier_bounds/var_off_insn_off_test2 +verifier_bounds_deduction/deducing_bounds_from_const_1 +verifier_bounds_deduction/deducing_bounds_from_const_10 +verifier_bounds_deduction/deducing_bounds_from_const_3 +verifier_bounds_deduction/deducing_bounds_from_const_5 +verifier_bounds_deduction/deducing_bounds_from_const_6 +verifier_bounds_deduction/deducing_bounds_from_const_7 +verifier_bounds_deduction/deducing_bounds_from_const_8 +verifier_bounds_deduction/deducing_bounds_from_const_9 +verifier_bounds_mix_sign_unsign/checks_mixing_signed_and_unsigned +verifier_bounds_mix_sign_unsign/signed_and_unsigned_positive_bounds +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_10 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_11 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_12 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_13 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_14 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_15 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_2 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_3 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_5 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_6 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_8 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_16 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_32 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_8 +verifier_cfg/conditional_loop +verifier_cfg/loop2_back_edge +verifier_cfg/loop_back_edge +verifier_cfg/out_of_range_jump +verifier_cfg/out_of_range_jump2 +verifier_cfg/uncond_loop_after_cond_jmp +verifier_cfg/uncond_loop_in_subprog_after_cond_jmp +verifier_cfg/unreachable +verifier_cfg/unreachable2 +verifier_cgroup_inv_retcode/with_invalid_return_code_test1 +verifier_cgroup_inv_retcode/with_invalid_return_code_test3 +verifier_cgroup_inv_retcode/with_invalid_return_code_test5 +verifier_cgroup_inv_retcode/with_invalid_return_code_test6 +verifier_cgroup_inv_retcode/with_invalid_return_code_test7 +verifier_cgroup_skb/data_meta_for_cgroup_skb +verifier_cgroup_skb/flow_keys_for_cgroup_skb +verifier_cgroup_skb/napi_id_for_cgroup_skb +verifier_cgroup_skb/tc_classid_for_cgroup_skb +verifier_cgroup_storage/cpu_cgroup_storage_access_1 +verifier_cgroup_storage/cpu_cgroup_storage_access_2 +verifier_cgroup_storage/cpu_cgroup_storage_access_3 +verifier_cgroup_storage/cpu_cgroup_storage_access_4 +verifier_cgroup_storage/cpu_cgroup_storage_access_5 +verifier_cgroup_storage/cpu_cgroup_storage_access_6 +verifier_cgroup_storage/invalid_cgroup_storage_access_1 +verifier_cgroup_storage/invalid_cgroup_storage_access_2 +verifier_cgroup_storage/invalid_cgroup_storage_access_3 +verifier_cgroup_storage/invalid_cgroup_storage_access_4 +verifier_cgroup_storage/invalid_cgroup_storage_access_5 +verifier_cgroup_storage/invalid_cgroup_storage_access_6 +verifier_const/bprm +verifier_const/tcx1 +verifier_const/tcx4 +verifier_const/tcx7 +verifier_const_or/not_bypass_stack_boundary_checks_1 +verifier_const_or/not_bypass_stack_boundary_checks_2 +verifier_ctx/context_stores_via_bpf_atomic +verifier_ctx/ctx_pointer_to_helper_1 +verifier_ctx/ctx_pointer_to_helper_2 +verifier_ctx/ctx_pointer_to_helper_3 +verifier_ctx/make_ptr_to_ctx_unusable +verifier_ctx/null_check_4_ctx_const +verifier_ctx/null_check_8_null_bind +verifier_ctx/or_null_check_3_1 +verifier_ctx_sk_msg/of_size_in_sk_msg +verifier_ctx_sk_msg/past_end_of_sk_msg +verifier_ctx_sk_msg/read_offset_in_sk_msg +verifier_d_path/d_path_reject +verifier_direct_packet_access/access_test15_spill_with_xadd +verifier_direct_packet_access/direct_packet_access_test3 +verifier_direct_packet_access/id_in_regsafe_bad_access +verifier_direct_packet_access/packet_access_test10_write_invalid +verifier_direct_packet_access/pkt_end_reg_bad_access +verifier_direct_packet_access/pkt_end_reg_both_accesses +verifier_direct_packet_access/test16_arith_on_data_end +verifier_direct_packet_access/test23_x_pkt_ptr_4 +verifier_direct_packet_access/test26_marking_on_bad_access +verifier_direct_packet_access/test28_marking_on_bad_access +verifier_direct_stack_access_wraparound +verifier_global_ptr_args +verifier_global_subprogs +verifier_helper_access_var_len/bitwise_and_jmp_wrong_max +verifier_helper_access_var_len/jmp_signed_no_min_check +verifier_helper_access_var_len/map_adjusted_jmp_wrong_max +verifier_helper_access_var_len/memory_map_jmp_wrong_max +verifier_helper_access_var_len/memory_stack_jmp_bounds_offset +verifier_helper_access_var_len/memory_stack_jmp_wrong_max +verifier_helper_access_var_len/ptr_to_mem_or_null_2 +verifier_helper_access_var_len/ptr_to_mem_or_null_8 +verifier_helper_access_var_len/ptr_to_mem_or_null_9 +verifier_helper_access_var_len/stack_jmp_no_max_check +verifier_helper_packet_access/cls_helper_fail_range_1 +verifier_helper_packet_access/cls_helper_fail_range_2 +verifier_helper_packet_access/cls_helper_fail_range_3 +verifier_helper_packet_access/packet_ptr_with_bad_range_1 +verifier_helper_packet_access/packet_ptr_with_bad_range_2 +verifier_helper_packet_access/packet_test2_unchecked_packet_ptr +verifier_helper_packet_access/ptr_with_too_short_range_1 +verifier_helper_packet_access/ptr_with_too_short_range_2 +verifier_helper_packet_access/test11_cls_unsuitable_helper_1 +verifier_helper_packet_access/test12_cls_unsuitable_helper_2 +verifier_helper_packet_access/test15_cls_helper_fail_sub +verifier_helper_packet_access/test20_pkt_end_as_input +verifier_helper_packet_access/test7_cls_unchecked_packet_ptr +verifier_helper_packet_access/to_packet_test21_wrong_reg +verifier_helper_restricted +verifier_helper_value_access/access_to_map_empty_range +verifier_helper_value_access/access_to_map_negative_range +verifier_helper_value_access/access_to_map_possibly_empty_range +verifier_helper_value_access/access_to_map_wrong_size +verifier_helper_value_access/bounds_check_using_bad_access_1 +verifier_helper_value_access/bounds_check_using_bad_access_2 +verifier_helper_value_access/check_using_s_bad_access_1 +verifier_helper_value_access/check_using_s_bad_access_2 +verifier_helper_value_access/const_imm_negative_range_adjustment_1 +verifier_helper_value_access/const_imm_negative_range_adjustment_2 +verifier_helper_value_access/const_reg_negative_range_adjustment_1 +verifier_helper_value_access/const_reg_negative_range_adjustment_2 +verifier_helper_value_access/imm_out_of_bound_1 +verifier_helper_value_access/imm_out_of_bound_2 +verifier_helper_value_access/imm_out_of_bound_range +verifier_helper_value_access/map_out_of_bound_range +verifier_helper_value_access/map_via_variable_empty_range +verifier_helper_value_access/reg_out_of_bound_1 +verifier_helper_value_access/reg_out_of_bound_2 +verifier_helper_value_access/reg_out_of_bound_range +verifier_helper_value_access/via_const_imm_empty_range +verifier_helper_value_access/via_const_reg_empty_range +verifier_helper_value_access/via_variable_no_max_check_1 +verifier_helper_value_access/via_variable_no_max_check_2 +verifier_helper_value_access/via_variable_wrong_max_check_1 +verifier_helper_value_access/via_variable_wrong_max_check_2 +verifier_int_ptr/arg_ptr_to_long_misaligned +verifier_int_ptr/to_long_size_sizeof_long +verifier_iterating_callbacks/bpf_loop_iter_limit_overflow +verifier_iterating_callbacks/check_add_const_3regs +verifier_iterating_callbacks/check_add_const_3regs_2if +verifier_iterating_callbacks/check_add_const_regsafe_off +verifier_iterating_callbacks/iter_limit_bug +verifier_iterating_callbacks/jgt_imm64_and_may_goto +verifier_iterating_callbacks/loop_detection +verifier_iterating_callbacks/may_goto_self +verifier_iterating_callbacks/unsafe_find_vma +verifier_iterating_callbacks/unsafe_for_each_map_elem +verifier_iterating_callbacks/unsafe_on_2nd_iter +verifier_iterating_callbacks/unsafe_on_zero_iter +verifier_iterating_callbacks/unsafe_ringbuf_drain +verifier_jeq_infer_not_null/unchanged_for_jeq_false_branch +verifier_jeq_infer_not_null/unchanged_for_jne_true_branch +verifier_kfunc_prog_types/cgrp_kfunc_raw_tp +verifier_kfunc_prog_types/cpumask_kfunc_raw_tp +verifier_kfunc_prog_types/task_kfunc_raw_tp +verifier_ld_ind/ind_check_calling_conv_r1 +verifier_ld_ind/ind_check_calling_conv_r2 +verifier_ld_ind/ind_check_calling_conv_r3 +verifier_ld_ind/ind_check_calling_conv_r4 +verifier_ld_ind/ind_check_calling_conv_r5 +verifier_leak_ptr/leak_pointer_into_ctx_1 +verifier_leak_ptr/leak_pointer_into_ctx_2 +verifier_linked_scalars +verifier_loops1/bounded_recursion +verifier_loops1/infinite_loop_in_two_jumps +verifier_loops1/infinite_loop_three_jump_trick +verifier_loops1/loop_after_a_conditional_jump +verifier_lsm/bool_retval_test3 +verifier_lsm/bool_retval_test4 +verifier_lsm/disabled_hook_test1 +verifier_lsm/disabled_hook_test2 +verifier_lsm/disabled_hook_test3 +verifier_lsm/errno_zero_retval_test4 +verifier_lsm/errno_zero_retval_test5 +verifier_lsm/errno_zero_retval_test6 +verifier_lwt/not_permitted_for_lwt_prog +verifier_lwt/packet_write_for_lwt_in +verifier_lwt/packet_write_for_lwt_out +verifier_lwt/tc_classid_for_lwt_in +verifier_lwt/tc_classid_for_lwt_out +verifier_lwt/tc_classid_for_lwt_xmit +verifier_map_in_map/invalid_inner_map_pointer +verifier_map_in_map/on_the_inner_map_pointer +verifier_map_ptr/bpf_map_ptr_write_rejected +verifier_map_ptr/read_non_existent_field_rejected +verifier_map_ptr/read_with_negative_offset_rejected +verifier_map_ptr_mixing +verifier_map_ret_val +verifier_meta_access/meta_access_test10 +verifier_meta_access/meta_access_test2 +verifier_meta_access/meta_access_test3 +verifier_meta_access/meta_access_test4 +verifier_meta_access/meta_access_test5 +verifier_meta_access/meta_access_test6 +verifier_meta_access/meta_access_test9 +verifier_netfilter_ctx/with_invalid_ctx_access_test1 +verifier_netfilter_ctx/with_invalid_ctx_access_test2 +verifier_netfilter_ctx/with_invalid_ctx_access_test3 +verifier_netfilter_ctx/with_invalid_ctx_access_test4 +verifier_netfilter_ctx/with_invalid_ctx_access_test5 +verifier_netfilter_retcode/with_invalid_return_code_test1 +verifier_netfilter_retcode/with_invalid_return_code_test4 +verifier_or_jmp32_k +verifier_prevent_map_lookup +verifier_raw_stack/bytes_spilled_regs_corruption_2 +verifier_raw_stack/load_bytes_invalid_access_1 +verifier_raw_stack/load_bytes_invalid_access_2 +verifier_raw_stack/load_bytes_invalid_access_3 +verifier_raw_stack/load_bytes_invalid_access_4 +verifier_raw_stack/load_bytes_invalid_access_5 +verifier_raw_stack/load_bytes_invalid_access_6 +verifier_raw_stack/load_bytes_negative_len_2 +verifier_raw_stack/load_bytes_spilled_regs_corruption +verifier_raw_stack/skb_load_bytes_negative_len +verifier_raw_stack/skb_load_bytes_zero_len +verifier_raw_tp_writable +verifier_ref_tracking +verifier_reg_equal/subreg_equality_2 +verifier_regalloc/regalloc_and_spill_negative +verifier_regalloc/regalloc_negative +verifier_regalloc/regalloc_src_reg_negative +verifier_ringbuf/ringbuf_invalid_reservation_offset_1 +verifier_ringbuf/ringbuf_invalid_reservation_offset_2 +verifier_runtime_jit +verifier_scalar_ids/check_ids_in_regsafe +verifier_scalar_ids/check_ids_in_regsafe_2 +verifier_scalar_ids/linked_regs_broken_link_2 +verifier_search_pruning/for_u32_spills_u64_fill +verifier_search_pruning/liveness_pruning_and_write_screening +verifier_search_pruning/short_loop1 +verifier_search_pruning/should_be_verified_nop_operation +verifier_search_pruning/tracking_for_u32_spill_fill +verifier_search_pruning/varlen_map_value_access_pruning +verifier_sock/bpf_sk_fullsock_skb_sk +verifier_sock/bpf_sk_release_skb_sk +verifier_sock/bpf_tcp_sock_skb_sk +verifier_sock/dst_port_byte_load_invalid +verifier_sock/dst_port_half_load_invalid_1 +verifier_sock/dst_port_half_load_invalid_2 +verifier_sock/invalidate_pkt_pointers_by_tail_call +verifier_sock/invalidate_pkt_pointers_from_global_func +verifier_sock/map_lookup_elem_smap_key +verifier_sock/map_lookup_elem_sockhash_key +verifier_sock/map_lookup_elem_sockmap_key +verifier_sock/no_null_check_on_ret_1 +verifier_sock/no_null_check_on_ret_2 +verifier_sock/of_bpf_skc_to_helpers +verifier_sock/post_bind4_read_mark +verifier_sock/post_bind4_read_src_ip6 +verifier_sock/post_bind6_read_src_ip4 +verifier_sock/sk_1_1_value_1 +verifier_sock/sk_no_skb_sk_check_1 +verifier_sock/sk_no_skb_sk_check_2 +verifier_sock/sk_sk_type_fullsock_field_1 +verifier_sock/skb_sk_beyond_last_field_1 +verifier_sock/skb_sk_beyond_last_field_2 +verifier_sock/skb_sk_no_null_check +verifier_sock/sock_create_read_src_port +verifier_sock_addr/bind4_bad_return_code +verifier_sock_addr/bind6_bad_return_code +verifier_sock_addr/connect4_bad_return_code +verifier_sock_addr/connect6_bad_return_code +verifier_sock_addr/connect_unix_bad_return_code +verifier_sock_addr/getpeername4_bad_return_code +verifier_sock_addr/getpeername6_bad_return_code +verifier_sock_addr/getpeername_unix_bad_return_code +verifier_sock_addr/getsockname4_bad_return_code +verifier_sock_addr/getsockname6_bad_return_code +verifier_sock_addr/getsockname_unix_unix_bad_return_code +verifier_sock_addr/recvmsg4_bad_return_code +verifier_sock_addr/recvmsg6_bad_return_code +verifier_sock_addr/recvmsg_unix_bad_return_code +verifier_sock_addr/sendmsg4_bad_return_code +verifier_sock_addr/sendmsg6_bad_return_code +verifier_sock_addr/sendmsg_unix_bad_return_code +verifier_sockmap_mutate/test_flow_dissector_update +verifier_sockmap_mutate/test_raw_tp_delete +verifier_sockmap_mutate/test_raw_tp_update +verifier_sockmap_mutate/test_sockops_update +verifier_spill_fill/_6_offset_to_skb_data +verifier_spill_fill/addr_offset_to_skb_data +verifier_spill_fill/check_corrupted_spill_fill +verifier_spill_fill/fill_32bit_after_spill_64bit_clear_id +verifier_spill_fill/spill_16bit_of_32bit_fail +verifier_spill_fill/spill_32bit_of_64bit_fail +verifier_spill_fill/u64_offset_to_skb_data +verifier_spill_fill/with_invalid_reg_offset_0 +verifier_spin_lock/call_within_a_locked_region +verifier_spin_lock/lock_test2_direct_ld_st +verifier_spin_lock/lock_test3_direct_ld_st +verifier_spin_lock/lock_test4_direct_ld_st +verifier_spin_lock/lock_test7_unlock_without_lock +verifier_spin_lock/reg_id_for_map_value +verifier_spin_lock/spin_lock_test6_missing_unlock +verifier_spin_lock/spin_lock_test8_double_lock +verifier_spin_lock/spin_lock_test9_different_lock +verifier_spin_lock/test11_ld_abs_under_lock +verifier_stack_ptr/load_bad_alignment_on_off +verifier_stack_ptr/load_bad_alignment_on_reg +verifier_stack_ptr/load_out_of_bounds_high +verifier_stack_ptr/load_out_of_bounds_low +verifier_stack_ptr/to_stack_check_high_4 +verifier_stack_ptr/to_stack_check_high_5 +verifier_stack_ptr/to_stack_check_high_6 +verifier_stack_ptr/to_stack_check_high_7 +verifier_stack_ptr/to_stack_check_low_3 +verifier_stack_ptr/to_stack_check_low_4 +verifier_stack_ptr/to_stack_check_low_5 +verifier_stack_ptr/to_stack_check_low_6 +verifier_stack_ptr/to_stack_check_low_7 +verifier_subprog_precision/callback_precise_return_fail +verifier_tailcall_jit +verifier_uninit +verifier_unpriv +verifier_unpriv_perf +verifier_value/store_of_cleared_call_register +verifier_value_illegal_alu +verifier_value_or_null/map_access_from_else_condition +verifier_value_or_null/map_value_or_null_1 +verifier_value_or_null/map_value_or_null_2 +verifier_value_or_null/map_value_or_null_3 +verifier_value_or_null/multiple_map_lookup_elem_calls +verifier_value_or_null/null_check_ids_in_regsafe +verifier_value_ptr_arith/access_known_scalar_value_ptr_2 +verifier_value_ptr_arith/access_unknown_scalar_value_ptr +verifier_value_ptr_arith/access_value_ptr_known_scalar +verifier_value_ptr_arith/access_value_ptr_unknown_scalar +verifier_value_ptr_arith/access_value_ptr_value_ptr_1 +verifier_value_ptr_arith/access_value_ptr_value_ptr_2 +verifier_value_ptr_arith/lower_oob_arith_test_1 +verifier_value_ptr_arith/to_leak_tainted_dst_reg +verifier_value_ptr_arith/unknown_scalar_value_ptr_4 +verifier_value_ptr_arith/value_ptr_known_scalar_2_1 +verifier_value_ptr_arith/value_ptr_known_scalar_3 +verifier_var_off/access_max_out_of_bound +verifier_var_off/access_min_out_of_bound +verifier_var_off/stack_write_clobbers_spilled_regs +verifier_var_off/variable_offset_ctx_access +verifier_var_off/variable_offset_stack_access_unbounded +verifier_var_off/zero_sized_access_max_out_of_bound +verifier_vfs_reject +verifier_xadd/xadd_w_check_unaligned_map +verifier_xadd/xadd_w_check_unaligned_pkt +verifier_xadd/xadd_w_check_unaligned_stack +verifier_xdp_direct_packet_access/corner_case_1_bad_access_1 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_10 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_11 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_12 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_13 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_14 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_15 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_16 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_2 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_3 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_4 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_5 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_6 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_7 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_8 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_9 +verifier_xdp_direct_packet_access/end_mangling_bad_access_1 +verifier_xdp_direct_packet_access/end_mangling_bad_access_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_3 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_4 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_4 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_5 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_6 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_7 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_8 +verifier_xdp_direct_packet_access/pkt_end_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_end_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_4 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_4 +verify_pkcs7_sig +xdp_synproxy diff --git a/ci/vmtest/configs/DENYLIST.test_progs_cpuv4 b/ci/vmtest/configs/DENYLIST.test_progs_cpuv4 new file mode 100644 index 0000000000000..0c02eae8f5cd1 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.test_progs_cpuv4 @@ -0,0 +1 @@ +verifier_arena/basic_alloc2 diff --git a/ci/vmtest/configs/DENYLIST.x86_64 b/ci/vmtest/configs/DENYLIST.x86_64 new file mode 100644 index 0000000000000..6fc3413daab9f --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.x86_64 @@ -0,0 +1 @@ +netcnt # with kvm enabled, fail with packets unexpected packets: actual 10001 != expected 10000 diff --git a/ci/vmtest/configs/run-vmtest.env b/ci/vmtest/configs/run-vmtest.env new file mode 100644 index 0000000000000..c60f1db6673c7 --- /dev/null +++ b/ci/vmtest/configs/run-vmtest.env @@ -0,0 +1,42 @@ +#!/bin/bash + +# This file is sourced by libbpf/ci/run-vmtest Github Action scripts. +# +# The primary reason it exists is that assembling ALLOWLIST and +# DENYLIST for a particular test run is not a trivial operation. +# +# Users of libbpf/ci/run-vmtest action need to be able to specify a +# list of allow/denylist **files**, that later has to be correctly +# merged into a single allow/denylist passed to a test runner. +# +# Obviously it's perferrable for the scripts merging many lists into +# one to be reusable, and not copy-pasted between repositories which +# use libbpf/ci actions. And specifying the lists should be trivial. +# This file is a solution to that. + +# $SELFTESTS_BPF and $VMTEST_CONFIGS are set in the workflow, before +# libbpf/ci/run-vmtest action is called +# See .github/workflows/kernel-test.yml + +ALLOWLIST_FILES=( + "${SELFTESTS_BPF}/ALLOWLIST" + "${SELFTESTS_BPF}/ALLOWLIST.${ARCH}" + "${VMTEST_CONFIGS}/ALLOWLIST" + "${VMTEST_CONFIGS}/ALLOWLIST.${ARCH}" + "${VMTEST_CONFIGS}/ALLOWLIST.${DEPLOYMENT}" + "${VMTEST_CONFIGS}/ALLOWLIST.${KERNEL_TEST}" +) + +DENYLIST_FILES=( + "${SELFTESTS_BPF}/DENYLIST" + "${SELFTESTS_BPF}/DENYLIST.${ARCH}" + "${VMTEST_CONFIGS}/DENYLIST" + "${VMTEST_CONFIGS}/DENYLIST.${ARCH}" + "${VMTEST_CONFIGS}/DENYLIST.${DEPLOYMENT}" + "${VMTEST_CONFIGS}/DENYLIST.${KERNEL_TEST}" +) + +# Export pipe-separated strings, because bash doesn't support array export +export SELFTESTS_BPF_ALLOWLIST_FILES=$(IFS="|"; echo "${ALLOWLIST_FILES[*]}") +export SELFTESTS_BPF_DENYLIST_FILES=$(IFS="|"; echo "${DENYLIST_FILES[*]}") + diff --git a/ci/vmtest/configs/run_veristat.kernel.cfg b/ci/vmtest/configs/run_veristat.kernel.cfg new file mode 100644 index 0000000000000..807efc251073f --- /dev/null +++ b/ci/vmtest/configs/run_veristat.kernel.cfg @@ -0,0 +1,4 @@ +VERISTAT_OBJECTS_DIR="${SELFTESTS_BPF}" +VERISTAT_OBJECTS_GLOB="*.bpf.o" +VERISTAT_CFG_FILE="${SELFTESTS_BPF}/veristat.cfg" +VERISTAT_OUTPUT="veristat-kernel" diff --git a/ci/vmtest/configs/run_veristat.meta.cfg b/ci/vmtest/configs/run_veristat.meta.cfg new file mode 100644 index 0000000000000..14f08d241d206 --- /dev/null +++ b/ci/vmtest/configs/run_veristat.meta.cfg @@ -0,0 +1,4 @@ +VERISTAT_OBJECTS_DIR="${WORKING_DIR}/bpf_objects" +VERISTAT_OBJECTS_GLOB="*.o" +VERISTAT_OUTPUT="veristat-meta" +VERISTAT_CFG_FILE="${VERISTAT_CONFIGS}/veristat_meta.cfg" diff --git a/ci/vmtest/configs/veristat_meta.cfg b/ci/vmtest/configs/veristat_meta.cfg new file mode 100644 index 0000000000000..a8c25d71cb9e2 --- /dev/null +++ b/ci/vmtest/configs/veristat_meta.cfg @@ -0,0 +1,10 @@ +# List of exceptions we know about that are not going to work with veristat. + +# needs 'migrate_misplaced_page' which went away in +# commit 73eab3ca481e ("mm: migrate: convert migrate_misplaced_page() to migrate_misplaced_folio()") +!numamove_bpf-numamove_bpf.o + +# use non-libbpf loader +!takeover_bpf_lib-takeover.bpf.o +!tcp_tuner_bpf_lib-tcptuner.bpf.o + diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 271520510b5fc..b29628d46be9b 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -115,6 +115,21 @@ config WIREGUARD_DEBUG Say N here unless you know what you're doing. +config OVPN + tristate "OpenVPN data channel offload" + depends on NET && INET + depends on IPV6 || !IPV6 + select DST_CACHE + select NET_UDP_TUNNEL + select CRYPTO + select CRYPTO_AES + select CRYPTO_GCM + select CRYPTO_CHACHA20POLY1305 + select STREAM_PARSER + help + This module enhances the performance of the OpenVPN userspace software + by offloading the data channel processing to kernelspace. + config EQUALIZER tristate "EQL (serial line load balancing) support" help diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 75333251a01a8..73bc63ecd65ff 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_IPVLAN) += ipvlan/ obj-$(CONFIG_IPVTAP) += ipvlan/ obj-$(CONFIG_DUMMY) += dummy.o obj-$(CONFIG_WIREGUARD) += wireguard/ +obj-$(CONFIG_OVPN) += ovpn/ obj-$(CONFIG_EQUALIZER) += eql.o obj-$(CONFIG_IFB) += ifb.o obj-$(CONFIG_MACSEC) += macsec.o diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index d1473c5f8eefc..a9dffdcac8057 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -777,27 +777,19 @@ static __net_init int bareudp_init_net(struct net *net) return 0; } -static void bareudp_destroy_tunnels(struct net *net, struct list_head *head) +static void __net_exit bareudp_exit_rtnl_net(struct net *net, + struct list_head *dev_kill_list) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); struct bareudp_dev *bareudp, *next; list_for_each_entry_safe(bareudp, next, &bn->bareudp_list, next) - unregister_netdevice_queue(bareudp->dev, head); -} - -static void __net_exit bareudp_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_kill_list) -{ - struct net *net; - - list_for_each_entry(net, net_list, exit_list) - bareudp_destroy_tunnels(net, dev_kill_list); + bareudp_dellink(bareudp->dev, dev_kill_list); } static struct pernet_operations bareudp_net_ops = { .init = bareudp_init_net, - .exit_batch_rtnl = bareudp_exit_batch_rtnl, + .exit_rtnl = bareudp_exit_rtnl_net, .id = &bareudp_net_id, .size = sizeof(struct bareudp_net), }; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8ea183da8d539..665b9af684fcc 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -6563,7 +6563,7 @@ static int __net_init bond_net_init(struct net *net) /* According to commit 69b0216ac255 ("bonding: fix bonding_masters * race condition in bond unloading") we need to remove sysfs files - * before we remove our devices (done later in bond_net_exit_batch_rtnl()) + * before we remove our devices (done later in bond_net_exit_rtnl()) */ static void __net_exit bond_net_pre_exit(struct net *net) { @@ -6572,25 +6572,20 @@ static void __net_exit bond_net_pre_exit(struct net *net) bond_destroy_sysfs(bn); } -static void __net_exit bond_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_kill_list) +static void __net_exit bond_net_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) { - struct bond_net *bn; - struct net *net; + struct bond_net *bn = net_generic(net, bond_net_id); + struct bonding *bond, *tmp_bond; /* Kill off any bonds created after unregistering bond rtnl ops */ - list_for_each_entry(net, net_list, exit_list) { - struct bonding *bond, *tmp_bond; - - bn = net_generic(net, bond_net_id); - list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list) - unregister_netdevice_queue(bond->dev, dev_kill_list); - } + list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list) + unregister_netdevice_queue(bond->dev, dev_kill_list); } /* According to commit 23fa5c2caae0 ("bonding: destroy proc directory * only after all bonds are gone") bond_destroy_proc_dir() is called - * after bond_net_exit_batch_rtnl() has completed. + * after bond_net_exit_rtnl() has completed. */ static void __net_exit bond_net_exit_batch(struct list_head *net_list) { @@ -6606,7 +6601,7 @@ static void __net_exit bond_net_exit_batch(struct list_head *net_list) static struct pernet_operations bond_net_ops = { .init = bond_net_init, .pre_exit = bond_net_pre_exit, - .exit_batch_rtnl = bond_net_exit_batch_rtnl, + .exit_rtnl = bond_net_exit_rtnl, .exit_batch = bond_net_exit_batch, .id = &bond_net_id, .size = sizeof(struct bond_net), diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 89f0796894af6..b45052497f8a2 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -3999,6 +3999,89 @@ static int ksz_ets_band_to_queue(struct tc_ets_qopt_offload_replace_params *p, return p->bands - 1 - band; } +/** + * ksz88x3_tc_ets_add - Configure ETS (Enhanced Transmission Selection) + * for a port on KSZ88x3 switch + * @dev: Pointer to the KSZ switch device structure + * @port: Port number to configure + * @p: Pointer to offload replace parameters describing ETS bands and mapping + * + * The KSZ88x3 supports two scheduling modes: Strict Priority and + * Weighted Fair Queuing (WFQ). Both modes have fixed behavior: + * - No configurable queue-to-priority mapping + * - No weight adjustment in WFQ mode + * + * This function configures the switch to use strict priority mode by + * clearing the WFQ enable bit for all queues associated with ETS bands. + * If strict priority is not explicitly requested, the switch will default + * to WFQ mode. + * + * Return: 0 on success, or a negative error code on failure + */ +static int ksz88x3_tc_ets_add(struct ksz_device *dev, int port, + struct tc_ets_qopt_offload_replace_params *p) +{ + int ret, band; + + /* Only strict priority mode is supported for now. + * WFQ is implicitly enabled when strict mode is disabled. + */ + for (band = 0; band < p->bands; band++) { + int queue = ksz_ets_band_to_queue(p, band); + u8 reg; + + /* Calculate TXQ Split Control register address for this + * port/queue + */ + reg = KSZ8873_TXQ_SPLIT_CTRL_REG(port, queue); + + /* Clear WFQ enable bit to select strict priority scheduling */ + ret = ksz_rmw8(dev, reg, KSZ8873_TXQ_WFQ_ENABLE, 0); + if (ret) + return ret; + } + + return 0; +} + +/** + * ksz88x3_tc_ets_del - Reset ETS (Enhanced Transmission Selection) config + * for a port on KSZ88x3 switch + * @dev: Pointer to the KSZ switch device structure + * @port: Port number to reset + * + * The KSZ88x3 supports only fixed scheduling modes: Strict Priority or + * Weighted Fair Queuing (WFQ), with no reconfiguration of weights or + * queue mapping. This function resets the port’s scheduling mode to + * the default, which is WFQ, by enabling the WFQ bit for all queues. + * + * Return: 0 on success, or a negative error code on failure + */ +static int ksz88x3_tc_ets_del(struct ksz_device *dev, int port) +{ + int ret, queue; + + /* Iterate over all transmit queues for this port */ + for (queue = 0; queue < dev->info->num_tx_queues; queue++) { + u8 reg; + + /* Calculate TXQ Split Control register address for this + * port/queue + */ + reg = KSZ8873_TXQ_SPLIT_CTRL_REG(port, queue); + + /* Set WFQ enable bit to revert back to default scheduling + * mode + */ + ret = ksz_rmw8(dev, reg, KSZ8873_TXQ_WFQ_ENABLE, + KSZ8873_TXQ_WFQ_ENABLE); + if (ret) + return ret; + } + + return 0; +} + static int ksz_queue_set_strict(struct ksz_device *dev, int port, int queue) { int ret; @@ -4080,6 +4163,7 @@ static int ksz_tc_ets_del(struct ksz_device *dev, int port) for (queue = 0; queue < dev->info->num_tx_queues; queue++) { ret = ksz_queue_set_wrr(dev, port, queue, KSZ9477_DEFAULT_WRR_WEIGHT); + if (ret) return ret; } @@ -4132,7 +4216,7 @@ static int ksz_tc_setup_qdisc_ets(struct dsa_switch *ds, int port, struct ksz_device *dev = ds->priv; int ret; - if (is_ksz8(dev)) + if (is_ksz8(dev) && !ksz_is_ksz88x3(dev)) return -EOPNOTSUPP; if (qopt->parent != TC_H_ROOT) { @@ -4146,9 +4230,16 @@ static int ksz_tc_setup_qdisc_ets(struct dsa_switch *ds, int port, if (ret) return ret; - return ksz_tc_ets_add(dev, port, &qopt->replace_params); + if (ksz_is_ksz88x3(dev)) + return ksz88x3_tc_ets_add(dev, port, + &qopt->replace_params); + else + return ksz_tc_ets_add(dev, port, &qopt->replace_params); case TC_ETS_DESTROY: - return ksz_tc_ets_del(dev, port); + if (ksz_is_ksz88x3(dev)) + return ksz88x3_tc_ets_del(dev, port); + else + return ksz_tc_ets_del(dev, port); case TC_ETS_STATS: case TC_ETS_GRAFT: return -EOPNOTSUPP; diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index af17a9c030d41..dd5429ff16eee 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -836,6 +836,25 @@ static inline bool is_lan937x_tx_phy(struct ksz_device *dev, int port) #define SW_HI_SPEED_DRIVE_STRENGTH_S 4 #define SW_LO_SPEED_DRIVE_STRENGTH_S 0 +/* TXQ Split Control Register for per-port, per-queue configuration. + * Register 0xAF is TXQ Split for Q3 on Port 1. + * Register offset formula: 0xAF + (port * 4) + (3 - queue) + * where: port = 0..2, queue = 0..3 + */ +#define KSZ8873_TXQ_SPLIT_CTRL_REG(port, queue) \ + (0xAF + ((port) * 4) + (3 - (queue))) + +/* Bit 7 selects between: + * 0 = Strict priority mode (highest-priority queue first) + * 1 = Weighted Fair Queuing (WFQ) mode: + * Queue weights: Q3:Q2:Q1:Q0 = 8:4:2:1 + * If any queues are empty, weight is redistributed. + * + * Note: This is referred to as "Weighted Fair Queuing" (WFQ) in KSZ8863/8873 + * documentation, and as "Weighted Round Robin" (WRR) in KSZ9477 family docs. + */ +#define KSZ8873_TXQ_WFQ_ENABLE BIT(7) + #define KSZ9477_REG_PORT_OUT_RATE_0 0x0420 #define KSZ9477_OUT_RATE_NO_LIMIT 0 diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index d70399bce5b9e..0a33ca1dd7ca1 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -32,47 +32,15 @@ static struct mt753x_pcs *pcs_to_mt753x_pcs(struct phylink_pcs *pcs) /* String, offset, and register size in bytes if different from 4 bytes */ static const struct mt7530_mib_desc mt7530_mib[] = { - MIB_DESC(1, 0x00, "TxDrop"), - MIB_DESC(1, 0x04, "TxCrcErr"), - MIB_DESC(1, 0x08, "TxUnicast"), - MIB_DESC(1, 0x0c, "TxMulticast"), - MIB_DESC(1, 0x10, "TxBroadcast"), - MIB_DESC(1, 0x14, "TxCollision"), - MIB_DESC(1, 0x18, "TxSingleCollision"), - MIB_DESC(1, 0x1c, "TxMultipleCollision"), - MIB_DESC(1, 0x20, "TxDeferred"), - MIB_DESC(1, 0x24, "TxLateCollision"), - MIB_DESC(1, 0x28, "TxExcessiveCollistion"), - MIB_DESC(1, 0x2c, "TxPause"), - MIB_DESC(1, 0x30, "TxPktSz64"), - MIB_DESC(1, 0x34, "TxPktSz65To127"), - MIB_DESC(1, 0x38, "TxPktSz128To255"), - MIB_DESC(1, 0x3c, "TxPktSz256To511"), - MIB_DESC(1, 0x40, "TxPktSz512To1023"), - MIB_DESC(1, 0x44, "Tx1024ToMax"), - MIB_DESC(2, 0x48, "TxBytes"), - MIB_DESC(1, 0x60, "RxDrop"), - MIB_DESC(1, 0x64, "RxFiltering"), - MIB_DESC(1, 0x68, "RxUnicast"), - MIB_DESC(1, 0x6c, "RxMulticast"), - MIB_DESC(1, 0x70, "RxBroadcast"), - MIB_DESC(1, 0x74, "RxAlignErr"), - MIB_DESC(1, 0x78, "RxCrcErr"), - MIB_DESC(1, 0x7c, "RxUnderSizeErr"), - MIB_DESC(1, 0x80, "RxFragErr"), - MIB_DESC(1, 0x84, "RxOverSzErr"), - MIB_DESC(1, 0x88, "RxJabberErr"), - MIB_DESC(1, 0x8c, "RxPause"), - MIB_DESC(1, 0x90, "RxPktSz64"), - MIB_DESC(1, 0x94, "RxPktSz65To127"), - MIB_DESC(1, 0x98, "RxPktSz128To255"), - MIB_DESC(1, 0x9c, "RxPktSz256To511"), - MIB_DESC(1, 0xa0, "RxPktSz512To1023"), - MIB_DESC(1, 0xa4, "RxPktSz1024ToMax"), - MIB_DESC(2, 0xa8, "RxBytes"), - MIB_DESC(1, 0xb0, "RxCtrlDrop"), - MIB_DESC(1, 0xb4, "RxIngressDrop"), - MIB_DESC(1, 0xb8, "RxArlDrop"), + MIB_DESC(1, MT7530_PORT_MIB_TX_DROP, "TxDrop"), + MIB_DESC(1, MT7530_PORT_MIB_TX_CRC_ERR, "TxCrcErr"), + MIB_DESC(1, MT7530_PORT_MIB_TX_COLLISION, "TxCollision"), + MIB_DESC(1, MT7530_PORT_MIB_RX_DROP, "RxDrop"), + MIB_DESC(1, MT7530_PORT_MIB_RX_FILTERING, "RxFiltering"), + MIB_DESC(1, MT7530_PORT_MIB_RX_CRC_ERR, "RxCrcErr"), + MIB_DESC(1, MT7530_PORT_MIB_RX_CTRL_DROP, "RxCtrlDrop"), + MIB_DESC(1, MT7530_PORT_MIB_RX_INGRESS_DROP, "RxIngressDrop"), + MIB_DESC(1, MT7530_PORT_MIB_RX_ARL_DROP, "RxArlDrop"), }; static void @@ -789,24 +757,34 @@ mt7530_get_strings(struct dsa_switch *ds, int port, u32 stringset, ethtool_puts(&data, mt7530_mib[i].name); } +static void +mt7530_read_port_stats(struct mt7530_priv *priv, int port, + u32 offset, u8 size, uint64_t *data) +{ + u32 val, reg = MT7530_PORT_MIB_COUNTER(port) + offset; + + val = mt7530_read(priv, reg); + *data = val; + + if (size == 2) { + val = mt7530_read(priv, reg + 4); + *data |= (u64)val << 32; + } +} + static void mt7530_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data) { struct mt7530_priv *priv = ds->priv; const struct mt7530_mib_desc *mib; - u32 reg, i; - u64 hi; + int i; for (i = 0; i < ARRAY_SIZE(mt7530_mib); i++) { mib = &mt7530_mib[i]; - reg = MT7530_PORT_MIB_COUNTER(port) + mib->offset; - data[i] = mt7530_read(priv, reg); - if (mib->size == 2) { - hi = mt7530_read(priv, reg + 4); - data[i] |= hi << 32; - } + mt7530_read_port_stats(priv, port, mib->offset, mib->size, + data + i); } } @@ -819,6 +797,172 @@ mt7530_get_sset_count(struct dsa_switch *ds, int port, int sset) return ARRAY_SIZE(mt7530_mib); } +static void mt7530_get_eth_mac_stats(struct dsa_switch *ds, int port, + struct ethtool_eth_mac_stats *mac_stats) +{ + struct mt7530_priv *priv = ds->priv; + + /* MIB counter doesn't provide a FramesTransmittedOK but instead + * provide stats for Unicast, Broadcast and Multicast frames separately. + * To simulate a global frame counter, read Unicast and addition Multicast + * and Broadcast later + */ + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_UNICAST, 1, + &mac_stats->FramesTransmittedOK); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_SINGLE_COLLISION, 1, + &mac_stats->SingleCollisionFrames); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_MULTIPLE_COLLISION, 1, + &mac_stats->MultipleCollisionFrames); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_UNICAST, 1, + &mac_stats->FramesReceivedOK); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_BYTES, 2, + &mac_stats->OctetsTransmittedOK); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_ALIGN_ERR, 1, + &mac_stats->AlignmentErrors); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_DEFERRED, 1, + &mac_stats->FramesWithDeferredXmissions); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_LATE_COLLISION, 1, + &mac_stats->LateCollisions); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_EXCESSIVE_COLLISION, 1, + &mac_stats->FramesAbortedDueToXSColls); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_BYTES, 2, + &mac_stats->OctetsReceivedOK); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_MULTICAST, 1, + &mac_stats->MulticastFramesXmittedOK); + mac_stats->FramesTransmittedOK += mac_stats->MulticastFramesXmittedOK; + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_BROADCAST, 1, + &mac_stats->BroadcastFramesXmittedOK); + mac_stats->FramesTransmittedOK += mac_stats->BroadcastFramesXmittedOK; + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_MULTICAST, 1, + &mac_stats->MulticastFramesReceivedOK); + mac_stats->FramesReceivedOK += mac_stats->MulticastFramesReceivedOK; + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_BROADCAST, 1, + &mac_stats->BroadcastFramesReceivedOK); + mac_stats->FramesReceivedOK += mac_stats->BroadcastFramesReceivedOK; +} + +static const struct ethtool_rmon_hist_range mt7530_rmon_ranges[] = { + { 0, 64 }, + { 65, 127 }, + { 128, 255 }, + { 256, 511 }, + { 512, 1023 }, + { 1024, MT7530_MAX_MTU }, + {} +}; + +static void mt7530_get_rmon_stats(struct dsa_switch *ds, int port, + struct ethtool_rmon_stats *rmon_stats, + const struct ethtool_rmon_hist_range **ranges) +{ + struct mt7530_priv *priv = ds->priv; + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_UNDER_SIZE_ERR, 1, + &rmon_stats->undersize_pkts); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_OVER_SZ_ERR, 1, + &rmon_stats->oversize_pkts); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_FRAG_ERR, 1, + &rmon_stats->fragments); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_JABBER_ERR, 1, + &rmon_stats->jabbers); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_64, 1, + &rmon_stats->hist[0]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_65_TO_127, 1, + &rmon_stats->hist[1]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_128_TO_255, 1, + &rmon_stats->hist[2]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_256_TO_511, 1, + &rmon_stats->hist[3]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_512_TO_1023, 1, + &rmon_stats->hist[4]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_1024_TO_MAX, 1, + &rmon_stats->hist[5]); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_64, 1, + &rmon_stats->hist_tx[0]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_65_TO_127, 1, + &rmon_stats->hist_tx[1]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_128_TO_255, 1, + &rmon_stats->hist_tx[2]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_256_TO_511, 1, + &rmon_stats->hist_tx[3]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_512_TO_1023, 1, + &rmon_stats->hist_tx[4]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_1024_TO_MAX, 1, + &rmon_stats->hist_tx[5]); + + *ranges = mt7530_rmon_ranges; +} + +static void mt7530_get_stats64(struct dsa_switch *ds, int port, + struct rtnl_link_stats64 *storage) +{ + struct mt7530_priv *priv = ds->priv; + uint64_t data; + + /* MIB counter doesn't provide a FramesTransmittedOK but instead + * provide stats for Unicast, Broadcast and Multicast frames separately. + * To simulate a global frame counter, read Unicast and addition Multicast + * and Broadcast later + */ + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_UNICAST, 1, + &storage->rx_packets); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_MULTICAST, 1, + &storage->multicast); + storage->rx_packets += storage->multicast; + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_BROADCAST, 1, + &data); + storage->rx_packets += data; + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_UNICAST, 1, + &storage->tx_packets); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_MULTICAST, 1, + &data); + storage->tx_packets += data; + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_BROADCAST, 1, + &data); + storage->tx_packets += data; + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_BYTES, 2, + &storage->rx_bytes); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_BYTES, 2, + &storage->tx_bytes); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_DROP, 1, + &storage->rx_dropped); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_DROP, 1, + &storage->tx_dropped); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_CRC_ERR, 1, + &storage->rx_crc_errors); +} + +static void mt7530_get_eth_ctrl_stats(struct dsa_switch *ds, int port, + struct ethtool_eth_ctrl_stats *ctrl_stats) +{ + struct mt7530_priv *priv = ds->priv; + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PAUSE, 1, + &ctrl_stats->MACControlFramesTransmitted); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PAUSE, 1, + &ctrl_stats->MACControlFramesReceived); +} + static int mt7530_set_ageing_time(struct dsa_switch *ds, unsigned int msecs) { @@ -3105,6 +3249,10 @@ const struct dsa_switch_ops mt7530_switch_ops = { .get_strings = mt7530_get_strings, .get_ethtool_stats = mt7530_get_ethtool_stats, .get_sset_count = mt7530_get_sset_count, + .get_eth_mac_stats = mt7530_get_eth_mac_stats, + .get_rmon_stats = mt7530_get_rmon_stats, + .get_eth_ctrl_stats = mt7530_get_eth_ctrl_stats, + .get_stats64 = mt7530_get_stats64, .set_ageing_time = mt7530_set_ageing_time, .port_enable = mt7530_port_enable, .port_disable = mt7530_port_disable, diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index c3ea403d7acfd..d4b838a055ad9 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -423,6 +423,48 @@ enum mt7530_vlan_port_acc_frm { /* Register for MIB */ #define MT7530_PORT_MIB_COUNTER(x) (0x4000 + (x) * 0x100) +/* Each define is an offset of MT7530_PORT_MIB_COUNTER */ +#define MT7530_PORT_MIB_TX_DROP 0x00 +#define MT7530_PORT_MIB_TX_CRC_ERR 0x04 +#define MT7530_PORT_MIB_TX_UNICAST 0x08 +#define MT7530_PORT_MIB_TX_MULTICAST 0x0c +#define MT7530_PORT_MIB_TX_BROADCAST 0x10 +#define MT7530_PORT_MIB_TX_COLLISION 0x14 +#define MT7530_PORT_MIB_TX_SINGLE_COLLISION 0x18 +#define MT7530_PORT_MIB_TX_MULTIPLE_COLLISION 0x1c +#define MT7530_PORT_MIB_TX_DEFERRED 0x20 +#define MT7530_PORT_MIB_TX_LATE_COLLISION 0x24 +#define MT7530_PORT_MIB_TX_EXCESSIVE_COLLISION 0x28 +#define MT7530_PORT_MIB_TX_PAUSE 0x2c +#define MT7530_PORT_MIB_TX_PKT_SZ_64 0x30 +#define MT7530_PORT_MIB_TX_PKT_SZ_65_TO_127 0x34 +#define MT7530_PORT_MIB_TX_PKT_SZ_128_TO_255 0x38 +#define MT7530_PORT_MIB_TX_PKT_SZ_256_TO_511 0x3c +#define MT7530_PORT_MIB_TX_PKT_SZ_512_TO_1023 0x40 +#define MT7530_PORT_MIB_TX_PKT_SZ_1024_TO_MAX 0x44 +#define MT7530_PORT_MIB_TX_BYTES 0x48 /* 64 bytes */ +#define MT7530_PORT_MIB_RX_DROP 0x60 +#define MT7530_PORT_MIB_RX_FILTERING 0x64 +#define MT7530_PORT_MIB_RX_UNICAST 0x68 +#define MT7530_PORT_MIB_RX_MULTICAST 0x6c +#define MT7530_PORT_MIB_RX_BROADCAST 0x70 +#define MT7530_PORT_MIB_RX_ALIGN_ERR 0x74 +#define MT7530_PORT_MIB_RX_CRC_ERR 0x78 +#define MT7530_PORT_MIB_RX_UNDER_SIZE_ERR 0x7c +#define MT7530_PORT_MIB_RX_FRAG_ERR 0x80 +#define MT7530_PORT_MIB_RX_OVER_SZ_ERR 0x84 +#define MT7530_PORT_MIB_RX_JABBER_ERR 0x88 +#define MT7530_PORT_MIB_RX_PAUSE 0x8c +#define MT7530_PORT_MIB_RX_PKT_SZ_64 0x90 +#define MT7530_PORT_MIB_RX_PKT_SZ_65_TO_127 0x94 +#define MT7530_PORT_MIB_RX_PKT_SZ_128_TO_255 0x98 +#define MT7530_PORT_MIB_RX_PKT_SZ_256_TO_511 0x9c +#define MT7530_PORT_MIB_RX_PKT_SZ_512_TO_1023 0xa0 +#define MT7530_PORT_MIB_RX_PKT_SZ_1024_TO_MAX 0xa4 +#define MT7530_PORT_MIB_RX_BYTES 0xa8 /* 64 bytes */ +#define MT7530_PORT_MIB_RX_CTRL_DROP 0xb0 +#define MT7530_PORT_MIB_RX_INGRESS_DROP 0xb4 +#define MT7530_PORT_MIB_RX_ARL_DROP 0xb8 #define MT7530_MIB_CCR 0x4fe0 #define CCR_MIB_ENABLE BIT(31) #define CCR_RX_OCT_CNT_GOOD BIT(7) diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c index aed4a4b07f34b..1d3b2c94c53ee 100644 --- a/drivers/net/dsa/mv88e6xxx/ptp.c +++ b/drivers/net/dsa/mv88e6xxx/ptp.c @@ -332,13 +332,6 @@ static int mv88e6352_ptp_enable_extts(struct mv88e6xxx_chip *chip, int pin; int err; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests to enable time stamping on both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -566,6 +559,10 @@ int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip) chip->ptp_clock_info.verify = ptp_ops->ptp_verify; chip->ptp_clock_info.do_aux_work = mv88e6xxx_hwtstamp_work; + chip->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + if (ptp_ops->set_ptp_cpu_port) { struct dsa_port *dp; int upstream = 0; diff --git a/drivers/net/dsa/rzn1_a5psw.c b/drivers/net/dsa/rzn1_a5psw.c index 31ea8130a495f..df7466d4fe8f0 100644 --- a/drivers/net/dsa/rzn1_a5psw.c +++ b/drivers/net/dsa/rzn1_a5psw.c @@ -337,8 +337,9 @@ static void a5psw_port_rx_block_set(struct a5psw *a5psw, int port, bool block) static void a5psw_flooding_set_resolution(struct a5psw *a5psw, int port, bool set) { - u8 offsets[] = {A5PSW_UCAST_DEF_MASK, A5PSW_BCAST_DEF_MASK, - A5PSW_MCAST_DEF_MASK}; + static const u8 offsets[] = { + A5PSW_UCAST_DEF_MASK, A5PSW_BCAST_DEF_MASK, A5PSW_MCAST_DEF_MASK + }; int i; for (i = 0; i < ARRAY_SIZE(offsets); i++) diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 198e787e8560c..3abc64aec411a 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -737,10 +737,6 @@ static int sja1105_per_out_enable(struct sja1105_private *priv, if (perout->index != 0) return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ - if (perout->flags) - return -EOPNOTSUPP; - mutex_lock(&ptp_data->lock); rc = sja1105_change_ptp_clk_pin_func(priv, PTP_PF_PEROUT); @@ -820,13 +816,6 @@ static int sja1105_extts_enable(struct sja1105_private *priv, if (extts->index != 0) return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ - if (extts->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* We can only enable time stamping on both edges, sadly. */ if ((extts->flags & PTP_STRICT_FLAGS) && (extts->flags & PTP_ENABLE_FEATURE) && @@ -912,6 +901,9 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds) .n_pins = 1, .n_ext_ts = 1, .n_per_out = 1, + .supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS, }; /* Only used on SJA1105 */ diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index d748dc6de9236..c773b5ea9c051 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -527,6 +527,25 @@ static int airoha_fe_init(struct airoha_eth *eth) /* disable IFC by default */ airoha_fe_clear(eth, REG_FE_CSR_IFC_CFG, FE_IFC_EN_MASK); + airoha_fe_wr(eth, REG_PPE_DFT_CPORT0(0), + FIELD_PREP(DFT_CPORT_MASK(7), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(6), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(5), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(4), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(3), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(2), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(1), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(0), FE_PSE_PORT_CDM1)); + airoha_fe_wr(eth, REG_PPE_DFT_CPORT0(1), + FIELD_PREP(DFT_CPORT_MASK(7), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(6), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(5), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(4), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(3), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(2), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(1), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(0), FE_PSE_PORT_CDM2)); + /* enable 1:N vlan action, init vlan table */ airoha_fe_set(eth, REG_MC_VLAN_EN, MC_VLAN_EN_MASK); @@ -694,7 +713,7 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget) reason = FIELD_GET(AIROHA_RXD4_PPE_CPU_REASON, msg1); if (reason == PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED) - airoha_ppe_check_skb(eth->ppe, hash); + airoha_ppe_check_skb(eth->ppe, q->skb, hash); done++; napi_gro_receive(&q->napi, q->skb); @@ -1631,7 +1650,6 @@ static void airhoha_set_gdm2_loopback(struct airoha_gdm_port *port) if (port->id == 3) { /* FIXME: handle XSI_PCE1_PORT */ - airoha_fe_wr(eth, REG_PPE_DFT_CPORT0(0), 0x5500); airoha_fe_rmw(eth, REG_FE_WAN_PORT, WAN1_EN_MASK | WAN1_MASK | WAN0_MASK, FIELD_PREP(WAN0_MASK, HSGMII_LAN_PCIE0_SRCPORT)); @@ -2109,6 +2127,125 @@ static int airoha_tc_setup_qdisc_ets(struct airoha_gdm_port *port, } } +static int airoha_qdma_get_rl_param(struct airoha_qdma *qdma, int queue_id, + u32 addr, enum trtcm_param_type param, + u32 *val_low, u32 *val_high) +{ + u32 idx = QDMA_METER_IDX(queue_id), group = QDMA_METER_GROUP(queue_id); + u32 val, config = FIELD_PREP(RATE_LIMIT_PARAM_TYPE_MASK, param) | + FIELD_PREP(RATE_LIMIT_METER_GROUP_MASK, group) | + FIELD_PREP(RATE_LIMIT_PARAM_INDEX_MASK, idx); + + airoha_qdma_wr(qdma, REG_TRTCM_CFG_PARAM(addr), config); + if (read_poll_timeout(airoha_qdma_rr, val, + val & RATE_LIMIT_PARAM_RW_DONE_MASK, + USEC_PER_MSEC, 10 * USEC_PER_MSEC, true, qdma, + REG_TRTCM_CFG_PARAM(addr))) + return -ETIMEDOUT; + + *val_low = airoha_qdma_rr(qdma, REG_TRTCM_DATA_LOW(addr)); + if (val_high) + *val_high = airoha_qdma_rr(qdma, REG_TRTCM_DATA_HIGH(addr)); + + return 0; +} + +static int airoha_qdma_set_rl_param(struct airoha_qdma *qdma, int queue_id, + u32 addr, enum trtcm_param_type param, + u32 val) +{ + u32 idx = QDMA_METER_IDX(queue_id), group = QDMA_METER_GROUP(queue_id); + u32 config = RATE_LIMIT_PARAM_RW_MASK | + FIELD_PREP(RATE_LIMIT_PARAM_TYPE_MASK, param) | + FIELD_PREP(RATE_LIMIT_METER_GROUP_MASK, group) | + FIELD_PREP(RATE_LIMIT_PARAM_INDEX_MASK, idx); + + airoha_qdma_wr(qdma, REG_TRTCM_DATA_LOW(addr), val); + airoha_qdma_wr(qdma, REG_TRTCM_CFG_PARAM(addr), config); + + return read_poll_timeout(airoha_qdma_rr, val, + val & RATE_LIMIT_PARAM_RW_DONE_MASK, + USEC_PER_MSEC, 10 * USEC_PER_MSEC, true, + qdma, REG_TRTCM_CFG_PARAM(addr)); +} + +static int airoha_qdma_set_rl_config(struct airoha_qdma *qdma, int queue_id, + u32 addr, bool enable, u32 enable_mask) +{ + u32 val; + int err; + + err = airoha_qdma_get_rl_param(qdma, queue_id, addr, TRTCM_MISC_MODE, + &val, NULL); + if (err) + return err; + + val = enable ? val | enable_mask : val & ~enable_mask; + + return airoha_qdma_set_rl_param(qdma, queue_id, addr, TRTCM_MISC_MODE, + val); +} + +static int airoha_qdma_set_rl_token_bucket(struct airoha_qdma *qdma, + int queue_id, u32 rate_val, + u32 bucket_size) +{ + u32 val, config, tick, unit, rate, rate_frac; + int err; + + err = airoha_qdma_get_rl_param(qdma, queue_id, REG_INGRESS_TRTCM_CFG, + TRTCM_MISC_MODE, &config, NULL); + if (err) + return err; + + val = airoha_qdma_rr(qdma, REG_INGRESS_TRTCM_CFG); + tick = FIELD_GET(INGRESS_FAST_TICK_MASK, val); + if (config & TRTCM_TICK_SEL) + tick *= FIELD_GET(INGRESS_SLOW_TICK_RATIO_MASK, val); + if (!tick) + return -EINVAL; + + unit = (config & TRTCM_PKT_MODE) ? 1000000 / tick : 8000 / tick; + if (!unit) + return -EINVAL; + + rate = rate_val / unit; + rate_frac = rate_val % unit; + rate_frac = FIELD_PREP(TRTCM_TOKEN_RATE_MASK, rate_frac) / unit; + rate = FIELD_PREP(TRTCM_TOKEN_RATE_MASK, rate) | + FIELD_PREP(TRTCM_TOKEN_RATE_FRACTION_MASK, rate_frac); + + err = airoha_qdma_set_rl_param(qdma, queue_id, REG_INGRESS_TRTCM_CFG, + TRTCM_TOKEN_RATE_MODE, rate); + if (err) + return err; + + val = bucket_size; + if (!(config & TRTCM_PKT_MODE)) + val = max_t(u32, val, MIN_TOKEN_SIZE); + val = min_t(u32, __fls(val), MAX_TOKEN_SIZE_OFFSET); + + return airoha_qdma_set_rl_param(qdma, queue_id, REG_INGRESS_TRTCM_CFG, + TRTCM_BUCKETSIZE_SHIFT_MODE, val); +} + +static int airoha_qdma_init_rl_config(struct airoha_qdma *qdma, int queue_id, + bool enable, enum trtcm_unit_type unit) +{ + bool tick_sel = queue_id == 0 || queue_id == 2 || queue_id == 8; + enum trtcm_param mode = TRTCM_METER_MODE; + int err; + + mode |= unit == TRTCM_PACKET_UNIT ? TRTCM_PKT_MODE : 0; + err = airoha_qdma_set_rl_config(qdma, queue_id, REG_INGRESS_TRTCM_CFG, + enable, mode); + if (err) + return err; + + return airoha_qdma_set_rl_config(qdma, queue_id, REG_INGRESS_TRTCM_CFG, + tick_sel, TRTCM_TICK_SEL); +} + static int airoha_qdma_get_trtcm_param(struct airoha_qdma *qdma, int channel, u32 addr, enum trtcm_param_type param, enum trtcm_mode_type mode, @@ -2273,10 +2410,142 @@ static int airoha_tc_htb_alloc_leaf_queue(struct airoha_gdm_port *port, return 0; } +static int airoha_qdma_set_rx_meter(struct airoha_gdm_port *port, + u32 rate, u32 bucket_size, + enum trtcm_unit_type unit_type) +{ + struct airoha_qdma *qdma = port->qdma; + int i; + + for (i = 0; i < ARRAY_SIZE(qdma->q_rx); i++) { + int err; + + if (!qdma->q_rx[i].ndesc) + continue; + + err = airoha_qdma_init_rl_config(qdma, i, !!rate, unit_type); + if (err) + return err; + + err = airoha_qdma_set_rl_token_bucket(qdma, i, rate, + bucket_size); + if (err) + return err; + } + + return 0; +} + +static int airoha_tc_matchall_act_validate(struct tc_cls_matchall_offload *f) +{ + const struct flow_action *actions = &f->rule->action; + const struct flow_action_entry *act; + + if (!flow_action_has_entries(actions)) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "filter run with no actions"); + return -EINVAL; + } + + if (!flow_offload_has_one_action(actions)) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "only once action per filter is supported"); + return -EOPNOTSUPP; + } + + act = &actions->entries[0]; + if (act->id != FLOW_ACTION_POLICE) { + NL_SET_ERR_MSG_MOD(f->common.extack, "unsupported action"); + return -EOPNOTSUPP; + } + + if (act->police.exceed.act_id != FLOW_ACTION_DROP) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "invalid exceed action id"); + return -EOPNOTSUPP; + } + + if (act->police.notexceed.act_id != FLOW_ACTION_ACCEPT) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "invalid notexceed action id"); + return -EOPNOTSUPP; + } + + if (act->police.notexceed.act_id == FLOW_ACTION_ACCEPT && + !flow_action_is_last_entry(actions, act)) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "action accept must be last"); + return -EOPNOTSUPP; + } + + if (act->police.peakrate_bytes_ps || act->police.avrate || + act->police.overhead || act->police.mtu) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "peakrate/avrate/overhead/mtu unsupported"); + return -EOPNOTSUPP; + } + + return 0; +} + +static int airoha_dev_tc_matchall(struct net_device *dev, + struct tc_cls_matchall_offload *f) +{ + enum trtcm_unit_type unit_type = TRTCM_BYTE_UNIT; + struct airoha_gdm_port *port = netdev_priv(dev); + u32 rate = 0, bucket_size = 0; + + switch (f->command) { + case TC_CLSMATCHALL_REPLACE: { + const struct flow_action_entry *act; + int err; + + err = airoha_tc_matchall_act_validate(f); + if (err) + return err; + + act = &f->rule->action.entries[0]; + if (act->police.rate_pkt_ps) { + rate = act->police.rate_pkt_ps; + bucket_size = act->police.burst_pkt; + unit_type = TRTCM_PACKET_UNIT; + } else { + rate = div_u64(act->police.rate_bytes_ps, 1000); + rate = rate << 3; /* Kbps */ + bucket_size = act->police.burst; + } + fallthrough; + } + case TC_CLSMATCHALL_DESTROY: + return airoha_qdma_set_rx_meter(port, rate, bucket_size, + unit_type); + default: + return -EOPNOTSUPP; + } +} + +static int airoha_dev_setup_tc_block_cb(enum tc_setup_type type, + void *type_data, void *cb_priv) +{ + struct net_device *dev = cb_priv; + + if (!tc_can_offload(dev)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return airoha_ppe_setup_tc_block_cb(dev, type_data); + case TC_SETUP_CLSMATCHALL: + return airoha_dev_tc_matchall(dev, type_data); + default: + return -EOPNOTSUPP; + } +} + static int airoha_dev_setup_tc_block(struct airoha_gdm_port *port, struct flow_block_offload *f) { - flow_setup_cb_t *cb = airoha_ppe_setup_tc_block_cb; + flow_setup_cb_t *cb = airoha_dev_setup_tc_block_cb; static LIST_HEAD(block_cb_list); struct flow_block_cb *block_cb; diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index ec8908f904c61..da5371bcd1473 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -127,6 +127,11 @@ enum tx_sched_mode { TC_SCH_WRR2, }; +enum trtcm_unit_type { + TRTCM_BYTE_UNIT, + TRTCM_PACKET_UNIT, +}; + enum trtcm_param_type { TRTCM_MISC_MODE, /* meter_en, pps_mode, tick_sel */ TRTCM_TOKEN_RATE_MODE, @@ -422,12 +427,27 @@ struct airoha_flow_data { } pppoe; }; +enum airoha_flow_entry_type { + FLOW_TYPE_L4, + FLOW_TYPE_L2, + FLOW_TYPE_L2_SUBFLOW, +}; + struct airoha_flow_table_entry { - struct hlist_node list; + union { + struct hlist_node list; /* PPE L3 flow entry */ + struct { + struct rhash_head l2_node; /* L2 flow entry */ + struct hlist_head l2_flows; /* PPE L2 subflows list */ + }; + }; struct airoha_foe_entry data; + struct hlist_node l2_subflow_node; /* PPE L2 subflow entry */ u32 hash; + enum airoha_flow_entry_type type; + struct rhash_head node; unsigned long cookie; }; @@ -480,6 +500,8 @@ struct airoha_ppe { void *foe; dma_addr_t foe_dma; + struct rhashtable l2_flows; + struct hlist_head *foe_flow; u16 foe_check_time[PPE_NUM_ENTRIES]; @@ -535,9 +557,9 @@ u32 airoha_rmw(void __iomem *base, u32 offset, u32 mask, u32 val); bool airoha_is_valid_gdm_port(struct airoha_eth *eth, struct airoha_gdm_port *port); -void airoha_ppe_check_skb(struct airoha_ppe *ppe, u16 hash); -int airoha_ppe_setup_tc_block_cb(enum tc_setup_type type, void *type_data, - void *cb_priv); +void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, + u16 hash); +int airoha_ppe_setup_tc_block_cb(struct net_device *dev, void *type_data); int airoha_ppe_init(struct airoha_eth *eth); void airoha_ppe_deinit(struct airoha_eth *eth); struct airoha_foe_entry *airoha_ppe_foe_get_entry(struct airoha_ppe *ppe, diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index f10dab935cab6..6e9787c2843bc 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -24,6 +24,13 @@ static const struct rhashtable_params airoha_flow_table_params = { .automatic_shrinking = true, }; +static const struct rhashtable_params airoha_l2_flow_table_params = { + .head_offset = offsetof(struct airoha_flow_table_entry, l2_node), + .key_offset = offsetof(struct airoha_flow_table_entry, data.bridge), + .key_len = 2 * ETH_ALEN, + .automatic_shrinking = true, +}; + static bool airoha_ppe2_is_enabled(struct airoha_eth *eth) { return airoha_fe_rr(eth, REG_PPE_GLO_CFG(1)) & PPE_GLO_CFG_EN_MASK; @@ -197,6 +204,15 @@ static int airoha_get_dsa_port(struct net_device **dev) #endif } +static void airoha_ppe_foe_set_bridge_addrs(struct airoha_foe_bridge *br, + struct ethhdr *eh) +{ + br->dest_mac_hi = get_unaligned_be32(eh->h_dest); + br->dest_mac_lo = get_unaligned_be16(eh->h_dest + 4); + br->src_mac_hi = get_unaligned_be16(eh->h_source); + br->src_mac_lo = get_unaligned_be32(eh->h_source + 2); +} + static int airoha_ppe_foe_entry_prepare(struct airoha_eth *eth, struct airoha_foe_entry *hwe, struct net_device *dev, int type, @@ -247,13 +263,7 @@ static int airoha_ppe_foe_entry_prepare(struct airoha_eth *eth, qdata = FIELD_PREP(AIROHA_FOE_SHAPER_ID, 0x7f); if (type == PPE_PKT_TYPE_BRIDGE) { - hwe->bridge.dest_mac_hi = get_unaligned_be32(data->eth.h_dest); - hwe->bridge.dest_mac_lo = - get_unaligned_be16(data->eth.h_dest + 4); - hwe->bridge.src_mac_hi = - get_unaligned_be16(data->eth.h_source); - hwe->bridge.src_mac_lo = - get_unaligned_be32(data->eth.h_source + 2); + airoha_ppe_foe_set_bridge_addrs(&hwe->bridge, &data->eth); hwe->bridge.data = qdata; hwe->bridge.ib2 = val; l2 = &hwe->bridge.l2.common; @@ -378,6 +388,19 @@ static u32 airoha_ppe_foe_get_entry_hash(struct airoha_foe_entry *hwe) hv3 = hwe->ipv6.src_ip[1] ^ hwe->ipv6.dest_ip[1]; hv3 ^= hwe->ipv6.src_ip[0]; break; + case PPE_PKT_TYPE_BRIDGE: { + struct airoha_foe_mac_info *l2 = &hwe->bridge.l2; + + hv1 = l2->common.src_mac_hi & 0xffff; + hv1 = hv1 << 16 | l2->src_mac_lo; + + hv2 = l2->common.dest_mac_lo; + hv2 = hv2 << 16; + hv2 = hv2 | ((l2->common.src_mac_hi & 0xffff0000) >> 16); + + hv3 = l2->common.dest_mac_hi; + break; + } case PPE_PKT_TYPE_IPV4_DSLITE: case PPE_PKT_TYPE_IPV6_6RD: default: @@ -476,10 +499,102 @@ static int airoha_ppe_foe_commit_entry(struct airoha_ppe *ppe, return 0; } -static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, u32 hash) +static void airoha_ppe_foe_remove_flow(struct airoha_ppe *ppe, + struct airoha_flow_table_entry *e) +{ + lockdep_assert_held(&ppe_lock); + + hlist_del_init(&e->list); + if (e->hash != 0xffff) { + e->data.ib1 &= ~AIROHA_FOE_IB1_BIND_STATE; + e->data.ib1 |= FIELD_PREP(AIROHA_FOE_IB1_BIND_STATE, + AIROHA_FOE_STATE_INVALID); + airoha_ppe_foe_commit_entry(ppe, &e->data, e->hash); + e->hash = 0xffff; + } + if (e->type == FLOW_TYPE_L2_SUBFLOW) { + hlist_del_init(&e->l2_subflow_node); + kfree(e); + } +} + +static void airoha_ppe_foe_remove_l2_flow(struct airoha_ppe *ppe, + struct airoha_flow_table_entry *e) +{ + struct hlist_head *head = &e->l2_flows; + struct hlist_node *n; + + lockdep_assert_held(&ppe_lock); + + rhashtable_remove_fast(&ppe->l2_flows, &e->l2_node, + airoha_l2_flow_table_params); + hlist_for_each_entry_safe(e, n, head, l2_subflow_node) + airoha_ppe_foe_remove_flow(ppe, e); +} + +static void airoha_ppe_foe_flow_remove_entry(struct airoha_ppe *ppe, + struct airoha_flow_table_entry *e) +{ + spin_lock_bh(&ppe_lock); + + if (e->type == FLOW_TYPE_L2) + airoha_ppe_foe_remove_l2_flow(ppe, e); + else + airoha_ppe_foe_remove_flow(ppe, e); + + spin_unlock_bh(&ppe_lock); +} + +static int +airoha_ppe_foe_commit_subflow_entry(struct airoha_ppe *ppe, + struct airoha_flow_table_entry *e, + u32 hash) +{ + u32 mask = AIROHA_FOE_IB1_BIND_PACKET_TYPE | AIROHA_FOE_IB1_BIND_UDP; + struct airoha_foe_entry *hwe_p, hwe; + struct airoha_flow_table_entry *f; + struct airoha_foe_mac_info *l2; + int type; + + hwe_p = airoha_ppe_foe_get_entry(ppe, hash); + if (!hwe_p) + return -EINVAL; + + f = kzalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -ENOMEM; + + hlist_add_head(&f->l2_subflow_node, &e->l2_flows); + f->type = FLOW_TYPE_L2_SUBFLOW; + f->hash = hash; + + memcpy(&hwe, hwe_p, sizeof(*hwe_p)); + hwe.ib1 = (hwe.ib1 & mask) | (e->data.ib1 & ~mask); + l2 = &hwe.bridge.l2; + memcpy(l2, &e->data.bridge.l2, sizeof(*l2)); + + type = FIELD_GET(AIROHA_FOE_IB1_BIND_PACKET_TYPE, hwe.ib1); + if (type == PPE_PKT_TYPE_IPV4_HNAPT) + memcpy(&hwe.ipv4.new_tuple, &hwe.ipv4.orig_tuple, + sizeof(hwe.ipv4.new_tuple)); + else if (type >= PPE_PKT_TYPE_IPV6_ROUTE_3T && + l2->common.etype == ETH_P_IP) + l2->common.etype = ETH_P_IPV6; + + hwe.bridge.ib2 = e->data.bridge.ib2; + airoha_ppe_foe_commit_entry(ppe, &hwe, hash); + + return 0; +} + +static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, + struct sk_buff *skb, + u32 hash) { struct airoha_flow_table_entry *e; + struct airoha_foe_bridge br = {}; struct airoha_foe_entry *hwe; + bool commit_done = false; struct hlist_node *n; u32 index, state; @@ -495,21 +610,68 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, u32 hash) index = airoha_ppe_foe_get_entry_hash(hwe); hlist_for_each_entry_safe(e, n, &ppe->foe_flow[index], list) { - if (airoha_ppe_foe_compare_entry(e, hwe)) { - airoha_ppe_foe_commit_entry(ppe, &e->data, hash); - e->hash = hash; - break; + if (e->type == FLOW_TYPE_L2_SUBFLOW) { + state = FIELD_GET(AIROHA_FOE_IB1_BIND_STATE, hwe->ib1); + if (state != AIROHA_FOE_STATE_BIND) { + e->hash = 0xffff; + airoha_ppe_foe_remove_flow(ppe, e); + } + continue; } + + if (commit_done || !airoha_ppe_foe_compare_entry(e, hwe)) { + e->hash = 0xffff; + continue; + } + + airoha_ppe_foe_commit_entry(ppe, &e->data, hash); + commit_done = true; + e->hash = hash; } + + if (commit_done) + goto unlock; + + airoha_ppe_foe_set_bridge_addrs(&br, eth_hdr(skb)); + e = rhashtable_lookup_fast(&ppe->l2_flows, &br, + airoha_l2_flow_table_params); + if (e) + airoha_ppe_foe_commit_subflow_entry(ppe, e, hash); unlock: spin_unlock_bh(&ppe_lock); } +static int +airoha_ppe_foe_l2_flow_commit_entry(struct airoha_ppe *ppe, + struct airoha_flow_table_entry *e) +{ + struct airoha_flow_table_entry *prev; + + e->type = FLOW_TYPE_L2; + prev = rhashtable_lookup_get_insert_fast(&ppe->l2_flows, &e->l2_node, + airoha_l2_flow_table_params); + if (!prev) + return 0; + + if (IS_ERR(prev)) + return PTR_ERR(prev); + + return rhashtable_replace_fast(&ppe->l2_flows, &prev->l2_node, + &e->l2_node, + airoha_l2_flow_table_params); +} + static int airoha_ppe_foe_flow_commit_entry(struct airoha_ppe *ppe, struct airoha_flow_table_entry *e) { - u32 hash = airoha_ppe_foe_get_entry_hash(&e->data); + int type = FIELD_GET(AIROHA_FOE_IB1_BIND_PACKET_TYPE, e->data.ib1); + u32 hash; + + if (type == PPE_PKT_TYPE_BRIDGE) + return airoha_ppe_foe_l2_flow_commit_entry(ppe, e); + hash = airoha_ppe_foe_get_entry_hash(&e->data); + e->type = FLOW_TYPE_L4; e->hash = 0xffff; spin_lock_bh(&ppe_lock); @@ -519,23 +681,6 @@ static int airoha_ppe_foe_flow_commit_entry(struct airoha_ppe *ppe, return 0; } -static void airoha_ppe_foe_flow_remove_entry(struct airoha_ppe *ppe, - struct airoha_flow_table_entry *e) -{ - spin_lock_bh(&ppe_lock); - - hlist_del_init(&e->list); - if (e->hash != 0xffff) { - e->data.ib1 &= ~AIROHA_FOE_IB1_BIND_STATE; - e->data.ib1 |= FIELD_PREP(AIROHA_FOE_IB1_BIND_STATE, - AIROHA_FOE_STATE_INVALID); - airoha_ppe_foe_commit_entry(ppe, &e->data, e->hash); - e->hash = 0xffff; - } - - spin_unlock_bh(&ppe_lock); -} - static int airoha_ppe_flow_offload_replace(struct airoha_gdm_port *port, struct flow_cls_offload *f) { @@ -822,18 +967,13 @@ static int airoha_ppe_offload_setup(struct airoha_eth *eth) return err; } -int airoha_ppe_setup_tc_block_cb(enum tc_setup_type type, void *type_data, - void *cb_priv) +int airoha_ppe_setup_tc_block_cb(struct net_device *dev, void *type_data) { - struct flow_cls_offload *cls = type_data; - struct net_device *dev = cb_priv; struct airoha_gdm_port *port = netdev_priv(dev); + struct flow_cls_offload *cls = type_data; struct airoha_eth *eth = port->qdma->eth; int err = 0; - if (!tc_can_offload(dev) || type != TC_SETUP_CLSFLOWER) - return -EOPNOTSUPP; - mutex_lock(&flow_offload_mutex); if (!eth->npu) @@ -846,7 +986,8 @@ int airoha_ppe_setup_tc_block_cb(enum tc_setup_type type, void *type_data, return err; } -void airoha_ppe_check_skb(struct airoha_ppe *ppe, u16 hash) +void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, + u16 hash) { u16 now, diff; @@ -859,7 +1000,7 @@ void airoha_ppe_check_skb(struct airoha_ppe *ppe, u16 hash) return; ppe->foe_check_time[hash] = now; - airoha_ppe_foe_insert_entry(ppe, hash); + airoha_ppe_foe_insert_entry(ppe, skb, hash); } int airoha_ppe_init(struct airoha_eth *eth) @@ -890,9 +1031,20 @@ int airoha_ppe_init(struct airoha_eth *eth) if (err) return err; + err = rhashtable_init(&ppe->l2_flows, &airoha_l2_flow_table_params); + if (err) + goto error_flow_table_destroy; + err = airoha_ppe_debugfs_init(ppe); if (err) - rhashtable_destroy(ð->flow_table); + goto error_l2_flow_table_destroy; + + return 0; + +error_l2_flow_table_destroy: + rhashtable_destroy(&ppe->l2_flows); +error_flow_table_destroy: + rhashtable_destroy(ð->flow_table); return err; } @@ -909,6 +1061,7 @@ void airoha_ppe_deinit(struct airoha_eth *eth) } rcu_read_unlock(); + rhashtable_destroy(ð->ppe->l2_flows); rhashtable_destroy(ð->flow_table); debugfs_remove(eth->ppe->debugfs_dir); } diff --git a/drivers/net/ethernet/airoha/airoha_regs.h b/drivers/net/ethernet/airoha/airoha_regs.h index 8146cde4e8ba3..29c8f046b9910 100644 --- a/drivers/net/ethernet/airoha/airoha_regs.h +++ b/drivers/net/ethernet/airoha/airoha_regs.h @@ -283,6 +283,7 @@ #define PPE_HASH_SEED 0x12345678 #define REG_PPE_DFT_CPORT0(_n) (((_n) ? PPE2_BASE : PPE1_BASE) + 0x248) +#define DFT_CPORT_MASK(_n) GENMASK(3 + ((_n) << 2), ((_n) << 2)) #define REG_PPE_DFT_CPORT1(_n) (((_n) ? PPE2_BASE : PPE1_BASE) + 0x24c) @@ -691,6 +692,12 @@ #define REG_TRTCM_DATA_LOW(_n) ((_n) + 0x8) #define REG_TRTCM_DATA_HIGH(_n) ((_n) + 0xc) +#define RATE_LIMIT_PARAM_RW_MASK BIT(31) +#define RATE_LIMIT_PARAM_RW_DONE_MASK BIT(30) +#define RATE_LIMIT_PARAM_TYPE_MASK GENMASK(29, 28) +#define RATE_LIMIT_METER_GROUP_MASK GENMASK(27, 26) +#define RATE_LIMIT_PARAM_INDEX_MASK GENMASK(23, 16) + #define REG_TXWRR_MODE_CFG 0x1020 #define TWRR_WEIGHT_SCALE_MASK BIT(31) #define TWRR_WEIGHT_BASE_MASK BIT(3) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 897720fdf5d89..7d9ee9867a400 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1777,7 +1777,7 @@ static void ena_init_napi_in_range(struct ena_adapter *adapter, if (ENA_IS_XDP_INDEX(adapter, i)) napi_handler = ena_xdp_io_poll; - netif_napi_add(adapter->netdev, &napi->napi, napi_handler); + netif_napi_add_config(adapter->netdev, &napi->napi, napi_handler, i); if (!ENA_IS_XDP_INDEX(adapter, i)) napi->rx_ring = rx_ring; diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h index 3b70f67376331..e3d33f5b96427 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #ifndef __XGBE_COMMON_H__ diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dcb.c b/drivers/net/ethernet/amd/xgbe/xgbe-dcb.c index c68ace804e376..1474df5544fab 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-dcb.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-dcb.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c index b35808d3d07f7..d9157c4acde93 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c index 230726d7b74f6..7dba849bcd6b0 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include "xgbe.h" diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c index f393228d41c7b..b51a3666dddb2 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index d84a310dfcd40..0697eed1cb720 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c index 4431ab1c18b32..be0d2c7d08dc2 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c b/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c index 7a833894f52ad..d40011e8ddf2b 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c index 0e8698928e4d7..4ebdd123c4355 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c index 07f4f3418d018..71449edbb76de 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c index c636999a6a849..3e9f31256dce4 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c index d16eae415f721..2e6b8ffe785cb 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index 268399dfcf22f..7a4dfa4e19c73 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-platform.c b/drivers/net/ethernet/amd/xgbe/xgbe-platform.c index 4365bd62942cf..47d53e59ccf63 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-platform.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-platform.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c b/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c index 7051bd7cf6dc3..978c4dd01fa0a 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index d85386cac8d16..e5f5104342aad 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #ifndef __XGBE_H__ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c8e3468eee612..18359fabe0876 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -893,9 +893,9 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) bnapi->events &= ~BNXT_TX_CMP_EVENT; } -static bool bnxt_separate_head_pool(void) +static bool bnxt_separate_head_pool(struct bnxt_rx_ring_info *rxr) { - return PAGE_SIZE > BNXT_RX_PAGE_SIZE; + return rxr->need_head_pool || PAGE_SIZE > BNXT_RX_PAGE_SIZE; } static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, @@ -919,6 +919,20 @@ static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, return page; } +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping, + struct bnxt_rx_ring_info *rxr, + gfp_t gfp) +{ + netmem_ref netmem; + + netmem = page_pool_alloc_netmems(rxr->page_pool, gfp); + if (!netmem) + return 0; + + *mapping = page_pool_get_dma_addr_netmem(netmem); + return netmem; +} + static inline u8 *__bnxt_alloc_rx_frag(struct bnxt *bp, dma_addr_t *mapping, struct bnxt_rx_ring_info *rxr, gfp_t gfp) @@ -999,21 +1013,19 @@ static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx) return next; } -static inline int bnxt_alloc_rx_page(struct bnxt *bp, - struct bnxt_rx_ring_info *rxr, - u16 prod, gfp_t gfp) +static int bnxt_alloc_rx_netmem(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, + u16 prod, gfp_t gfp) { struct rx_bd *rxbd = &rxr->rx_agg_desc_ring[RX_AGG_RING(bp, prod)][RX_IDX(prod)]; struct bnxt_sw_rx_agg_bd *rx_agg_buf; - struct page *page; - dma_addr_t mapping; u16 sw_prod = rxr->rx_sw_agg_prod; unsigned int offset = 0; + dma_addr_t mapping; + netmem_ref netmem; - page = __bnxt_alloc_rx_page(bp, &mapping, rxr, &offset, gfp); - - if (!page) + netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, gfp); + if (!netmem) return -ENOMEM; if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap))) @@ -1023,7 +1035,7 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp, rx_agg_buf = &rxr->rx_agg_ring[sw_prod]; rxr->rx_sw_agg_prod = RING_RX_AGG(bp, NEXT_RX_AGG(sw_prod)); - rx_agg_buf->page = page; + rx_agg_buf->netmem = netmem; rx_agg_buf->offset = offset; rx_agg_buf->mapping = mapping; rxbd->rx_bd_haddr = cpu_to_le64(mapping); @@ -1067,11 +1079,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, p5_tpa = true; for (i = 0; i < agg_bufs; i++) { - u16 cons; - struct rx_agg_cmp *agg; struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf; + struct rx_agg_cmp *agg; struct rx_bd *prod_bd; - struct page *page; + netmem_ref netmem; + u16 cons; if (p5_tpa) agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, start + i); @@ -1088,11 +1100,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, cons_rx_buf = &rxr->rx_agg_ring[cons]; /* It is possible for sw_prod to be equal to cons, so - * set cons_rx_buf->page to NULL first. + * set cons_rx_buf->netmem to 0 first. */ - page = cons_rx_buf->page; - cons_rx_buf->page = NULL; - prod_rx_buf->page = page; + netmem = cons_rx_buf->netmem; + cons_rx_buf->netmem = 0; + prod_rx_buf->netmem = netmem; prod_rx_buf->offset = cons_rx_buf->offset; prod_rx_buf->mapping = cons_rx_buf->mapping; @@ -1218,29 +1230,35 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, return skb; } -static u32 __bnxt_rx_agg_pages(struct bnxt *bp, - struct bnxt_cp_ring_info *cpr, - struct skb_shared_info *shinfo, - u16 idx, u32 agg_bufs, bool tpa, - struct xdp_buff *xdp) +static u32 __bnxt_rx_agg_netmems(struct bnxt *bp, + struct bnxt_cp_ring_info *cpr, + u16 idx, u32 agg_bufs, bool tpa, + struct sk_buff *skb, + struct xdp_buff *xdp) { struct bnxt_napi *bnapi = cpr->bnapi; - struct pci_dev *pdev = bp->pdev; - struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; - u16 prod = rxr->rx_agg_prod; + struct skb_shared_info *shinfo; + struct bnxt_rx_ring_info *rxr; u32 i, total_frag_len = 0; bool p5_tpa = false; + u16 prod; + + rxr = bnapi->rx_ring; + prod = rxr->rx_agg_prod; if ((bp->flags & BNXT_FLAG_CHIP_P5_PLUS) && tpa) p5_tpa = true; + if (skb) + shinfo = skb_shinfo(skb); + else + shinfo = xdp_get_shared_info_from_buff(xdp); + for (i = 0; i < agg_bufs; i++) { - skb_frag_t *frag = &shinfo->frags[i]; - u16 cons, frag_len; - struct rx_agg_cmp *agg; struct bnxt_sw_rx_agg_bd *cons_rx_buf; - struct page *page; - dma_addr_t mapping; + struct rx_agg_cmp *agg; + u16 cons, frag_len; + netmem_ref netmem; if (p5_tpa) agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, i); @@ -1251,27 +1269,41 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; cons_rx_buf = &rxr->rx_agg_ring[cons]; - skb_frag_fill_page_desc(frag, cons_rx_buf->page, - cons_rx_buf->offset, frag_len); - shinfo->nr_frags = i + 1; + if (skb) { + skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem, + cons_rx_buf->offset, + frag_len, BNXT_RX_PAGE_SIZE); + } else { + skb_frag_t *frag = &shinfo->frags[i]; + + skb_frag_fill_netmem_desc(frag, cons_rx_buf->netmem, + cons_rx_buf->offset, + frag_len); + shinfo->nr_frags = i + 1; + } __clear_bit(cons, rxr->rx_agg_bmap); - /* It is possible for bnxt_alloc_rx_page() to allocate + /* It is possible for bnxt_alloc_rx_netmem() to allocate * a sw_prod index that equals the cons index, so we * need to clear the cons entry now. */ - mapping = cons_rx_buf->mapping; - page = cons_rx_buf->page; - cons_rx_buf->page = NULL; + netmem = cons_rx_buf->netmem; + cons_rx_buf->netmem = 0; - if (xdp && page_is_pfmemalloc(page)) + if (xdp && netmem_is_pfmemalloc(netmem)) xdp_buff_set_frag_pfmemalloc(xdp); - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) { + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_ATOMIC) != 0) { + if (skb) { + skb->len -= frag_len; + skb->data_len -= frag_len; + skb->truesize -= BNXT_RX_PAGE_SIZE; + } + --shinfo->nr_frags; - cons_rx_buf->page = page; + cons_rx_buf->netmem = netmem; - /* Update prod since possibly some pages have been + /* Update prod since possibly some netmems have been * allocated already. */ rxr->rx_agg_prod = prod; @@ -1279,8 +1311,8 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, return 0; } - dma_sync_single_for_cpu(&pdev->dev, mapping, BNXT_RX_PAGE_SIZE, - bp->rx_dir); + page_pool_dma_sync_netmem_for_cpu(rxr->page_pool, netmem, 0, + BNXT_RX_PAGE_SIZE); total_frag_len += frag_len; prod = NEXT_RX_AGG(prod); @@ -1289,32 +1321,28 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, return total_frag_len; } -static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp, - struct bnxt_cp_ring_info *cpr, - struct sk_buff *skb, u16 idx, - u32 agg_bufs, bool tpa) +static struct sk_buff *bnxt_rx_agg_netmems_skb(struct bnxt *bp, + struct bnxt_cp_ring_info *cpr, + struct sk_buff *skb, u16 idx, + u32 agg_bufs, bool tpa) { - struct skb_shared_info *shinfo = skb_shinfo(skb); u32 total_frag_len = 0; - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, idx, - agg_bufs, tpa, NULL); + total_frag_len = __bnxt_rx_agg_netmems(bp, cpr, idx, agg_bufs, tpa, + skb, NULL); if (!total_frag_len) { skb_mark_for_recycle(skb); dev_kfree_skb(skb); return NULL; } - skb->data_len += total_frag_len; - skb->len += total_frag_len; - skb->truesize += BNXT_RX_PAGE_SIZE * agg_bufs; return skb; } -static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp, - struct bnxt_cp_ring_info *cpr, - struct xdp_buff *xdp, u16 idx, - u32 agg_bufs, bool tpa) +static u32 bnxt_rx_agg_netmems_xdp(struct bnxt *bp, + struct bnxt_cp_ring_info *cpr, + struct xdp_buff *xdp, u16 idx, + u32 agg_bufs, bool tpa) { struct skb_shared_info *shinfo = xdp_get_shared_info_from_buff(xdp); u32 total_frag_len = 0; @@ -1322,8 +1350,8 @@ static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp, if (!xdp_buff_has_frags(xdp)) shinfo->nr_frags = 0; - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, - idx, agg_bufs, tpa, xdp); + total_frag_len = __bnxt_rx_agg_netmems(bp, cpr, idx, agg_bufs, tpa, + NULL, xdp); if (total_frag_len) { xdp_buff_set_frags_flag(xdp); shinfo->nr_frags = agg_bufs; @@ -1895,7 +1923,8 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, } if (agg_bufs) { - skb = bnxt_rx_agg_pages_skb(bp, cpr, skb, idx, agg_bufs, true); + skb = bnxt_rx_agg_netmems_skb(bp, cpr, skb, idx, agg_bufs, + true); if (!skb) { /* Page reuse already handled by bnxt_rx_pages(). */ cpr->sw_stats->rx.rx_oom_discards += 1; @@ -2175,9 +2204,10 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (bnxt_xdp_attached(bp, rxr)) { bnxt_xdp_buff_init(bp, rxr, cons, data_ptr, len, &xdp); if (agg_bufs) { - u32 frag_len = bnxt_rx_agg_pages_xdp(bp, cpr, &xdp, - cp_cons, agg_bufs, - false); + u32 frag_len = bnxt_rx_agg_netmems_xdp(bp, cpr, &xdp, + cp_cons, + agg_bufs, + false); if (!frag_len) goto oom_next_rx; @@ -2229,7 +2259,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (agg_bufs) { if (!xdp_active) { - skb = bnxt_rx_agg_pages_skb(bp, cpr, skb, cp_cons, agg_bufs, false); + skb = bnxt_rx_agg_netmems_skb(bp, cpr, skb, cp_cons, + agg_bufs, false); if (!skb) goto oom_next_rx; } else { @@ -3445,15 +3476,15 @@ static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info for (i = 0; i < max_idx; i++) { struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i]; - struct page *page = rx_agg_buf->page; + netmem_ref netmem = rx_agg_buf->netmem; - if (!page) + if (!netmem) continue; - rx_agg_buf->page = NULL; + rx_agg_buf->netmem = 0; __clear_bit(i, rxr->rx_agg_bmap); - page_pool_recycle_direct(rxr->page_pool, page); + page_pool_recycle_direct_netmem(rxr->page_pool, netmem); } } @@ -3746,7 +3777,7 @@ static void bnxt_free_rx_rings(struct bnxt *bp) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(rxr)) page_pool_destroy(rxr->head_pool); rxr->page_pool = rxr->head_pool = NULL; @@ -3777,15 +3808,19 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, pp.dev = &bp->pdev->dev; pp.dma_dir = bp->rx_dir; pp.max_len = PAGE_SIZE; - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | + PP_FLAG_ALLOW_UNREADABLE_NETMEM; + pp.queue_idx = rxr->bnapi->index; pool = page_pool_create(&pp); if (IS_ERR(pool)) return PTR_ERR(pool); rxr->page_pool = pool; - if (bnxt_separate_head_pool()) { + rxr->need_head_pool = page_pool_is_unreadable(pool); + if (bnxt_separate_head_pool(rxr)) { pp.pool_size = max(bp->rx_ring_size, 1024); + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; pool = page_pool_create(&pp); if (IS_ERR(pool)) goto err_destroy_pp; @@ -4197,6 +4232,8 @@ static void bnxt_reset_rx_ring_struct(struct bnxt *bp, rxr->page_pool->p.napi = NULL; rxr->page_pool = NULL; + rxr->head_pool->p.napi = NULL; + rxr->head_pool = NULL; memset(&rxr->xdp_rxq, 0, sizeof(struct xdp_rxq_info)); ring = &rxr->rx_ring_struct; @@ -4321,16 +4358,16 @@ static void bnxt_alloc_one_rx_ring_skb(struct bnxt *bp, rxr->rx_prod = prod; } -static void bnxt_alloc_one_rx_ring_page(struct bnxt *bp, - struct bnxt_rx_ring_info *rxr, - int ring_nr) +static void bnxt_alloc_one_rx_ring_netmem(struct bnxt *bp, + struct bnxt_rx_ring_info *rxr, + int ring_nr) { u32 prod; int i; prod = rxr->rx_agg_prod; for (i = 0; i < bp->rx_agg_ring_size; i++) { - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL)) { + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) { netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n", ring_nr, i, bp->rx_ring_size); break; @@ -4371,7 +4408,7 @@ static int bnxt_alloc_one_rx_ring(struct bnxt *bp, int ring_nr) if (!(bp->flags & BNXT_FLAG_AGG_RINGS)) return 0; - bnxt_alloc_one_rx_ring_page(bp, rxr, ring_nr); + bnxt_alloc_one_rx_ring_netmem(bp, rxr, ring_nr); if (rxr->rx_tpa) { rc = bnxt_alloc_one_tpa_info_data(bp, rxr); @@ -10073,7 +10110,7 @@ static int bnxt_hwrm_ver_get(struct bnxt *bp) struct hwrm_ver_get_input *req; u16 fw_maj, fw_min, fw_bld, fw_rsv; u32 dev_caps_cfg, hwrm_ver; - int rc, len; + int rc, len, max_tmo_secs; rc = hwrm_req_init(bp, req, HWRM_VER_GET); if (rc) @@ -10146,9 +10183,14 @@ static int bnxt_hwrm_ver_get(struct bnxt *bp) bp->hwrm_cmd_max_timeout = le16_to_cpu(resp->max_req_timeout) * 1000; if (!bp->hwrm_cmd_max_timeout) bp->hwrm_cmd_max_timeout = HWRM_CMD_MAX_TIMEOUT; - else if (bp->hwrm_cmd_max_timeout > HWRM_CMD_MAX_TIMEOUT) - netdev_warn(bp->dev, "Device requests max timeout of %d seconds, may trigger hung task watchdog\n", - bp->hwrm_cmd_max_timeout / 1000); + max_tmo_secs = bp->hwrm_cmd_max_timeout / 1000; +#ifdef CONFIG_DETECT_HUNG_TASK + if (bp->hwrm_cmd_max_timeout > HWRM_CMD_MAX_TIMEOUT || + max_tmo_secs > CONFIG_DEFAULT_HUNG_TASK_TIMEOUT) { + netdev_warn(bp->dev, "Device requests max timeout of %d seconds, may trigger hung task watchdog (kernel default %ds)\n", + max_tmo_secs, CONFIG_DEFAULT_HUNG_TASK_TIMEOUT); + } +#endif if (resp->hwrm_intf_maj_8b >= 1) { bp->hwrm_max_req_len = le16_to_cpu(resp->max_req_win_len); @@ -15708,6 +15750,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) clone->rx_agg_prod = 0; clone->rx_sw_agg_prod = 0; clone->rx_next_cons = 0; + clone->need_head_pool = false; rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid); if (rc) @@ -15750,7 +15793,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) bnxt_alloc_one_rx_ring_skb(bp, clone, idx); if (bp->flags & BNXT_FLAG_AGG_RINGS) - bnxt_alloc_one_rx_ring_page(bp, clone, idx); + bnxt_alloc_one_rx_ring_netmem(bp, clone, idx); if (bp->flags & BNXT_FLAG_TPA) bnxt_alloc_one_tpa_info_data(bp, clone); @@ -15766,7 +15809,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) xdp_rxq_info_unreg(&clone->xdp_rxq); err_page_pool_destroy: page_pool_destroy(clone->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(clone)) page_pool_destroy(clone->head_pool); clone->page_pool = NULL; clone->head_pool = NULL; @@ -15785,7 +15828,7 @@ static void bnxt_queue_mem_free(struct net_device *dev, void *qmem) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(rxr)) page_pool_destroy(rxr->head_pool); rxr->page_pool = NULL; rxr->head_pool = NULL; @@ -15876,6 +15919,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) rxr->page_pool = clone->page_pool; rxr->head_pool = clone->head_pool; rxr->xdp_rxq = clone->xdp_rxq; + rxr->need_head_pool = clone->need_head_pool; bnxt_copy_rx_ring(bp, rxr, clone); @@ -15961,7 +16005,7 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) bnxt_hwrm_rx_ring_free(bp, rxr, false); bnxt_hwrm_rx_agg_ring_free(bp, rxr, false); page_pool_disable_direct_recycling(rxr->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(rxr)) page_pool_disable_direct_recycling(rxr->head_pool); if (bp->flags & BNXT_FLAG_SHARED_RINGS) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 21726cf565866..868a2e5a5b023 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -903,7 +903,7 @@ struct bnxt_sw_rx_bd { }; struct bnxt_sw_rx_agg_bd { - struct page *page; + netmem_ref netmem; unsigned int offset; dma_addr_t mapping; }; @@ -1106,6 +1106,7 @@ struct bnxt_rx_ring_info { unsigned long *rx_agg_bmap; u16 rx_agg_bmap_size; + bool need_head_pool; dma_addr_t rx_desc_mapping[MAX_RX_PAGES]; dma_addr_t rx_agg_desc_mapping[MAX_RX_AGG_PAGES]; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c index 5576e7cf84631..9b6489e417fc1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c @@ -496,9 +496,16 @@ static int __bnxt_get_coredump(struct bnxt *bp, u16 dump_type, void *buf, start_utc, coredump.total_segs + 1, rc); kfree(coredump.data); - *dump_len += sizeof(struct bnxt_coredump_record); - if (rc == -ENOBUFS) + if (!rc) { + *dump_len += sizeof(struct bnxt_coredump_record); + /* The actual coredump length can be smaller than the FW + * reported length earlier. Use the ethtool provided length. + */ + if (buf_len) + *dump_len = buf_len; + } else if (rc == -ENOBUFS) { netdev_err(bp->dev, "Firmware returned large coredump buffer\n"); + } return rc; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h index 15ca51b5d204e..fb5f5b063c3d8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h @@ -58,7 +58,7 @@ void hwrm_update_token(struct bnxt *bp, u16 seq, enum bnxt_hwrm_wait_state s); #define BNXT_HWRM_MAX_REQ_LEN (bp->hwrm_max_req_len) #define BNXT_HWRM_SHORT_REQ_LEN sizeof(struct hwrm_short_input) -#define HWRM_CMD_MAX_TIMEOUT 40000U +#define HWRM_CMD_MAX_TIMEOUT 60000U #define SHORT_HWRM_CMD_TIMEOUT 20 #define HWRM_CMD_TIMEOUT (bp->hwrm_cmd_timeout) #define HWRM_RESET_TIMEOUT ((HWRM_CMD_TIMEOUT) * 4) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index a8e930d5dbb09..238db9a1aebf8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -148,7 +148,6 @@ void bnxt_unregister_dev(struct bnxt_en_dev *edev) struct net_device *dev = edev->net; struct bnxt *bp = netdev_priv(dev); struct bnxt_ulp *ulp; - int i = 0; ulp = edev->ulp_tbl; netdev_lock(dev); @@ -164,10 +163,6 @@ void bnxt_unregister_dev(struct bnxt_en_dev *edev) synchronize_rcu(); ulp->max_async_event_id = 0; ulp->async_events_bmap = NULL; - while (atomic_read(&ulp->ref_count) != 0 && i < 10) { - msleep(100); - i++; - } mutex_unlock(&edev->en_dev_lock); netdev_unlock(dev); return; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h index 7fa3b8d1ebd28..7b9dd8ebe4bcc 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h @@ -10,9 +10,6 @@ #ifndef BNXT_ULP_H #define BNXT_ULP_H -#define BNXT_ROCE_ULP 0 -#define BNXT_MAX_ULP 1 - #define BNXT_MIN_ROCE_CP_RINGS 2 #define BNXT_MIN_ROCE_STAT_CTXS 1 @@ -50,7 +47,6 @@ struct bnxt_ulp { unsigned long *async_events_bmap; u16 max_async_event_id; u16 msix_requested; - atomic_t ref_count; }; struct bnxt_en_dev { diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index a03eff3d44259..50eb54ecf1bad 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -1735,7 +1735,7 @@ bnad_iocpf_sem_timeout(struct timer_list *t) * Time CPU m CPU n * 0 1 = test_bit * 1 clear_bit - * 2 del_timer_sync + * 2 timer_delete_sync * 3 mod_timer */ diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 3b7068832f95e..4a0e2d2eb60a4 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -351,7 +351,7 @@ static void set_msglevel(struct net_device *dev, u32 val) adapter->msg_enable = val; } -static const char stats_strings[][ETH_GSTRING_LEN] = { +static const char stats_strings[][ETH_GSTRING_LEN] __nonstring_array = { "TxOctetsOK", "TxOctetsBad", "TxUnicastFramesOK", diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 517a15904fb08..6a2004bbe87f9 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1144,6 +1144,7 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, struct gmac_txdesc *txd; skb_frag_t *skb_frag; dma_addr_t mapping; + bool tcp = false; void *buffer; u16 mss; int ret; @@ -1151,6 +1152,13 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, word1 = skb->len; word3 = SOF_BIT; + /* Determine if we are doing TCP */ + if (skb->protocol == htons(ETH_P_IP)) + tcp = (ip_hdr(skb)->protocol == IPPROTO_TCP); + else + /* IPv6 */ + tcp = (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP); + mss = skb_shinfo(skb)->gso_size; if (mss) { /* This means we are dealing with TCP and skb->len is the @@ -1163,8 +1171,26 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, mss, skb->len); word1 |= TSS_MTU_ENABLE_BIT; word3 |= mss; + } else if (tcp) { + /* Even if we are not using TSO, use the hardware offloader + * for transferring the TCP frame: this hardware has partial + * TCP awareness (called TOE - TCP Offload Engine) and will + * according to the datasheet put packets belonging to the + * same TCP connection in the same queue for the TOE/TSO + * engine to process. The engine will deal with chopping + * up frames that exceed ETH_DATA_LEN which the + * checksumming engine cannot handle (see below) into + * manageable chunks. It flawlessly deals with quite big + * frames and frames containing custom DSA EtherTypes. + */ + mss = netdev->mtu + skb_tcp_all_headers(skb); + mss = min(mss, skb->len); + netdev_dbg(netdev, "TOE/TSO len %04x mtu %04x mss %04x\n", + skb->len, netdev->mtu, mss); + word1 |= TSS_MTU_ENABLE_BIT; + word3 |= mss; } else if (skb->len >= ETH_FRAME_LEN) { - /* Hardware offloaded checksumming isn't working on frames + /* Hardware offloaded checksumming isn't working on non-TCP frames * bigger than 1514 bytes. A hypothesis about this is that the * checksum buffer is only 1518 bytes, so when the frames get * bigger they get truncated, or the last few bytes get @@ -1181,21 +1207,16 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, } if (skb->ip_summed == CHECKSUM_PARTIAL) { - int tcp = 0; - /* We do not switch off the checksumming on non TCP/UDP * frames: as is shown from tests, the checksumming engine * is smart enough to see that a frame is not actually TCP * or UDP and then just pass it through without any changes * to the frame. */ - if (skb->protocol == htons(ETH_P_IP)) { + if (skb->protocol == htons(ETH_P_IP)) word1 |= TSS_IP_CHKSUM_BIT; - tcp = ip_hdr(skb)->protocol == IPPROTO_TCP; - } else { /* IPv6 */ + else word1 |= TSS_IPV6_ENABLE_BIT; - tcp = ipv6_hdr(skb)->nexthdr == IPPROTO_TCP; - } word1 |= tcp ? TSS_TCP_CHKSUM_BIT : TSS_UDP_CHKSUM_BIT; } diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 51b8377edd1d0..adb441b36581a 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -2615,7 +2615,11 @@ static int be_cmd_get_flash_crc(struct be_adapter *adapter, u8 *flashed_crc, return status; } -static char flash_cookie[2][16] = {"*** SE FLAS", "H DIRECTORY *** "}; +/* + * Since the cookie is text, add a parsing-skipped space to keep it from + * ever being matched on storage holding this source file. + */ +static const char flash_cookie[32] __nonstring = "*** SE FLAS" "H DIRECTORY *** "; static bool phy_flashing_required(struct be_adapter *adapter) { diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h index d70818f06be73..5e2d3ddb5d438 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.h +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -1415,7 +1415,7 @@ struct flash_section_entry { } __packed; struct flash_section_info { - u8 cookie[32]; + u8 cookie[32] __nonstring; struct flash_section_hdr fsec_hdr; struct flash_section_entry fsec_entry[32]; } __packed; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index ece3ae28ba827..f47c8b6cc5118 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -141,7 +141,7 @@ static const struct { static const struct { int reg; - char name[ETH_GSTRING_LEN]; + char name[ETH_GSTRING_LEN] __nonstring; } enetc_port_counters[] = { { ENETC_PM_REOCT(0), "MAC rx ethernet octets" }, { ENETC_PM_RALN(0), "MAC rx alignment errors" }, @@ -264,7 +264,7 @@ static void enetc_get_strings(struct net_device *ndev, u32 stringset, u8 *data) break; for (i = 0; i < ARRAY_SIZE(enetc_port_counters); i++) - ethtool_puts(&data, enetc_port_counters[i].name); + ethtool_cpy(&data, enetc_port_counters[i].name); break; } diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index eae1a7595a695..3c1da0cf3f61e 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -67,7 +67,7 @@ static const char gve_gstrings_tx_stats[][ETH_GSTRING_LEN] = { "tx_xsk_sent[%u]", "tx_xdp_xmit[%u]", "tx_xdp_xmit_errors[%u]" }; -static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = { +static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] __nonstring_array = { "adminq_prod_cnt", "adminq_cmd_fail", "adminq_timeouts", "adminq_describe_device_cnt", "adminq_cfg_device_resources_cnt", "adminq_register_page_list_cnt", "adminq_unregister_page_list_cnt", @@ -113,7 +113,7 @@ static void gve_get_strings(struct net_device *netdev, u32 stringset, u8 *data) i); for (i = 0; i < ARRAY_SIZE(gve_gstrings_adminq_stats); i++) - ethtool_puts(&s, gve_gstrings_adminq_stats[i]); + ethtool_cpy(&s, gve_gstrings_adminq_stats[i]); break; diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index c3791cf23c876..446e4b6fd3f17 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1830,7 +1830,7 @@ static void gve_turndown(struct gve_priv *priv) /* Stop tx queues */ netif_tx_disable(priv->dev); - xdp_features_clear_redirect_target(priv->dev); + xdp_features_clear_redirect_target_locked(priv->dev); gve_clear_napi_enabled(priv); gve_clear_report_stats(priv); @@ -1902,7 +1902,7 @@ static void gve_turnup(struct gve_priv *priv) } if (priv->tx_cfg.num_xdp_queues && gve_supports_xdp_xmit(priv)) - xdp_features_set_redirect_target(priv->dev, false); + xdp_features_set_redirect_target_locked(priv->dev, false); gve_set_napi_enabled(priv); } @@ -2185,7 +2185,7 @@ static void gve_set_netdev_xdp_features(struct gve_priv *priv) xdp_features = 0; } - xdp_set_features_flag(priv->dev, xdp_features); + xdp_set_features_flag_locked(priv->dev, xdp_features); } static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index 1640d2f278330..b2adb40a59210 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -147,6 +147,8 @@ config IXGBE depends on PCI depends on PTP_1588_CLOCK_OPTIONAL select MDIO + select NET_DEVLINK + select PLDMFW select PHYLIB help This driver supports Intel(R) 10GbE PCI Express family of diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index ba9c19e6994c9..952898151565f 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -319,7 +319,7 @@ struct e1000_adapter { u16 tx_ring_count; u16 rx_ring_count; - struct hwtstamp_config hwtstamp_config; + struct kernel_hwtstamp_config hwtstamp_config; struct delayed_work systim_overflow_work; struct sk_buff *tx_hwtstamp_skb; unsigned long tx_hwtstamp_start; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 8ebcb6a7d608a..e0f492a6723fa 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -3574,6 +3574,7 @@ s32 e1000e_get_base_timinca(struct e1000_adapter *adapter, u32 *timinca) * e1000e_config_hwtstamp - configure the hwtstamp registers and enable/disable * @adapter: board private structure * @config: timestamp configuration + * @extack: netlink extended ACK for error report * * Outgoing time stamping can be enabled and disabled. Play nice and * disable it when requested, although it shouldn't cause any overhead @@ -3587,7 +3588,8 @@ s32 e1000e_get_base_timinca(struct e1000_adapter *adapter, u32 *timinca) * exception of "all V2 events regardless of level 2 or 4". **/ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, - struct hwtstamp_config *config) + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { struct e1000_hw *hw = &adapter->hw; u32 tsync_tx_ctl = E1000_TSYNCTXCTL_ENABLED; @@ -3598,8 +3600,10 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, bool is_l2 = false; u32 regval; - if (!(adapter->flags & FLAG_HAS_HW_TIMESTAMP)) + if (!(adapter->flags & FLAG_HAS_HW_TIMESTAMP)) { + NL_SET_ERR_MSG(extack, "No HW timestamp support"); return -EINVAL; + } switch (config->tx_type) { case HWTSTAMP_TX_OFF: @@ -3608,6 +3612,7 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, case HWTSTAMP_TX_ON: break; default: + NL_SET_ERR_MSG(extack, "Unsupported TX HW timestamp type"); return -ERANGE; } @@ -3681,6 +3686,7 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, config->rx_filter = HWTSTAMP_FILTER_ALL; break; default: + NL_SET_ERR_MSG(extack, "Unsupported RX HW timestamp filter"); return -ERANGE; } @@ -3693,7 +3699,8 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, ew32(TSYNCTXCTL, regval); if ((er32(TSYNCTXCTL) & E1000_TSYNCTXCTL_ENABLED) != (regval & E1000_TSYNCTXCTL_ENABLED)) { - e_err("Timesync Tx Control register not set as expected\n"); + NL_SET_ERR_MSG(extack, + "Timesync Tx Control register not set as expected"); return -EAGAIN; } @@ -3706,7 +3713,8 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, E1000_TSYNCRXCTL_TYPE_MASK)) != (regval & (E1000_TSYNCRXCTL_ENABLED | E1000_TSYNCRXCTL_TYPE_MASK))) { - e_err("Timesync Rx Control register not set as expected\n"); + NL_SET_ERR_MSG(extack, + "Timesync Rx Control register not set as expected"); return -EAGAIN; } @@ -3901,6 +3909,7 @@ static void e1000e_systim_reset(struct e1000_adapter *adapter) { struct ptp_clock_info *info = &adapter->ptp_clock_info; struct e1000_hw *hw = &adapter->hw; + struct netlink_ext_ack extack = {}; unsigned long flags; u32 timinca; s32 ret_val; @@ -3932,7 +3941,12 @@ static void e1000e_systim_reset(struct e1000_adapter *adapter) spin_unlock_irqrestore(&adapter->systim_lock, flags); /* restore the previous hwtstamp configuration settings */ - e1000e_config_hwtstamp(adapter, &adapter->hwtstamp_config); + ret_val = e1000e_config_hwtstamp(adapter, &adapter->hwtstamp_config, + &extack); + if (ret_val) { + if (extack._msg) + e_err("%s\n", extack._msg); + } } /** @@ -6079,8 +6093,7 @@ static int e1000_change_mtu(struct net_device *netdev, int new_mtu) return 0; } -static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, - int cmd) +static int e1000_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) { struct e1000_adapter *adapter = netdev_priv(netdev); struct mii_ioctl_data *data = if_mii(ifr); @@ -6140,7 +6153,8 @@ static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, /** * e1000e_hwtstamp_set - control hardware time stamping * @netdev: network interface device structure - * @ifr: interface request + * @config: timestamp configuration + * @extack: netlink extended ACK report * * Outgoing time stamping can be enabled and disabled. Play nice and * disable it when requested, although it shouldn't cause any overhead @@ -6153,20 +6167,18 @@ static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, * specified. Matching the kind of event packet is not supported, with the * exception of "all V2 events regardless of level 2 or 4". **/ -static int e1000e_hwtstamp_set(struct net_device *netdev, struct ifreq *ifr) +static int e1000e_hwtstamp_set(struct net_device *netdev, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { struct e1000_adapter *adapter = netdev_priv(netdev); - struct hwtstamp_config config; int ret_val; - if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) - return -EFAULT; - - ret_val = e1000e_config_hwtstamp(adapter, &config); + ret_val = e1000e_config_hwtstamp(adapter, config, extack); if (ret_val) return ret_val; - switch (config.rx_filter) { + switch (config->rx_filter) { case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: case HWTSTAMP_FILTER_PTP_V2_SYNC: @@ -6178,38 +6190,23 @@ static int e1000e_hwtstamp_set(struct net_device *netdev, struct ifreq *ifr) * by hardware so notify the caller the requested packets plus * some others are time stamped. */ - config.rx_filter = HWTSTAMP_FILTER_SOME; + config->rx_filter = HWTSTAMP_FILTER_SOME; break; default: break; } - return copy_to_user(ifr->ifr_data, &config, - sizeof(config)) ? -EFAULT : 0; + return 0; } -static int e1000e_hwtstamp_get(struct net_device *netdev, struct ifreq *ifr) +static int e1000e_hwtstamp_get(struct net_device *netdev, + struct kernel_hwtstamp_config *kernel_config) { struct e1000_adapter *adapter = netdev_priv(netdev); - return copy_to_user(ifr->ifr_data, &adapter->hwtstamp_config, - sizeof(adapter->hwtstamp_config)) ? -EFAULT : 0; -} + *kernel_config = adapter->hwtstamp_config; -static int e1000_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) -{ - switch (cmd) { - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCSMIIREG: - return e1000_mii_ioctl(netdev, ifr, cmd); - case SIOCSHWTSTAMP: - return e1000e_hwtstamp_set(netdev, ifr); - case SIOCGHWTSTAMP: - return e1000e_hwtstamp_get(netdev, ifr); - default: - return -EOPNOTSUPP; - } + return 0; } static int e1000_init_phy_wakeup(struct e1000_adapter *adapter, u32 wufc) @@ -7346,9 +7343,11 @@ static const struct net_device_ops e1000e_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = e1000_netpoll, #endif - .ndo_set_features = e1000_set_features, - .ndo_fix_features = e1000_fix_features, + .ndo_set_features = e1000_set_features, + .ndo_fix_features = e1000_fix_features, .ndo_features_check = passthru_features_check, + .ndo_hwtstamp_get = e1000e_hwtstamp_get, + .ndo_hwtstamp_set = e1000e_hwtstamp_set, }; /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 370b4bddee441..b11c35e307ca9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -817,10 +817,11 @@ int i40e_pf_reset(struct i40e_hw *hw) void i40e_clear_hw(struct i40e_hw *hw) { u32 num_queues, base_queue; - u32 num_pf_int; - u32 num_vf_int; + s32 num_pf_int; + s32 num_vf_int; u32 num_vfs; - u32 i, j; + s32 i; + u32 j; u32 val; u32 eol = 0x7ff; diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index fd083647c14ac..e42572ae7631c 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -193,8 +193,6 @@ #define ice_pf_to_dev(pf) (&((pf)->pdev->dev)) -#define ice_pf_src_tmr_owned(pf) ((pf)->hw.func_caps.ts_func_info.src_tmr_owned) - enum ice_feature { ICE_F_DSCP, ICE_F_PHY_RCLK, @@ -515,6 +513,7 @@ enum ice_pf_flags { ICE_FLAG_MTU_CHANGED, ICE_FLAG_GNSS, /* GNSS successfully initialized */ ICE_FLAG_DPLL, /* SyncE/PTP dplls initialized */ + ICE_FLAG_LLDP_AQ_FLTR, ICE_PF_FLAGS_NBITS /* must be last */ }; @@ -1045,4 +1044,62 @@ static inline void ice_clear_rdma_cap(struct ice_pf *pf) } extern const struct xdp_metadata_ops ice_xdp_md_ops; + +/** + * ice_is_dual - Check if given config is multi-NAC + * @hw: pointer to HW structure + * + * Return: true if the device is running in mutli-NAC (Network + * Acceleration Complex) configuration variant, false otherwise + * (always false for non-E825 devices). + */ +static inline bool ice_is_dual(struct ice_hw *hw) +{ + return hw->mac_type == ICE_MAC_GENERIC_3K_E825 && + (hw->dev_caps.nac_topo.mode & ICE_NAC_TOPO_DUAL_M); +} + +/** + * ice_is_primary - Check if given device belongs to the primary complex + * @hw: pointer to HW structure + * + * Check if given PF/HW is running on primary complex in multi-NAC + * configuration. + * + * Return: true if the device is dual, false otherwise (always true + * for non-E825 devices). + */ +static inline bool ice_is_primary(struct ice_hw *hw) +{ + return hw->mac_type != ICE_MAC_GENERIC_3K_E825 || + !ice_is_dual(hw) || + (hw->dev_caps.nac_topo.mode & ICE_NAC_TOPO_PRIMARY_M); +} + +/** + * ice_pf_src_tmr_owned - Check if a primary timer is owned by PF + * @pf: pointer to PF structure + * + * Return: true if PF owns primary timer, false otherwise. + */ +static inline bool ice_pf_src_tmr_owned(struct ice_pf *pf) +{ + return pf->hw.func_caps.ts_func_info.src_tmr_owned && + ice_is_primary(&pf->hw); +} + +/** + * ice_get_primary_hw - Get pointer to primary ice_hw structure + * @pf: pointer to PF structure + * + * Return: A pointer to ice_hw structure with access to timesync + * register space. + */ +static inline struct ice_hw *ice_get_primary_hw(struct ice_pf *pf) +{ + if (!pf->adapter->ctrl_pf) + return &pf->hw; + else + return &pf->adapter->ctrl_pf->hw; +} #endif /* _ICE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 59df31c2c83f7..4fedf0181c4e1 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1135,6 +1135,8 @@ int ice_init_hw(struct ice_hw *hw) } } + hw->lane_num = ice_get_phy_lane_number(hw); + return 0; err_unroll_fltr_mgmt_struct: ice_cleanup_fltr_mgmt_struct(hw); @@ -3434,7 +3436,7 @@ int ice_aq_get_fec_stats(struct ice_hw *hw, u16 pcs_quad, u16 pcs_port, msg.msg_addr_low = lower_16_bits(reg_offset); msg.msg_addr_high = receiver_id; msg.opcode = ice_sbq_msg_rd; - msg.dest_dev = rmn_0; + msg.dest_dev = ice_sbq_dev_phy_0; err = ice_sbq_rw_reg(hw, &msg, flag); if (err) @@ -4082,10 +4084,12 @@ int ice_get_phy_lane_number(struct ice_hw *hw) continue; if (hw->pf_id == lport) { + if (hw->mac_type == ICE_MAC_GENERIC_3K_E825 && + ice_is_dual(hw) && !ice_is_primary(hw)) + lane += ICE_PORTS_PER_QUAD; kfree(options); return lane; } - lport++; } @@ -6011,15 +6015,21 @@ bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw) /** * ice_lldp_fltr_add_remove - add or remove a LLDP Rx switch filter * @hw: pointer to HW struct - * @vsi_num: absolute HW index for VSI + * @vsi: VSI to add the filter to * @add: boolean for if adding or removing a filter + * + * Return: 0 on success, -EOPNOTSUPP if the operation cannot be performed + * with this HW or VSI, otherwise an error corresponding to + * the AQ transaction result. */ -int -ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add) +int ice_lldp_fltr_add_remove(struct ice_hw *hw, struct ice_vsi *vsi, bool add) { struct ice_aqc_lldp_filter_ctrl *cmd; struct ice_aq_desc desc; + if (vsi->type != ICE_VSI_PF || !ice_fw_supports_lldp_fltr_ctrl(hw)) + return -EOPNOTSUPP; + cmd = &desc.params.lldp_filter_ctrl; ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_filter_ctrl); @@ -6029,7 +6039,7 @@ ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add) else cmd->cmd_flags = ICE_AQC_LLDP_FILTER_ACTION_DELETE; - cmd->vsi_num = cpu_to_le16(vsi_num); + cmd->vsi_num = cpu_to_le16(vsi->vsi_num); return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); } diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index 9b00aa0ddf10e..64c530b391917 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -290,8 +290,7 @@ int ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size, struct ice_sq_cd *cd); bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw); -int -ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add); +int ice_lldp_fltr_add_remove(struct ice_hw *hw, struct ice_vsi *vsi, bool add); int ice_lldp_execute_pending_mib(struct ice_hw *hw); int ice_aq_read_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr, diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index a7c5108328240..67988c7ab08e6 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -846,7 +846,7 @@ int ice_init_pf_dcb(struct ice_pf *pf, bool locked) goto dcb_init_err; } - ice_cfg_sw_lldp(pf_vsi, false, true); + ice_cfg_sw_rx_lldp(pf, true); pf->dcbx_cap = ice_dcb_get_mode(port_info, true); return 0; diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index ed21d7f55ac11..6aae03771746d 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -29,6 +29,7 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) return -ENODEV; ice_remove_vsi_fltr(&pf->hw, uplink_vsi->idx); + ice_vsi_cfg_sw_lldp(uplink_vsi, true, false); netif_addr_lock_bh(netdev); __dev_uc_unsync(netdev, NULL); @@ -245,6 +246,10 @@ ice_eswitch_set_target_vsi(struct sk_buff *skb, u64 cd_cmd, dst_vsi; if (!dst) { + struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb); + + if (unlikely(eth->h_proto == htons(ETH_P_LLDP))) + return; cd_cmd = ICE_TX_CTX_DESC_SWTCH_UPLINK << ICE_TXD_CTX_QW1_CMD_S; off->cd_qw1 |= (cd_cmd | ICE_TX_DESC_DTYPE_CTX); } else { @@ -278,6 +283,7 @@ static void ice_eswitch_release_env(struct ice_pf *pf) ice_fltr_add_mac_and_broadcast(uplink_vsi, uplink_vsi->port_info->mac.perm_addr, ICE_FWD_TO_VSI); + ice_vsi_cfg_sw_lldp(uplink_vsi, true, true); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 7c2dc347e4e57..648815170477c 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -1818,7 +1818,7 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags) /* Remove rule to direct LLDP packets to default VSI. * The FW LLDP engine will now be consuming them. */ - ice_cfg_sw_lldp(vsi, false, false); + ice_cfg_sw_rx_lldp(vsi->back, false); /* AQ command to start FW LLDP agent will return an * error if the agent is already started diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c index 1d118171de378..aceec184e89b2 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c @@ -1605,7 +1605,7 @@ void ice_fdir_replay_fltrs(struct ice_pf *pf) */ int ice_fdir_create_dflt_rules(struct ice_pf *pf) { - const enum ice_fltr_ptype dflt_rules[] = { + static const enum ice_fltr_ptype dflt_rules[] = { ICE_FLTR_PTYPE_NONF_IPV4_TCP, ICE_FLTR_PTYPE_NONF_IPV4_UDP, ICE_FLTR_PTYPE_NONF_IPV6_TCP, ICE_FLTR_PTYPE_NONF_IPV6_UDP, }; diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 0bcf9d127ac9e..03bb16191237e 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2065,12 +2065,15 @@ static void ice_vsi_set_tc_cfg(struct ice_vsi *vsi) } /** - * ice_cfg_sw_lldp - Config switch rules for LLDP packet handling + * ice_vsi_cfg_sw_lldp - Config switch rules for LLDP packet handling * @vsi: the VSI being configured * @tx: bool to determine Tx or Rx rule * @create: bool to determine create or remove Rule + * + * Adding an ethtype Tx rule to the uplink VSI results in it being applied + * to the whole port, so LLDP transmission for VFs will be blocked too. */ -void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) +void ice_vsi_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) { int (*eth_fltr)(struct ice_vsi *v, u16 type, u16 flag, enum ice_sw_fwd_act_type act); @@ -2085,19 +2088,59 @@ void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_TX, ICE_DROP_PACKET); } else { - if (ice_fw_supports_lldp_fltr_ctrl(&pf->hw)) { - status = ice_lldp_fltr_add_remove(&pf->hw, vsi->vsi_num, - create); - } else { + if (!test_bit(ICE_FLAG_LLDP_AQ_FLTR, pf->flags)) { status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_RX, ICE_FWD_TO_VSI); + if (!status || !create) + goto report; + + dev_info(dev, + "Failed to add generic LLDP Rx filter on VSI %i error: %d, falling back to specialized AQ control\n", + vsi->vsi_num, status); } + + status = ice_lldp_fltr_add_remove(&pf->hw, vsi, create); + if (!status) + set_bit(ICE_FLAG_LLDP_AQ_FLTR, pf->flags); + } +report: if (status) - dev_dbg(dev, "Fail %s %s LLDP rule on VSI %i error: %d\n", - create ? "adding" : "removing", tx ? "TX" : "RX", - vsi->vsi_num, status); + dev_warn(dev, "Failed to %s %s LLDP rule on VSI %i error: %d\n", + create ? "add" : "remove", tx ? "Tx" : "Rx", + vsi->vsi_num, status); +} + +/** + * ice_cfg_sw_rx_lldp - Enable/disable software handling of LLDP + * @pf: the PF being configured + * @enable: enable or disable + * + * Configure switch rules to enable/disable LLDP handling by software + * across PF. + */ +void ice_cfg_sw_rx_lldp(struct ice_pf *pf, bool enable) +{ + struct ice_vsi *vsi; + struct ice_vf *vf; + unsigned int bkt; + + vsi = ice_get_main_vsi(pf); + ice_vsi_cfg_sw_lldp(vsi, false, enable); + + if (!test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) + return; + + ice_for_each_vf(pf, bkt, vf) { + vsi = ice_get_vf_vsi(vf); + + if (WARN_ON(!vsi)) + continue; + + if (ice_vf_is_lldp_ena(vf)) + ice_vsi_cfg_sw_lldp(vsi, false, enable); + } } /** @@ -2528,7 +2571,7 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_vsi_cfg_params *params) if (!ice_is_safe_mode(pf) && vsi->type == ICE_VSI_PF) { ice_fltr_add_eth(vsi, ETH_P_PAUSE, ICE_FLTR_TX, ICE_DROP_PACKET); - ice_cfg_sw_lldp(vsi, true, true); + ice_vsi_cfg_sw_lldp(vsi, true, true); } if (!vsi->agg_node) @@ -2825,9 +2868,11 @@ int ice_vsi_release(struct ice_vsi *vsi) /* The Rx rule will only exist to remove if the LLDP FW * engine is currently stopped */ - if (!ice_is_safe_mode(pf) && vsi->type == ICE_VSI_PF && - !test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) - ice_cfg_sw_lldp(vsi, false, false); + if (!ice_is_safe_mode(pf) && + !test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags) && + (vsi->type == ICE_VSI_PF || (vsi->type == ICE_VSI_VF && + ice_vf_is_lldp_ena(vsi->vf)))) + ice_vsi_cfg_sw_lldp(vsi, false, false); ice_vsi_decfg(vsi); diff --git a/drivers/net/ethernet/intel/ice/ice_lib.h b/drivers/net/ethernet/intel/ice/ice_lib.h index b4c9cb28a016e..654516c5fc3ed 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_lib.h @@ -29,7 +29,8 @@ ice_vsi_stop_lan_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, int ice_vsi_stop_xdp_tx_rings(struct ice_vsi *vsi); -void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create); +void ice_vsi_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create); +void ice_cfg_sw_rx_lldp(struct ice_pf *pf, bool enable); int ice_set_link(struct ice_vsi *vsi, bool ena); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index d390157b59fe1..1fbe13ee93a8d 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -8330,11 +8330,16 @@ void ice_tx_timeout(struct net_device *netdev, unsigned int txqueue) * @np: net device to configure * @filter_dev: device on which filter is added * @cls_flower: offload data + * @ingress: if the rule is added to an ingress block + * + * Return: 0 if the flower was successfully added or deleted, + * negative error code otherwise. */ static int ice_setup_tc_cls_flower(struct ice_netdev_priv *np, struct net_device *filter_dev, - struct flow_cls_offload *cls_flower) + struct flow_cls_offload *cls_flower, + bool ingress) { struct ice_vsi *vsi = np->vsi; @@ -8343,7 +8348,7 @@ ice_setup_tc_cls_flower(struct ice_netdev_priv *np, switch (cls_flower->command) { case FLOW_CLS_REPLACE: - return ice_add_cls_flower(filter_dev, vsi, cls_flower); + return ice_add_cls_flower(filter_dev, vsi, cls_flower, ingress); case FLOW_CLS_DESTROY: return ice_del_cls_flower(vsi, cls_flower); default: @@ -8352,20 +8357,46 @@ ice_setup_tc_cls_flower(struct ice_netdev_priv *np, } /** - * ice_setup_tc_block_cb - callback handler registered for TC block + * ice_setup_tc_block_cb_ingress - callback handler for ingress TC block * @type: TC SETUP type * @type_data: TC flower offload data that contains user input * @cb_priv: netdev private data + * + * Return: 0 if the setup was successful, negative error code otherwise. */ static int -ice_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) +ice_setup_tc_block_cb_ingress(enum tc_setup_type type, void *type_data, + void *cb_priv) { struct ice_netdev_priv *np = cb_priv; switch (type) { case TC_SETUP_CLSFLOWER: return ice_setup_tc_cls_flower(np, np->vsi->netdev, - type_data); + type_data, true); + default: + return -EOPNOTSUPP; + } +} + +/** + * ice_setup_tc_block_cb_egress - callback handler for egress TC block + * @type: TC SETUP type + * @type_data: TC flower offload data that contains user input + * @cb_priv: netdev private data + * + * Return: 0 if the setup was successful, negative error code otherwise. + */ +static int +ice_setup_tc_block_cb_egress(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct ice_netdev_priv *np = cb_priv; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return ice_setup_tc_cls_flower(np, np->vsi->netdev, + type_data, false); default: return -EOPNOTSUPP; } @@ -9310,16 +9341,32 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, void *type_data) { struct ice_netdev_priv *np = netdev_priv(netdev); + enum flow_block_binder_type binder_type; struct ice_pf *pf = np->vsi->back; + flow_setup_cb_t *flower_handler; bool locked = false; int err; switch (type) { case TC_SETUP_BLOCK: + binder_type = + ((struct flow_block_offload *)type_data)->binder_type; + + switch (binder_type) { + case FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS: + flower_handler = ice_setup_tc_block_cb_ingress; + break; + case FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS: + flower_handler = ice_setup_tc_block_cb_egress; + break; + default: + return -EOPNOTSUPP; + } + return flow_block_cb_setup_simple(type_data, &ice_block_cb_list, - ice_setup_tc_block_cb, - np, np, true); + flower_handler, + np, np, false); case TC_SETUP_QDISC_MQPRIO: if (ice_is_eswitch_mode_switchdev(pf)) { netdev_err(netdev, "TC MQPRIO offload not supported, switchdev is enabled\n"); @@ -9380,7 +9427,7 @@ ice_indr_setup_block_cb(enum tc_setup_type type, void *type_data, case TC_SETUP_CLSFLOWER: return ice_setup_tc_cls_flower(np, priv->netdev, (struct flow_cls_offload *) - type_data); + type_data, false); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 1fd1ae03eb909..b79a148ed0f28 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -305,6 +305,9 @@ u64 ice_ptp_read_src_clk_reg(struct ice_pf *pf, u32 hi, lo, lo2; u8 tmr_idx; + if (!ice_is_primary(hw)) + hw = ice_get_primary_hw(pf); + tmr_idx = ice_get_ptp_src_clock_index(hw); guard(spinlock)(&pf->adapter->ptp_gltsyn_time_lock); /* Read the system timestamp pre PHC read */ @@ -1624,14 +1627,6 @@ static int ice_ptp_cfg_extts(struct ice_pf *pf, struct ptp_extts_request *rq, int pin_desc_idx; u8 tmr_idx; - /* Reject requests with unsupported flags */ - - if (rq->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; chan = rq->index; @@ -1802,9 +1797,6 @@ static int ice_ptp_cfg_perout(struct ice_pf *pf, struct ptp_perout_request *rq, struct ice_hw *hw = &pf->hw; int pin_desc_idx; - if (rq->flags & ~PTP_PEROUT_PHASE) - return -EOPNOTSUPP; - pin_desc_idx = ice_ptp_find_pin_idx(pf, PTP_PF_PEROUT, rq->index); if (pin_desc_idx < 0) return -EIO; @@ -2737,6 +2729,11 @@ static void ice_ptp_set_caps(struct ice_pf *pf) info->enable = ice_ptp_gpio_enable; info->verify = ice_verify_pin; + info->supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + info->supported_perout_flags = PTP_PEROUT_PHASE; + switch (pf->hw.mac_type) { case ICE_MAC_E810: ice_ptp_set_funcs_e810(pf); @@ -2985,6 +2982,32 @@ static void ice_ptp_periodic_work(struct kthread_work *work) msecs_to_jiffies(err ? 10 : 500)); } +/** + * ice_ptp_prepare_rebuild_sec - Prepare second NAC for PTP reset or rebuild + * @pf: Board private structure + * @rebuild: rebuild if true, prepare if false + * @reset_type: the reset type being performed + */ +static void ice_ptp_prepare_rebuild_sec(struct ice_pf *pf, bool rebuild, + enum ice_reset_req reset_type) +{ + struct list_head *entry; + + list_for_each(entry, &pf->adapter->ports.ports) { + struct ice_ptp_port *port = list_entry(entry, + struct ice_ptp_port, + list_node); + struct ice_pf *peer_pf = ptp_port_to_pf(port); + + if (!ice_is_primary(&peer_pf->hw)) { + if (rebuild) + ice_ptp_rebuild(peer_pf, reset_type); + else + ice_ptp_prepare_for_reset(peer_pf, reset_type); + } + } +} + /** * ice_ptp_prepare_for_reset - Prepare PTP for reset * @pf: Board private structure @@ -2993,6 +3016,7 @@ static void ice_ptp_periodic_work(struct kthread_work *work) void ice_ptp_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type) { struct ice_ptp *ptp = &pf->ptp; + struct ice_hw *hw = &pf->hw; u8 src_tmr; if (ptp->state != ICE_PTP_READY) @@ -3008,6 +3032,9 @@ void ice_ptp_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type) if (reset_type == ICE_RESET_PFR) return; + if (ice_pf_src_tmr_owned(pf) && hw->mac_type == ICE_MAC_GENERIC_3K_E825) + ice_ptp_prepare_rebuild_sec(pf, false, reset_type); + ice_ptp_release_tx_tracker(pf, &pf->ptp.port.tx); /* Disable periodic outputs */ @@ -3129,13 +3156,6 @@ void ice_ptp_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) dev_err(ice_pf_to_dev(pf), "PTP reset failed %d\n", err); } -static bool ice_is_primary(struct ice_hw *hw) -{ - return hw->mac_type == ICE_MAC_GENERIC_3K_E825 && ice_is_dual(hw) ? - !!(hw->dev_caps.nac_topo.mode & ICE_NAC_TOPO_PRIMARY_M) : - true; -} - static int ice_ptp_setup_adapter(struct ice_pf *pf) { if (!ice_pf_src_tmr_owned(pf) || !ice_is_primary(&pf->hw)) @@ -3355,17 +3375,16 @@ void ice_ptp_init(struct ice_pf *pf) { struct ice_ptp *ptp = &pf->ptp; struct ice_hw *hw = &pf->hw; - int lane_num, err; + int err; ptp->state = ICE_PTP_INITIALIZING; - lane_num = ice_get_phy_lane_number(hw); - if (lane_num < 0) { - err = lane_num; + if (hw->lane_num < 0) { + err = hw->lane_num; goto err_exit; } + ptp->port.port_num = hw->lane_num; - ptp->port.port_num = (u8)lane_num; ice_ptp_init_hw(hw); ice_ptp_init_tx_interrupt_mode(pf); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c index 89bb8461284ad..ccac84eb34c9e 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c @@ -240,7 +240,7 @@ static int ice_read_cgu_reg_e82x(struct ice_hw *hw, u32 addr, u32 *val) { struct ice_sbq_msg_input cgu_msg = { .opcode = ice_sbq_msg_rd, - .dest_dev = cgu, + .dest_dev = ice_sbq_dev_cgu, .msg_addr_low = addr }; int err; @@ -272,7 +272,7 @@ static int ice_write_cgu_reg_e82x(struct ice_hw *hw, u32 addr, u32 val) { struct ice_sbq_msg_input cgu_msg = { .opcode = ice_sbq_msg_wr, - .dest_dev = cgu, + .dest_dev = ice_sbq_dev_cgu, .msg_addr_low = addr, .data = val }; @@ -874,8 +874,12 @@ static u32 ice_ptp_tmr_cmd_to_port_reg(struct ice_hw *hw, */ void ice_ptp_src_cmd(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd) { + struct ice_pf *pf = container_of(hw, struct ice_pf, hw); u32 cmd_val = ice_ptp_tmr_cmd_to_src_reg(hw, cmd); + if (!ice_is_primary(hw)) + hw = ice_get_primary_hw(pf); + wr32(hw, GLTSYN_CMD, cmd_val); } @@ -891,6 +895,9 @@ static void ice_ptp_exec_tmr_cmd(struct ice_hw *hw) { struct ice_pf *pf = container_of(hw, struct ice_pf, hw); + if (!ice_is_primary(hw)) + hw = ice_get_primary_hw(pf); + guard(spinlock)(&pf->adapter->ptp_gltsyn_time_lock); wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); ice_flush(hw); @@ -919,16 +926,24 @@ static void ice_ptp_cfg_sync_delay(const struct ice_hw *hw, u32 delay) * * Return: destination sideband queue PHY device. */ -static enum ice_sbq_msg_dev ice_ptp_get_dest_dev_e825(struct ice_hw *hw, - u8 port) +static enum ice_sbq_dev_id ice_ptp_get_dest_dev_e825(struct ice_hw *hw, + u8 port) { - /* On a single complex E825, PHY 0 is always destination device phy_0 + u8 curr_phy, tgt_phy; + + tgt_phy = port >= hw->ptp.ports_per_phy; + curr_phy = hw->lane_num >= hw->ptp.ports_per_phy; + /* In the driver, lanes 4..7 are in fact 0..3 on a second PHY. + * On a single complex E825C, PHY 0 is always destination device phy_0 * and PHY 1 is phy_0_peer. + * On dual complex E825C, device phy_0 points to PHY on a current + * complex and phy_0_peer to PHY on a different complex. */ - if (port >= hw->ptp.ports_per_phy) - return eth56g_phy_1; + if ((!ice_is_dual(hw) && tgt_phy == 1) || + (ice_is_dual(hw) && tgt_phy != curr_phy)) + return ice_sbq_dev_phy_0_peer; else - return eth56g_phy_0; + return ice_sbq_dev_phy_0; } /** @@ -2417,6 +2432,7 @@ int ice_phy_cfg_intr_eth56g(struct ice_hw *hw, u8 port, bool ena, u8 threshold) static int ice_read_phy_and_phc_time_eth56g(struct ice_hw *hw, u8 port, u64 *phy_time, u64 *phc_time) { + struct ice_pf *pf = container_of(hw, struct ice_pf, hw); u64 tx_time, rx_time; u32 zo, lo; u8 tmr_idx; @@ -2436,8 +2452,13 @@ static int ice_read_phy_and_phc_time_eth56g(struct ice_hw *hw, u8 port, ice_ptp_exec_tmr_cmd(hw); /* Read the captured PHC time from the shadow time registers */ - zo = rd32(hw, GLTSYN_SHTIME_0(tmr_idx)); - lo = rd32(hw, GLTSYN_SHTIME_L(tmr_idx)); + if (ice_is_primary(hw)) { + zo = rd32(hw, GLTSYN_SHTIME_0(tmr_idx)); + lo = rd32(hw, GLTSYN_SHTIME_L(tmr_idx)); + } else { + zo = rd32(ice_get_primary_hw(pf), GLTSYN_SHTIME_0(tmr_idx)); + lo = rd32(ice_get_primary_hw(pf), GLTSYN_SHTIME_L(tmr_idx)); + } *phc_time = (u64)lo << 32 | zo; /* Read the captured PHY time from the PHY shadow registers */ @@ -2574,6 +2595,7 @@ int ice_stop_phy_timer_eth56g(struct ice_hw *hw, u8 port, bool soft_reset) */ int ice_start_phy_timer_eth56g(struct ice_hw *hw, u8 port) { + struct ice_pf *pf = container_of(hw, struct ice_pf, hw); u32 lo, hi; u64 incval; u8 tmr_idx; @@ -2599,8 +2621,13 @@ int ice_start_phy_timer_eth56g(struct ice_hw *hw, u8 port) if (err) return err; - lo = rd32(hw, GLTSYN_INCVAL_L(tmr_idx)); - hi = rd32(hw, GLTSYN_INCVAL_H(tmr_idx)); + if (ice_is_primary(hw)) { + lo = rd32(hw, GLTSYN_INCVAL_L(tmr_idx)); + hi = rd32(hw, GLTSYN_INCVAL_H(tmr_idx)); + } else { + lo = rd32(ice_get_primary_hw(pf), GLTSYN_INCVAL_L(tmr_idx)); + hi = rd32(ice_get_primary_hw(pf), GLTSYN_INCVAL_H(tmr_idx)); + } incval = (u64)hi << 32 | lo; err = ice_write_40b_ptp_reg_eth56g(hw, port, PHY_REG_TIMETUS_L, incval); @@ -2630,25 +2657,6 @@ int ice_start_phy_timer_eth56g(struct ice_hw *hw, u8 port) return 0; } -/** - * ice_sb_access_ena_eth56g - Enable SB devices (PHY and others) access - * @hw: pointer to HW struct - * @enable: Enable or disable access - * - * Enable sideband devices (PHY and others) access. - */ -static void ice_sb_access_ena_eth56g(struct ice_hw *hw, bool enable) -{ - u32 val = rd32(hw, PF_SB_REM_DEV_CTL); - - if (enable) - val |= BIT(eth56g_phy_0) | BIT(cgu) | BIT(eth56g_phy_1); - else - val &= ~(BIT(eth56g_phy_0) | BIT(cgu) | BIT(eth56g_phy_1)); - - wr32(hw, PF_SB_REM_DEV_CTL, val); -} - /** * ice_ptp_init_phc_e825 - Perform E825 specific PHC initialization * @hw: pointer to HW struct @@ -2659,8 +2667,6 @@ static void ice_sb_access_ena_eth56g(struct ice_hw *hw, bool enable) */ static int ice_ptp_init_phc_e825(struct ice_hw *hw) { - ice_sb_access_ena_eth56g(hw, true); - /* Initialize the Clock Generation Unit */ return ice_init_cgu_e82x(hw); } @@ -2747,8 +2753,6 @@ static void ice_ptp_init_phy_e825(struct ice_hw *hw) params->num_phys = 2; ptp->ports_per_phy = 4; ptp->num_lports = params->num_phys * ptp->ports_per_phy; - - ice_sb_access_ena_eth56g(hw, true); } /* E822 family functions @@ -2781,7 +2785,7 @@ static void ice_fill_phy_msg_e82x(struct ice_hw *hw, msg->msg_addr_high = P_Q1_H(P_4_BASE + offset, phy_port); } - msg->dest_dev = rmn_0; + msg->dest_dev = ice_sbq_dev_phy_0; } /** @@ -3104,7 +3108,7 @@ static int ice_fill_quad_msg_e82x(struct ice_hw *hw, if (quad >= ICE_GET_QUAD_NUM(hw->ptp.num_lports)) return -EINVAL; - msg->dest_dev = rmn_0; + msg->dest_dev = ice_sbq_dev_phy_0; if (!(quad % ICE_GET_QUAD_NUM(hw->ptp.ports_per_phy))) addr = Q_0_BASE + offset; @@ -4823,7 +4827,7 @@ static int ice_read_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 *val) msg.msg_addr_low = lower_16_bits(addr); msg.msg_addr_high = upper_16_bits(addr); msg.opcode = ice_sbq_msg_rd; - msg.dest_dev = rmn_0; + msg.dest_dev = ice_sbq_dev_phy_0; err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD); if (err) { @@ -4853,7 +4857,7 @@ static int ice_write_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 val) msg.msg_addr_low = lower_16_bits(addr); msg.msg_addr_high = upper_16_bits(addr); msg.opcode = ice_sbq_msg_wr; - msg.dest_dev = rmn_0; + msg.dest_dev = ice_sbq_dev_phy_0; msg.data = val; err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h index e5925ccc26132..83f20fa7ace74 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h @@ -444,11 +444,6 @@ static inline u64 ice_get_base_incval(struct ice_hw *hw) } } -static inline bool ice_is_dual(struct ice_hw *hw) -{ - return !!(hw->dev_caps.nac_topo.mode & ICE_NAC_TOPO_DUAL_M); -} - #define PFTSYN_SEM_BYTES 4 #define ICE_PTP_CLOCK_INDEX_0 0x00 diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index fb7a1b9a43139..cb08746556a67 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -219,7 +219,8 @@ ice_repr_setup_tc_cls_flower(struct ice_repr *repr, { switch (flower->command) { case FLOW_CLS_REPLACE: - return ice_add_cls_flower(repr->netdev, repr->src_vsi, flower); + return ice_add_cls_flower(repr->netdev, repr->src_vsi, flower, + true); case FLOW_CLS_DESTROY: return ice_del_cls_flower(repr->src_vsi, flower); default: @@ -336,6 +337,7 @@ void ice_repr_destroy(struct ice_repr *repr) static void ice_repr_rem_vf(struct ice_repr *repr) { ice_eswitch_decfg_vsi(repr->src_vsi, repr->parent_mac); + ice_pass_vf_tx_lldp(repr->src_vsi, true); unregister_netdev(repr->netdev); ice_devlink_destroy_vf_port(repr->vf); ice_virtchnl_set_dflt_ops(repr->vf); @@ -417,6 +419,10 @@ static int ice_repr_add_vf(struct ice_repr *repr) if (err) goto err_netdev; + err = ice_drop_vf_tx_lldp(repr->src_vsi, true); + if (err) + goto err_drop_lldp; + err = ice_eswitch_cfg_vsi(repr->src_vsi, repr->parent_mac); if (err) goto err_cfg_vsi; @@ -429,6 +435,8 @@ static int ice_repr_add_vf(struct ice_repr *repr) return 0; err_cfg_vsi: + ice_pass_vf_tx_lldp(repr->src_vsi, true); +err_drop_lldp: unregister_netdev(repr->netdev); err_netdev: ice_devlink_destroy_vf_port(vf); diff --git a/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h b/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h index 3b0054faf70c7..183dd5457d6ae 100644 --- a/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h @@ -46,13 +46,10 @@ struct ice_sbq_evt_desc { u8 data[24]; }; -enum ice_sbq_msg_dev { - eth56g_phy_0 = 0x02, - rmn_0 = 0x02, - rmn_1 = 0x03, - rmn_2 = 0x04, - cgu = 0x06, - eth56g_phy_1 = 0x0D, +enum ice_sbq_dev_id { + ice_sbq_dev_phy_0 = 0x02, + ice_sbq_dev_cgu = 0x06, + ice_sbq_dev_phy_0_peer = 0x0D, }; enum ice_sbq_msg_opcode { diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c index f1648cf103b74..0e4dc1a5cff0f 100644 --- a/drivers/net/ethernet/intel/ice/ice_sriov.c +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c @@ -63,6 +63,7 @@ static void ice_free_vf_res(struct ice_vf *vf) if (vf->lan_vsi_idx != ICE_NO_VSI) { ice_vf_vsi_release(vf); vf->num_mac = 0; + vf->num_mac_lldp = 0; } last_vector_idx = vf->first_vector_idx + vf->num_msix - 1; @@ -1402,6 +1403,9 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) mutex_lock(&vf->cfg_lock); + while (!trusted && vf->num_mac_lldp) + ice_vf_update_mac_lldp_num(vf, ice_get_vf_vsi(vf), false); + vf->trusted = trusted; ice_reset_vf(vf, ICE_VF_RESET_NOTIFY); dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n", diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 4a91e0aaf0a5e..9d9a7edd3618a 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -3146,7 +3146,7 @@ ice_add_update_vsi_list(struct ice_hw *hw, u16 vsi_handle_arr[2]; /* A rule already exists with the new VSI being added */ - if (cur_fltr->fwd_id.hw_vsi_id == new_fltr->fwd_id.hw_vsi_id) + if (cur_fltr->vsi_handle == new_fltr->vsi_handle) return -EEXIST; vsi_handle_arr[0] = cur_fltr->vsi_handle; @@ -5978,7 +5978,7 @@ ice_adv_add_update_vsi_list(struct ice_hw *hw, /* A rule already exists with the new VSI being added */ if (test_bit(vsi_handle, m_entry->vsi_list_info->vsi_map)) - return 0; + return -EEXIST; /* Update the previously created VSI list set with * the new VSI ID passed in diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index ea39b999a0d00..fb9ea7f8ef444 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -12,14 +12,11 @@ /** * ice_tc_count_lkups - determine lookup count for switch filter * @flags: TC-flower flags - * @headers: Pointer to TC flower filter header structure * @fltr: Pointer to outer TC filter structure * - * Determine lookup count based on TC flower input for switch filter. + * Return: lookup count based on TC flower input for a switch filter. */ -static int -ice_tc_count_lkups(u32 flags, struct ice_tc_flower_lyr_2_4_hdrs *headers, - struct ice_tc_flower_fltr *fltr) +static int ice_tc_count_lkups(u32 flags, struct ice_tc_flower_fltr *fltr) { int lkups_cnt = 1; /* 0th lookup is metadata */ @@ -684,26 +681,26 @@ static int ice_tc_setup_action(struct net_device *filter_dev, fltr->action.fltr_act = action; if (ice_is_port_repr_netdev(filter_dev) && - ice_is_port_repr_netdev(target_dev)) { + ice_is_port_repr_netdev(target_dev) && + fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { repr = ice_netdev_to_repr(target_dev); fltr->dest_vsi = repr->src_vsi; - fltr->direction = ICE_ESWITCH_FLTR_EGRESS; } else if (ice_is_port_repr_netdev(filter_dev) && - ice_tc_is_dev_uplink(target_dev)) { + ice_tc_is_dev_uplink(target_dev) && + fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { repr = ice_netdev_to_repr(filter_dev); fltr->dest_vsi = repr->src_vsi->back->eswitch.uplink_vsi; - fltr->direction = ICE_ESWITCH_FLTR_EGRESS; } else if (ice_tc_is_dev_uplink(filter_dev) && - ice_is_port_repr_netdev(target_dev)) { + ice_is_port_repr_netdev(target_dev) && + fltr->direction == ICE_ESWITCH_FLTR_INGRESS) { repr = ice_netdev_to_repr(target_dev); fltr->dest_vsi = repr->src_vsi; - fltr->direction = ICE_ESWITCH_FLTR_INGRESS; } else { NL_SET_ERR_MSG_MOD(fltr->extack, - "Unsupported netdevice in switchdev mode"); + "The action is not supported for this netdevice"); return -EINVAL; } @@ -716,13 +713,11 @@ ice_tc_setup_drop_action(struct net_device *filter_dev, { fltr->action.fltr_act = ICE_DROP_PACKET; - if (ice_is_port_repr_netdev(filter_dev)) { - fltr->direction = ICE_ESWITCH_FLTR_EGRESS; - } else if (ice_tc_is_dev_uplink(filter_dev)) { - fltr->direction = ICE_ESWITCH_FLTR_INGRESS; - } else { + if (!ice_tc_is_dev_uplink(filter_dev) && + !(ice_is_port_repr_netdev(filter_dev) && + fltr->direction == ICE_ESWITCH_FLTR_INGRESS)) { NL_SET_ERR_MSG_MOD(fltr->extack, - "Unsupported netdevice in switchdev mode"); + "The action is not supported for this netdevice"); return -EINVAL; } @@ -767,10 +762,157 @@ static int ice_eswitch_tc_parse_action(struct net_device *filter_dev, return 0; } +static bool ice_is_fltr_lldp(struct ice_tc_flower_fltr *fltr) +{ + return fltr->outer_headers.l2_key.n_proto == htons(ETH_P_LLDP); +} + +static bool ice_is_fltr_pf_tx_lldp(struct ice_tc_flower_fltr *fltr) +{ + struct ice_vsi *vsi = fltr->src_vsi, *uplink; + + if (!ice_is_switchdev_running(vsi->back)) + return false; + + uplink = vsi->back->eswitch.uplink_vsi; + return vsi == uplink && fltr->action.fltr_act == ICE_DROP_PACKET && + ice_is_fltr_lldp(fltr) && + fltr->direction == ICE_ESWITCH_FLTR_EGRESS && + fltr->flags == ICE_TC_FLWR_FIELD_ETH_TYPE_ID; +} + +static bool ice_is_fltr_vf_tx_lldp(struct ice_tc_flower_fltr *fltr) +{ + struct ice_vsi *vsi = fltr->src_vsi, *uplink; + + uplink = vsi->back->eswitch.uplink_vsi; + return fltr->src_vsi->type == ICE_VSI_VF && ice_is_fltr_lldp(fltr) && + fltr->direction == ICE_ESWITCH_FLTR_EGRESS && + fltr->dest_vsi == uplink; +} + +static struct ice_tc_flower_fltr * +ice_find_pf_tx_lldp_fltr(struct ice_pf *pf) +{ + struct ice_tc_flower_fltr *fltr; + + hlist_for_each_entry(fltr, &pf->tc_flower_fltr_list, tc_flower_node) + if (ice_is_fltr_pf_tx_lldp(fltr)) + return fltr; + + return NULL; +} + +static bool ice_any_vf_lldp_tx_ena(struct ice_pf *pf) +{ + struct ice_vf *vf; + unsigned int bkt; + + ice_for_each_vf(pf, bkt, vf) + if (vf->lldp_tx_ena) + return true; + + return false; +} + +int ice_pass_vf_tx_lldp(struct ice_vsi *vsi, bool deinit) +{ + struct ice_rule_query_data remove_entry = { + .rid = vsi->vf->lldp_recipe_id, + .rule_id = vsi->vf->lldp_rule_id, + .vsi_handle = vsi->idx, + }; + struct ice_pf *pf = vsi->back; + int err; + + if (vsi->vf->lldp_tx_ena) + return 0; + + if (!deinit && !ice_find_pf_tx_lldp_fltr(vsi->back)) + return -EINVAL; + + if (!deinit && ice_any_vf_lldp_tx_ena(pf)) + return -EINVAL; + + err = ice_rem_adv_rule_by_id(&pf->hw, &remove_entry); + if (!err) + vsi->vf->lldp_tx_ena = true; + + return err; +} + +int ice_drop_vf_tx_lldp(struct ice_vsi *vsi, bool init) +{ + struct ice_rule_query_data rule_added; + struct ice_adv_rule_info rinfo = { + .priority = 7, + .src_vsi = vsi->idx, + .sw_act = { + .src = vsi->idx, + .flag = ICE_FLTR_TX, + .fltr_act = ICE_DROP_PACKET, + .vsi_handle = vsi->idx, + }, + .flags_info.act_valid = true, + }; + struct ice_adv_lkup_elem list[3]; + struct ice_pf *pf = vsi->back; + int err; + + if (!init && !vsi->vf->lldp_tx_ena) + return 0; + + memset(list, 0, sizeof(list)); + ice_rule_add_direction_metadata(&list[0]); + ice_rule_add_src_vsi_metadata(&list[1]); + list[2].type = ICE_ETYPE_OL; + list[2].h_u.ethertype.ethtype_id = htons(ETH_P_LLDP); + list[2].m_u.ethertype.ethtype_id = htons(0xFFFF); + + err = ice_add_adv_rule(&pf->hw, list, ARRAY_SIZE(list), &rinfo, + &rule_added); + if (err) { + dev_err(&pf->pdev->dev, + "Failed to add an LLDP rule to VSI 0x%X: %d\n", + vsi->idx, err); + } else { + vsi->vf->lldp_recipe_id = rule_added.rid; + vsi->vf->lldp_rule_id = rule_added.rule_id; + vsi->vf->lldp_tx_ena = false; + } + + return err; +} + +static void ice_handle_add_pf_lldp_drop_rule(struct ice_vsi *vsi) +{ + struct ice_tc_flower_fltr *fltr; + struct ice_pf *pf = vsi->back; + + hlist_for_each_entry(fltr, &pf->tc_flower_fltr_list, tc_flower_node) { + if (!ice_is_fltr_vf_tx_lldp(fltr)) + continue; + ice_pass_vf_tx_lldp(fltr->src_vsi, true); + break; + } +} + +static void ice_handle_del_pf_lldp_drop_rule(struct ice_pf *pf) +{ + int i; + + /* Make the VF LLDP fwd to uplink rule dormant */ + ice_for_each_vsi(pf, i) { + struct ice_vsi *vf_vsi = pf->vsi[i]; + + if (vf_vsi && vf_vsi->type == ICE_VSI_VF) + ice_drop_vf_tx_lldp(vf_vsi, false); + } +} + static int ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) { - struct ice_tc_flower_lyr_2_4_hdrs *headers = &fltr->outer_headers; struct ice_adv_rule_info rule_info = { 0 }; struct ice_rule_query_data rule_added; struct ice_hw *hw = &vsi->back->hw; @@ -785,7 +927,10 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) return -EOPNOTSUPP; } - lkups_cnt = ice_tc_count_lkups(flags, headers, fltr); + if (ice_is_fltr_vf_tx_lldp(fltr)) + return ice_pass_vf_tx_lldp(vsi, false); + + lkups_cnt = ice_tc_count_lkups(flags, fltr); list = kcalloc(lkups_cnt, sizeof(*list), GFP_ATOMIC); if (!list) return -ENOMEM; @@ -813,6 +958,11 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) rule_info.sw_act.flag |= ICE_FLTR_RX; rule_info.sw_act.src = hw->pf_id; rule_info.flags_info.act = ICE_SINGLE_ACT_LB_ENABLE; + } else if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS && + !fltr->dest_vsi && vsi == vsi->back->eswitch.uplink_vsi) { + /* PF to Uplink */ + rule_info.sw_act.flag |= ICE_FLTR_TX; + rule_info.sw_act.src = vsi->idx; } else if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS && fltr->dest_vsi == vsi->back->eswitch.uplink_vsi) { /* VF to Uplink */ @@ -846,11 +996,17 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) NL_SET_ERR_MSG_MOD(fltr->extack, "Unable to add filter because it already exist"); ret = -EINVAL; goto exit; + } else if (ret == -ENOSPC) { + NL_SET_ERR_MSG_MOD(fltr->extack, "Unable to add filter: insufficient space available."); + goto exit; } else if (ret) { NL_SET_ERR_MSG_MOD(fltr->extack, "Unable to add filter due to error"); goto exit; } + if (ice_is_fltr_pf_tx_lldp(fltr)) + ice_handle_add_pf_lldp_drop_rule(vsi); + /* store the output params, which are needed later for removing * advanced switch filter */ @@ -985,7 +1141,6 @@ static int ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *tc_fltr) { - struct ice_tc_flower_lyr_2_4_hdrs *headers = &tc_fltr->outer_headers; struct ice_adv_rule_info rule_info = {0}; struct ice_rule_query_data rule_added; struct ice_adv_lkup_elem *list; @@ -1021,7 +1176,7 @@ ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, return PTR_ERR(dest_vsi); } - lkups_cnt = ice_tc_count_lkups(flags, headers, tc_fltr); + lkups_cnt = ice_tc_count_lkups(flags, tc_fltr); list = kcalloc(lkups_cnt, sizeof(*list), GFP_ATOMIC); if (!list) return -ENOMEM; @@ -1056,8 +1211,13 @@ ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, tc_fltr->action.fwd.q.hw_queue, lkups_cnt); break; case ICE_DROP_PACKET: - rule_info.sw_act.flag |= ICE_FLTR_RX; - rule_info.sw_act.src = hw->pf_id; + if (tc_fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { + rule_info.sw_act.flag |= ICE_FLTR_TX; + rule_info.sw_act.src = vsi->idx; + } else { + rule_info.sw_act.flag |= ICE_FLTR_RX; + rule_info.sw_act.src = hw->pf_id; + } rule_info.priority = ICE_SWITCH_FLTR_PRIO_VSI; break; default: @@ -1071,6 +1231,10 @@ ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, "Unable to add filter because it already exist"); ret = -EINVAL; goto exit; + } else if (ret == -ENOSPC) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter: insufficient space available."); + goto exit; } else if (ret) { NL_SET_ERR_MSG_MOD(tc_fltr->extack, "Unable to add filter due to error"); @@ -1463,11 +1627,16 @@ ice_parse_tunnel_attr(struct net_device *dev, struct flow_rule *rule, * @filter_dev: Pointer to device on which filter is being added * @f: Pointer to struct flow_cls_offload * @fltr: Pointer to filter structure + * @ingress: if the rule is added to an ingress block + * + * Return: 0 if the flower was parsed successfully, -EINVAL if the flower + * cannot be parsed, -EOPNOTSUPP if such filter cannot be configured + * for the given VSI. */ static int ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, struct flow_cls_offload *f, - struct ice_tc_flower_fltr *fltr) + struct ice_tc_flower_fltr *fltr, bool ingress) { struct ice_tc_flower_lyr_2_4_hdrs *headers = &fltr->outer_headers; struct flow_rule *rule = flow_cls_offload_flow_rule(f); @@ -1551,6 +1720,20 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, fltr->flags |= ICE_TC_FLWR_FIELD_ETH_TYPE_ID; } + if (!ingress) { + bool switchdev = + ice_is_eswitch_mode_switchdev(vsi->back); + + if (switchdev != (n_proto_key == ETH_P_LLDP)) { + NL_SET_ERR_MSG_FMT_MOD(fltr->extack, + "%sLLDP filtering is not supported on egress in %s mode", + switchdev ? "Non-" : "", + switchdev ? "switchdev" : + "legacy"); + return -EOPNOTSUPP; + } + } + headers->l2_key.n_proto = cpu_to_be16(n_proto_key); headers->l2_mask.n_proto = cpu_to_be16(n_proto_mask); headers->l3_key.ip_proto = match.key->ip_proto; @@ -1726,6 +1909,14 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, return -EINVAL; } } + + /* Ingress filter on representor results in an egress filter in HW + * and vice versa + */ + ingress = ice_is_port_repr_netdev(filter_dev) ? !ingress : ingress; + fltr->direction = ingress ? ICE_ESWITCH_FLTR_INGRESS : + ICE_ESWITCH_FLTR_EGRESS; + return 0; } @@ -1939,6 +2130,12 @@ static int ice_del_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) struct ice_pf *pf = vsi->back; int err; + if (ice_is_fltr_pf_tx_lldp(fltr)) + ice_handle_del_pf_lldp_drop_rule(pf); + + if (ice_is_fltr_vf_tx_lldp(fltr)) + return ice_drop_vf_tx_lldp(vsi, false); + rule_rem.rid = fltr->rid; rule_rem.rule_id = fltr->rule_id; rule_rem.vsi_handle = fltr->dest_vsi_handle; @@ -1975,14 +2172,18 @@ static int ice_del_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) * @vsi: Pointer to VSI * @f: Pointer to flower offload structure * @__fltr: Pointer to struct ice_tc_flower_fltr + * @ingress: if the rule is added to an ingress block * * This function parses TC-flower input fields, parses action, * and adds a filter. + * + * Return: 0 if the filter was successfully added, + * negative error code otherwise. */ static int ice_add_tc_fltr(struct net_device *netdev, struct ice_vsi *vsi, struct flow_cls_offload *f, - struct ice_tc_flower_fltr **__fltr) + struct ice_tc_flower_fltr **__fltr, bool ingress) { struct ice_tc_flower_fltr *fltr; int err; @@ -1999,7 +2200,7 @@ ice_add_tc_fltr(struct net_device *netdev, struct ice_vsi *vsi, fltr->src_vsi = vsi; INIT_HLIST_NODE(&fltr->tc_flower_node); - err = ice_parse_cls_flower(netdev, vsi, f, fltr); + err = ice_parse_cls_flower(netdev, vsi, f, fltr, ingress); if (err < 0) goto err; @@ -2042,10 +2243,13 @@ ice_find_tc_flower_fltr(struct ice_pf *pf, unsigned long cookie) * @netdev: Pointer to filter device * @vsi: Pointer to VSI * @cls_flower: Pointer to flower offload structure + * @ingress: if the rule is added to an ingress block + * + * Return: 0 if the flower was successfully added, + * negative error code otherwise. */ -int -ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, - struct flow_cls_offload *cls_flower) +int ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower, bool ingress) { struct netlink_ext_ack *extack = cls_flower->common.extack; struct net_device *vsi_netdev = vsi->netdev; @@ -2080,7 +2284,7 @@ ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, } /* prep and add TC-flower filter in HW */ - err = ice_add_tc_fltr(netdev, vsi, cls_flower, &fltr); + err = ice_add_tc_fltr(netdev, vsi, cls_flower, &fltr, ingress); if (err) return err; diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.h b/drivers/net/ethernet/intel/ice/ice_tc_lib.h index d84f153517ec5..8a3ab2f22af9b 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.h @@ -211,13 +211,14 @@ static inline int ice_chnl_dmac_fltr_cnt(struct ice_pf *pf) } struct ice_vsi *ice_locate_vsi_using_queue(struct ice_vsi *vsi, int queue); -int -ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, - struct flow_cls_offload *cls_flower); -int -ice_del_cls_flower(struct ice_vsi *vsi, struct flow_cls_offload *cls_flower); +int ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower, bool ingress); +int ice_del_cls_flower(struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower); void ice_replay_tc_fltrs(struct ice_pf *pf); bool ice_is_tunnel_supported(struct net_device *dev); +int ice_drop_vf_tx_lldp(struct ice_vsi *vsi, bool init); +int ice_pass_vf_tx_lldp(struct ice_vsi *vsi, bool deinit); static inline bool ice_is_forward_action(enum ice_sw_fwd_act_type fltr_act) { diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 1e4f6f6ee449c..0e5107fe62ad5 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -2440,19 +2440,20 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring) /* allow CONTROL frames egress from main VSI if FW LLDP disabled */ eth = (struct ethhdr *)skb_mac_header(skb); - if (unlikely((skb->priority == TC_PRIO_CONTROL || - eth->h_proto == htons(ETH_P_LLDP)) && - vsi->type == ICE_VSI_PF && - vsi->port_info->qos_cfg.is_sw_lldp)) - offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | - ICE_TX_CTX_DESC_SWTCH_UPLINK << - ICE_TXD_CTX_QW1_CMD_S); - ice_tstamp(tx_ring, skb, first, &offload); if ((ice_is_switchdev_running(vsi->back) || ice_lag_is_switchdev_running(vsi->back)) && vsi->type != ICE_VSI_SF) ice_eswitch_set_target_vsi(skb, &offload); + else if (unlikely((skb->priority == TC_PRIO_CONTROL || + eth->h_proto == htons(ETH_P_LLDP)) && + vsi->type == ICE_VSI_PF && + vsi->port_info->qos_cfg.is_sw_lldp)) + offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | + ICE_TX_CTX_DESC_SWTCH_UPLINK << + ICE_TXD_CTX_QW1_CMD_S); + + ice_tstamp(tx_ring, skb, first, &offload); if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) { struct ice_tx_ctx_desc *cdesc; diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 0aab21113cc43..ccf53cc6403e9 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -970,6 +970,7 @@ struct ice_hw { u8 intrl_gran; struct ice_ptp_hw ptp; + s8 lane_num; /* Active package version (currently active) */ struct ice_pkg_ver active_pkg_ver; diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.c b/drivers/net/ethernet/intel/ice/ice_vf_lib.c index 815ad0bfe8326..48cd533e93b74 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.c @@ -226,6 +226,7 @@ static void ice_vf_clear_counters(struct ice_vf *vf) vsi->num_vlan = 0; vf->num_mac = 0; + vf->num_mac_lldp = 0; memset(&vf->mdd_tx_events, 0, sizeof(vf->mdd_tx_events)); memset(&vf->mdd_rx_events, 0, sizeof(vf->mdd_rx_events)); } @@ -1401,3 +1402,28 @@ struct ice_vsi *ice_get_vf_ctrl_vsi(struct ice_pf *pf, struct ice_vsi *vsi) rcu_read_unlock(); return ctrl_vsi; } + +/** + * ice_vf_update_mac_lldp_num - update the VF's number of LLDP addresses + * @vf: a VF to add the address to + * @vsi: the corresponding VSI + * @incr: is the rule added or removed + */ +void ice_vf_update_mac_lldp_num(struct ice_vf *vf, struct ice_vsi *vsi, + bool incr) +{ + bool lldp_by_fw = test_bit(ICE_FLAG_FW_LLDP_AGENT, vsi->back->flags); + bool was_ena = ice_vf_is_lldp_ena(vf) && !lldp_by_fw; + bool is_ena; + + if (WARN_ON(!vsi)) { + vf->num_mac_lldp = 0; + return; + } + + vf->num_mac_lldp += incr ? 1 : -1; + is_ena = ice_vf_is_lldp_ena(vf) && !lldp_by_fw; + + if (was_ena != is_ena) + ice_vsi_cfg_sw_lldp(vsi, false, is_ena); +} diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.h b/drivers/net/ethernet/intel/ice/ice_vf_lib.h index 799b2c1f11844..482f4285fd350 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.h @@ -124,6 +124,7 @@ struct ice_vf { u8 spoofchk:1; u8 link_forced:1; u8 link_up:1; /* only valid if VF link is forced */ + u8 lldp_tx_ena:1; u32 ptp_caps; @@ -134,6 +135,7 @@ struct ice_vf { unsigned long vf_caps; /* VF's adv. capabilities */ u8 num_req_qs; /* num of queue pairs requested by VF */ u16 num_mac; + u16 num_mac_lldp; u16 num_vf_qs; /* num of queue configured per VF */ u8 vlan_strip_ena; /* Outer and Inner VLAN strip enable */ #define ICE_INNER_VLAN_STRIP_ENA BIT(0) @@ -149,6 +151,9 @@ struct ice_vf { /* devlink port data */ struct devlink_port devlink_port; + u16 lldp_recipe_id; + u16 lldp_rule_id; + u16 num_msix; /* num of MSI-X configured on this VF */ struct ice_vf_qs_bw qs_bw[ICE_MAX_RSS_QS_PER_VF]; }; @@ -180,6 +185,11 @@ static inline u16 ice_vf_get_port_vlan_tpid(struct ice_vf *vf) return vf->port_vlan_info.tpid; } +static inline bool ice_vf_is_lldp_ena(struct ice_vf *vf) +{ + return vf->num_mac_lldp && vf->trusted; +} + /* VF Hash Table access functions * * These functions provide abstraction for interacting with the VF hash table. @@ -245,6 +255,8 @@ ice_vf_clear_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m); int ice_reset_vf(struct ice_vf *vf, u32 flags); void ice_reset_all_vfs(struct ice_pf *pf); struct ice_vsi *ice_get_vf_ctrl_vsi(struct ice_pf *pf, struct ice_vsi *vsi); +void ice_vf_update_mac_lldp_num(struct ice_vf *vf, struct ice_vsi *vsi, + bool incr); #else /* CONFIG_PCI_IOV */ static inline struct ice_vf *ice_get_vf_by_id(struct ice_pf *pf, u16 vf_id) { diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index 7c3006eb68dd0..b147f6cf26151 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -2265,6 +2265,51 @@ ice_vfhw_mac_add(struct ice_vf *vf, struct virtchnl_ether_addr *vc_ether_addr) } } +/** + * ice_is_mc_lldp_eth_addr - check if the given MAC is a multicast LLDP address + * @mac: address to check + * + * Return: true if the address is one of the three possible LLDP multicast + * addresses, false otherwise. + */ +static bool ice_is_mc_lldp_eth_addr(const u8 *mac) +{ + const u8 lldp_mac_base[] = {0x01, 0x80, 0xc2, 0x00, 0x00}; + + if (memcmp(mac, lldp_mac_base, sizeof(lldp_mac_base))) + return false; + + return (mac[5] == 0x0e || mac[5] == 0x03 || mac[5] == 0x00); +} + +/** + * ice_vc_can_add_mac - check if the VF is allowed to add a given MAC + * @vf: a VF to add the address to + * @mac: address to check + * + * Return: true if the VF is allowed to add such MAC address, false otherwise. + */ +static bool ice_vc_can_add_mac(const struct ice_vf *vf, const u8 *mac) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + + if (is_unicast_ether_addr(mac) && + !ice_can_vf_change_mac((struct ice_vf *)vf)) { + dev_err(dev, + "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); + return false; + } + + if (!vf->trusted && ice_is_mc_lldp_eth_addr(mac)) { + dev_warn(dev, + "An untrusted VF %u is attempting to configure an LLDP multicast address\n", + vf->vf_id); + return false; + } + + return true; +} + /** * ice_vc_add_mac_addr - attempt to add the MAC address passed in * @vf: pointer to the VF info @@ -2283,10 +2328,8 @@ ice_vc_add_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, if (ether_addr_equal(mac_addr, vf->dev_lan_addr)) return 0; - if (is_unicast_ether_addr(mac_addr) && !ice_can_vf_change_mac(vf)) { - dev_err(dev, "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); + if (!ice_vc_can_add_mac(vf, mac_addr)) return -EPERM; - } ret = ice_fltr_add_mac(vsi, mac_addr, ICE_FWD_TO_VSI); if (ret == -EEXIST) { @@ -2301,6 +2344,8 @@ ice_vc_add_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, return ret; } else { vf->num_mac++; + if (ice_is_mc_lldp_eth_addr(mac_addr)) + ice_vf_update_mac_lldp_num(vf, vsi, true); } ice_vfhw_mac_add(vf, vc_ether_addr); @@ -2395,6 +2440,8 @@ ice_vc_del_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, ice_vfhw_mac_del(vf, vc_ether_addr); vf->num_mac--; + if (ice_is_mc_lldp_eth_addr(mac_addr)) + ice_vf_update_mac_lldp_num(vf, vsi, false); return 0; } diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index f323e1c1989f1..793c960162880 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -502,13 +502,6 @@ static int igb_ptp_feature_enable_82580(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Both the rising and falling edge are timestamped */ if (rq->extts.flags & PTP_STRICT_FLAGS && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -658,13 +651,6 @@ static int igb_ptp_feature_enable_i210(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests failing to enable both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -1356,6 +1342,9 @@ void igb_ptp_init(struct igb_adapter *adapter) adapter->ptp_caps.n_per_out = IGB_N_PEROUT; adapter->ptp_caps.n_pins = IGB_N_SDP; adapter->ptp_caps.pps = 0; + adapter->ptp_caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; adapter->ptp_caps.pin_config = adapter->sdp_config; adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580; adapter->ptp_caps.adjtime = igb_ptp_adjtime_82576; @@ -1378,6 +1367,9 @@ void igb_ptp_init(struct igb_adapter *adapter) adapter->ptp_caps.n_ext_ts = IGB_N_EXTTS; adapter->ptp_caps.n_per_out = IGB_N_PEROUT; adapter->ptp_caps.n_pins = IGB_N_SDP; + adapter->ptp_caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; adapter->ptp_caps.pps = 1; adapter->ptp_caps.pin_config = adapter->sdp_config; adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index b1669d7cf4359..ddfa654db1e03 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7125,6 +7125,9 @@ static int igc_probe(struct pci_dev *pdev, netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_XSK_ZEROCOPY; + /* enable HW vlan tag insertion/stripping by default */ + netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; + /* MTU range: 68 - 9216 */ netdev->min_mtu = ETH_MIN_MTU; netdev->max_mtu = MAX_STD_JUMBO_FRAME_SIZE; diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 612ed26a29c5d..cfb589f9da560 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -257,13 +257,6 @@ static int igc_ptp_feature_enable_i225(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests failing to enable both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -300,10 +293,6 @@ static int igc_ptp_feature_enable_i225(struct ptp_clock_info *ptp, return 0; case PTP_CLK_REQ_PEROUT: - /* Reject requests with unsupported flags */ - if (rq->perout.flags) - return -EOPNOTSUPP; - if (on) { pin = ptp_find_pin(igc->ptp_clock, PTP_PF_PEROUT, rq->perout.index); @@ -1162,6 +1151,9 @@ void igc_ptp_init(struct igc_adapter *adapter) adapter->ptp_caps.pin_config = adapter->sdp_config; adapter->ptp_caps.n_ext_ts = IGC_N_EXTTS; adapter->ptp_caps.n_per_out = IGC_N_PEROUT; + adapter->ptp_caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; adapter->ptp_caps.n_pins = IGC_N_SDP; adapter->ptp_caps.verify = igc_ptp_verify_pin; diff --git a/drivers/net/ethernet/intel/ixgbe/Makefile b/drivers/net/ethernet/intel/ixgbe/Makefile index b456d102655ac..ce447540d146e 100644 --- a/drivers/net/ethernet/intel/ixgbe/Makefile +++ b/drivers/net/ethernet/intel/ixgbe/Makefile @@ -4,12 +4,13 @@ # Makefile for the Intel(R) 10GbE PCI Express ethernet driver # +subdir-ccflags-y += -I$(src) obj-$(CONFIG_IXGBE) += ixgbe.o ixgbe-y := ixgbe_main.o ixgbe_common.o ixgbe_ethtool.o \ ixgbe_82599.o ixgbe_82598.o ixgbe_phy.o ixgbe_sriov.o \ ixgbe_mbx.o ixgbe_x540.o ixgbe_x550.o ixgbe_lib.o ixgbe_ptp.o \ - ixgbe_xsk.o ixgbe_e610.o + ixgbe_xsk.o ixgbe_e610.o devlink/devlink.o ixgbe_fw_update.o ixgbe-$(CONFIG_IXGBE_DCB) += ixgbe_dcb.o ixgbe_dcb_82598.o \ ixgbe_dcb_82599.o ixgbe_dcb_nl.o diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c new file mode 100644 index 0000000000000..54f1b83dfe42e --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c @@ -0,0 +1,557 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025, Intel Corporation. */ + +#include "ixgbe.h" +#include "devlink.h" +#include "ixgbe_fw_update.h" + +struct ixgbe_info_ctx { + char buf[128]; + struct ixgbe_orom_info pending_orom; + struct ixgbe_nvm_info pending_nvm; + struct ixgbe_netlist_info pending_netlist; + struct ixgbe_hw_dev_caps dev_caps; +}; + +enum ixgbe_devlink_version_type { + IXGBE_DL_VERSION_RUNNING, + IXGBE_DL_VERSION_STORED +}; + +static void ixgbe_info_get_dsn(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) +{ + u8 dsn[8]; + + /* Copy the DSN into an array in Big Endian format */ + put_unaligned_be64(pci_get_dsn(adapter->pdev), dsn); + + snprintf(ctx->buf, sizeof(ctx->buf), "%8phD", dsn); +} + +static void ixgbe_info_orom_ver(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) +{ + struct ixgbe_hw *hw = &adapter->hw; + struct ixgbe_nvm_version nvm_ver; + + ctx->buf[0] = '\0'; + + if (hw->mac.type == ixgbe_mac_e610) { + struct ixgbe_orom_info *orom = &adapter->hw.flash.orom; + + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_orom) + orom = &ctx->pending_orom; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", + orom->major, orom->build, orom->patch); + return; + } + + ixgbe_get_oem_prod_version(hw, &nvm_ver); + if (nvm_ver.oem_valid) { + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%x.%x", + nvm_ver.oem_major, nvm_ver.oem_minor, + nvm_ver.oem_release); + + return; + } + + ixgbe_get_orom_version(hw, &nvm_ver); + if (nvm_ver.or_valid) + snprintf(ctx->buf, sizeof(ctx->buf), "%d.%d.%d", + nvm_ver.or_major, nvm_ver.or_build, nvm_ver.or_patch); +} + +static void ixgbe_info_eetrack(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) +{ + struct ixgbe_hw *hw = &adapter->hw; + struct ixgbe_nvm_version nvm_ver; + + if (hw->mac.type == ixgbe_mac_e610) { + u32 eetrack = hw->flash.nvm.eetrack; + + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_nvm) + eetrack = ctx->pending_nvm.eetrack; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", eetrack); + return; + } + + ixgbe_get_oem_prod_version(hw, &nvm_ver); + + /* No ETRACK version for OEM */ + if (nvm_ver.oem_valid) { + ctx->buf[0] = '\0'; + return; + } + + ixgbe_get_etk_id(hw, &nvm_ver); + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", nvm_ver.etk_id); +} + +static void ixgbe_info_fw_api(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) +{ + struct ixgbe_hw *hw = &adapter->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", + hw->api_maj_ver, hw->api_min_ver, hw->api_patch); +} + +static void ixgbe_info_fw_build(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) +{ + struct ixgbe_hw *hw = &adapter->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", hw->fw_build); +} + +static void ixgbe_info_fw_srev(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) +{ + struct ixgbe_nvm_info *nvm = &adapter->hw.flash.nvm; + + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_nvm) + nvm = &ctx->pending_nvm; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u", nvm->srev); +} + +static void ixgbe_info_orom_srev(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) +{ + struct ixgbe_orom_info *orom = &adapter->hw.flash.orom; + + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_orom) + orom = &ctx->pending_orom; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u", orom->srev); +} + +static void ixgbe_info_nvm_ver(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) +{ + struct ixgbe_nvm_info *nvm = &adapter->hw.flash.nvm; + + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_nvm) + nvm = &ctx->pending_nvm; + + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%02x", nvm->major, nvm->minor); +} + +static void ixgbe_info_netlist_ver(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) +{ + struct ixgbe_netlist_info *netlist = &adapter->hw.flash.netlist; + + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_netlist) + netlist = &ctx->pending_netlist; + + /* The netlist version fields are BCD formatted */ + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%x.%x-%x.%x.%x", + netlist->major, netlist->minor, + netlist->type >> 16, netlist->type & 0xFFFF, + netlist->rev, netlist->cust_ver); +} + +static void ixgbe_info_netlist_build(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) +{ + struct ixgbe_netlist_info *netlist = &adapter->hw.flash.netlist; + + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_netlist) + netlist = &ctx->pending_netlist; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", netlist->hash); +} + +static int ixgbe_set_ctx_dev_caps(struct ixgbe_hw *hw, + struct ixgbe_info_ctx *ctx, + struct netlink_ext_ack *extack) +{ + bool *pending_orom, *pending_nvm, *pending_netlist; + int err; + + err = ixgbe_discover_dev_caps(hw, &ctx->dev_caps); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Unable to discover device capabilities"); + return err; + } + + pending_orom = &ctx->dev_caps.common_cap.nvm_update_pending_orom; + pending_nvm = &ctx->dev_caps.common_cap.nvm_update_pending_nvm; + pending_netlist = &ctx->dev_caps.common_cap.nvm_update_pending_netlist; + + if (*pending_orom) { + err = ixgbe_get_inactive_orom_ver(hw, &ctx->pending_orom); + if (err) + *pending_orom = false; + } + + if (*pending_nvm) { + err = ixgbe_get_inactive_nvm_ver(hw, &ctx->pending_nvm); + if (err) + *pending_nvm = false; + } + + if (*pending_netlist) { + err = ixgbe_get_inactive_netlist_ver(hw, &ctx->pending_netlist); + if (err) + *pending_netlist = false; + } + + return 0; +} + +static int ixgbe_devlink_info_get_e610(struct ixgbe_adapter *adapter, + struct devlink_info_req *req, + struct ixgbe_info_ctx *ctx) +{ + int err; + + ixgbe_info_fw_api(adapter, ctx); + err = devlink_info_version_running_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW_MGMT_API, + ctx->buf); + if (err) + return err; + + ixgbe_info_fw_build(adapter, ctx); + err = devlink_info_version_running_put(req, "fw.mgmt.build", ctx->buf); + if (err) + return err; + + ixgbe_info_fw_srev(adapter, ctx, IXGBE_DL_VERSION_RUNNING); + err = devlink_info_version_running_put(req, "fw.mgmt.srev", ctx->buf); + if (err) + return err; + + ixgbe_info_orom_srev(adapter, ctx, IXGBE_DL_VERSION_RUNNING); + err = devlink_info_version_running_put(req, "fw.undi.srev", ctx->buf); + if (err) + return err; + + ixgbe_info_nvm_ver(adapter, ctx, IXGBE_DL_VERSION_RUNNING); + err = devlink_info_version_running_put(req, "fw.psid.api", ctx->buf); + if (err) + return err; + + ixgbe_info_netlist_ver(adapter, ctx, IXGBE_DL_VERSION_RUNNING); + err = devlink_info_version_running_put(req, "fw.netlist", ctx->buf); + if (err) + return err; + + ixgbe_info_netlist_build(adapter, ctx, IXGBE_DL_VERSION_RUNNING); + return devlink_info_version_running_put(req, "fw.netlist.build", + ctx->buf); +} + +static int +ixgbe_devlink_pending_info_get_e610(struct ixgbe_adapter *adapter, + struct devlink_info_req *req, + struct ixgbe_info_ctx *ctx) +{ + int err; + + ixgbe_info_orom_ver(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW_UNDI, + ctx->buf); + if (err) + return err; + + ixgbe_info_eetrack(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID, + ctx->buf); + if (err) + return err; + + ixgbe_info_fw_srev(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, "fw.mgmt.srev", ctx->buf); + if (err) + return err; + + ixgbe_info_orom_srev(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, "fw.undi.srev", ctx->buf); + if (err) + return err; + + ixgbe_info_nvm_ver(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, "fw.psid.api", ctx->buf); + if (err) + return err; + + ixgbe_info_netlist_ver(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, "fw.netlist", ctx->buf); + if (err) + return err; + + ixgbe_info_netlist_build(adapter, ctx, IXGBE_DL_VERSION_STORED); + return devlink_info_version_stored_put(req, "fw.netlist.build", + ctx->buf); +} + +static int ixgbe_devlink_info_get(struct devlink *devlink, + struct devlink_info_req *req, + struct netlink_ext_ack *extack) +{ + struct ixgbe_adapter *adapter = devlink_priv(devlink); + struct ixgbe_hw *hw = &adapter->hw; + struct ixgbe_info_ctx *ctx; + int err; + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (hw->mac.type == ixgbe_mac_e610) + ixgbe_refresh_fw_version(adapter); + + ixgbe_info_get_dsn(adapter, ctx); + err = devlink_info_serial_number_put(req, ctx->buf); + if (err) + goto free_ctx; + + err = hw->eeprom.ops.read_pba_string(hw, ctx->buf, sizeof(ctx->buf)); + if (err) + goto free_ctx; + + err = devlink_info_version_fixed_put(req, + DEVLINK_INFO_VERSION_GENERIC_BOARD_ID, + ctx->buf); + if (err) + goto free_ctx; + + ixgbe_info_orom_ver(adapter, ctx, IXGBE_DL_VERSION_RUNNING); + err = devlink_info_version_running_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW_UNDI, + ctx->buf); + if (err) + goto free_ctx; + + ixgbe_info_eetrack(adapter, ctx, IXGBE_DL_VERSION_RUNNING); + err = devlink_info_version_running_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID, + ctx->buf); + if (err || hw->mac.type != ixgbe_mac_e610) + goto free_ctx; + + err = ixgbe_set_ctx_dev_caps(hw, ctx, extack); + if (err) + goto free_ctx; + + err = ixgbe_devlink_info_get_e610(adapter, req, ctx); + if (err) + goto free_ctx; + + err = ixgbe_devlink_pending_info_get_e610(adapter, req, ctx); +free_ctx: + kfree(ctx); + return err; +} + +/** + * ixgbe_devlink_reload_empr_start - Start EMP reset to activate new firmware + * @devlink: pointer to the devlink instance to reload + * @netns_change: if true, the network namespace is changing + * @action: the action to perform. Must be DEVLINK_RELOAD_ACTION_FW_ACTIVATE + * @limit: limits on what reload should do, such as not resetting + * @extack: netlink extended ACK structure + * + * Allow user to activate new Embedded Management Processor firmware by + * issuing device specific EMP reset. Called in response to + * a DEVLINK_CMD_RELOAD with the DEVLINK_RELOAD_ACTION_FW_ACTIVATE. + * + * Note that teardown and rebuild of the driver state happens automatically as + * part of an interrupt and watchdog task. This is because all physical + * functions on the device must be able to reset when an EMP reset occurs from + * any source. + * + * Return: the exit code of the operation. + */ +static int ixgbe_devlink_reload_empr_start(struct devlink *devlink, + bool netns_change, + enum devlink_reload_action action, + enum devlink_reload_limit limit, + struct netlink_ext_ack *extack) +{ + struct ixgbe_adapter *adapter = devlink_priv(devlink); + struct ixgbe_hw *hw = &adapter->hw; + u8 pending; + int err; + + if (hw->mac.type != ixgbe_mac_e610) + return -EOPNOTSUPP; + + err = ixgbe_get_pending_updates(adapter, &pending, extack); + if (err) + return err; + + /* Pending is a bitmask of which flash banks have a pending update, + * including the main NVM bank, the Option ROM bank, and the netlist + * bank. If any of these bits are set, then there is a pending update + * waiting to be activated. + */ + if (!pending) { + NL_SET_ERR_MSG_MOD(extack, "No pending firmware update"); + return -ECANCELED; + } + + if (adapter->fw_emp_reset_disabled) { + NL_SET_ERR_MSG_MOD(extack, + "EMP reset is not available. To activate firmware, a reboot or power cycle is needed"); + return -ECANCELED; + } + + err = ixgbe_aci_nvm_update_empr(hw); + if (err) + NL_SET_ERR_MSG_MOD(extack, + "Failed to trigger EMP device reset to reload firmware"); + + return err; +} + +/*Wait for 10 sec with 0.5 sec tic. EMPR takes no less than half of a sec */ +#define IXGBE_DEVLINK_RELOAD_TIMEOUT_SEC 20 + +/** + * ixgbe_devlink_reload_empr_finish - finishes EMP reset + * @devlink: pointer to the devlink instance + * @action: the action to perform. + * @limit: limits on what reload should do + * @actions_performed: actions performed + * @extack: netlink extended ACK structure + * + * Wait for new NVM to be loaded during EMP reset. + * + * Return: -ETIME when timer is exceeded, 0 on success. + */ +static int ixgbe_devlink_reload_empr_finish(struct devlink *devlink, + enum devlink_reload_action action, + enum devlink_reload_limit limit, + u32 *actions_performed, + struct netlink_ext_ack *extack) +{ + struct ixgbe_adapter *adapter = devlink_priv(devlink); + struct ixgbe_hw *hw = &adapter->hw; + int i = 0; + u32 fwsm; + + do { + /* Just right away after triggering EMP reset the FWSM register + * may be not cleared yet, so begin the loop with the delay + * in order to not check the not updated register. + */ + mdelay(500); + + fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM(hw)); + + if (i++ >= IXGBE_DEVLINK_RELOAD_TIMEOUT_SEC) + return -ETIME; + + } while (!(fwsm & IXGBE_FWSM_FW_VAL_BIT)); + + *actions_performed = BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE); + + adapter->flags2 &= ~(IXGBE_FLAG2_API_MISMATCH | + IXGBE_FLAG2_FW_ROLLBACK); + + return 0; +} + +static const struct devlink_ops ixgbe_devlink_ops = { + .info_get = ixgbe_devlink_info_get, + .supported_flash_update_params = + DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK, + .flash_update = ixgbe_flash_pldm_image, + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE), + .reload_down = ixgbe_devlink_reload_empr_start, + .reload_up = ixgbe_devlink_reload_empr_finish, +}; + +/** + * ixgbe_allocate_devlink - Allocate devlink instance + * @dev: device to allocate devlink for + * + * Allocate a devlink instance for this physical function. + * + * Return: pointer to the device adapter structure on success, + * ERR_PTR(-ENOMEM) when allocation failed. + */ +struct ixgbe_adapter *ixgbe_allocate_devlink(struct device *dev) +{ + struct ixgbe_adapter *adapter; + struct devlink *devlink; + + devlink = devlink_alloc(&ixgbe_devlink_ops, sizeof(*adapter), dev); + if (!devlink) + return ERR_PTR(-ENOMEM); + + adapter = devlink_priv(devlink); + adapter->devlink = devlink; + + return adapter; +} + +/** + * ixgbe_devlink_set_switch_id - Set unique switch ID based on PCI DSN + * @adapter: pointer to the device adapter structure + * @ppid: struct with switch id information + */ +static void ixgbe_devlink_set_switch_id(struct ixgbe_adapter *adapter, + struct netdev_phys_item_id *ppid) +{ + u64 id = pci_get_dsn(adapter->pdev); + + ppid->id_len = sizeof(id); + put_unaligned_be64(id, &ppid->id); +} + +/** + * ixgbe_devlink_register_port - Register devlink port + * @adapter: pointer to the device adapter structure + * + * Create and register a devlink_port for this physical function. + * + * Return: 0 on success, error code on failure. + */ +int ixgbe_devlink_register_port(struct ixgbe_adapter *adapter) +{ + struct devlink_port *devlink_port = &adapter->devlink_port; + struct devlink *devlink = adapter->devlink; + struct device *dev = &adapter->pdev->dev; + struct devlink_port_attrs attrs = {}; + int err; + + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + attrs.phys.port_number = adapter->hw.bus.func; + ixgbe_devlink_set_switch_id(adapter, &attrs.switch_id); + + devlink_port_attrs_set(devlink_port, &attrs); + + err = devl_port_register(devlink, devlink_port, 0); + if (err) { + dev_err(dev, + "devlink port registration failed, err %d\n", err); + } + + return err; +} diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h new file mode 100644 index 0000000000000..0b27653a34072 --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025, Intel Corporation. */ + +#ifndef _IXGBE_DEVLINK_H_ +#define _IXGBE_DEVLINK_H_ + +struct ixgbe_adapter *ixgbe_allocate_devlink(struct device *dev); +int ixgbe_devlink_register_port(struct ixgbe_adapter *adapter); + +#endif /* _IXGBE_DEVLINK_H_ */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index e6a380d4929bd..23c2e2c2649ce 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -17,6 +17,8 @@ #include #include +#include + #include "ixgbe_type.h" #include "ixgbe_common.h" #include "ixgbe_dcb.h" @@ -612,6 +614,8 @@ struct ixgbe_adapter { struct bpf_prog *xdp_prog; struct pci_dev *pdev; struct mii_bus *mii_bus; + struct devlink *devlink; + struct devlink_port devlink_port; unsigned long state; @@ -667,6 +671,8 @@ struct ixgbe_adapter { #define IXGBE_FLAG2_PHY_FW_LOAD_FAILED BIT(20) #define IXGBE_FLAG2_NO_MEDIA BIT(21) #define IXGBE_FLAG2_MOD_POWER_UNSUPPORTED BIT(22) +#define IXGBE_FLAG2_API_MISMATCH BIT(23) +#define IXGBE_FLAG2_FW_ROLLBACK BIT(24) /* Tx fast path data */ int num_tx_queues; @@ -755,6 +761,8 @@ struct ixgbe_adapter { u32 atr_sample_rate; spinlock_t fdir_perfect_lock; + bool fw_emp_reset_disabled; + #ifdef IXGBE_FCOE struct ixgbe_fcoe fcoe; #endif /* IXGBE_FCOE */ @@ -830,6 +838,17 @@ struct ixgbe_adapter { spinlock_t vfs_lock; }; +struct ixgbe_netdevice_priv { + struct ixgbe_adapter *adapter; +}; + +static inline struct ixgbe_adapter *ixgbe_from_netdev(struct net_device *netdev) +{ + struct ixgbe_netdevice_priv *priv = netdev_priv(netdev); + + return priv->adapter; +} + static inline int ixgbe_determine_xdp_q_idx(int cpu) { if (static_key_enabled(&ixgbe_xdp_locking_key)) @@ -945,6 +964,8 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter); int ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter); bool ixgbe_wol_supported(struct ixgbe_adapter *adapter, u16 device_id, u16 subdevice_id); +void ixgbe_set_fw_version_e610(struct ixgbe_adapter *adapter); +void ixgbe_refresh_fw_version(struct ixgbe_adapter *adapter); #ifdef CONFIG_PCI_IOV void ixgbe_full_sync_mac_table(struct ixgbe_adapter *adapter); #endif diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c index 4aaaea3b5f8ff..444da982593f6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c @@ -1169,6 +1169,7 @@ static const struct ixgbe_eeprom_operations eeprom_ops_82598 = { .calc_checksum = &ixgbe_calc_eeprom_checksum_generic, .validate_checksum = &ixgbe_validate_eeprom_checksum_generic, .update_checksum = &ixgbe_update_eeprom_checksum_generic, + .read_pba_string = &ixgbe_read_pba_string_generic, }; static const struct ixgbe_phy_operations phy_ops_82598 = { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c index 964988b4d58b8..d5b1b974b4a33 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c @@ -2230,6 +2230,7 @@ static const struct ixgbe_eeprom_operations eeprom_ops_82599 = { .calc_checksum = &ixgbe_calc_eeprom_checksum_generic, .validate_checksum = &ixgbe_validate_eeprom_checksum_generic, .update_checksum = &ixgbe_update_eeprom_checksum_generic, + .read_pba_string = &ixgbe_read_pba_string_generic, }; static const struct ixgbe_phy_operations phy_ops_82599 = { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 7beaf6ea57f9e..5784d5d1896e0 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -332,6 +332,7 @@ int ixgbe_start_hw_generic(struct ixgbe_hw *hw) * Devices in the second generation: * 82599 * X540 + * E610 **/ int ixgbe_start_hw_gen2(struct ixgbe_hw *hw) { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c index 19d6b6fa8fb38..3dd5a16a14df1 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c @@ -118,14 +118,14 @@ static int ixgbe_copy_dcb_cfg(struct ixgbe_adapter *adapter, int tc_max) static u8 ixgbe_dcbnl_get_state(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); return !!(adapter->flags & IXGBE_FLAG_DCB_ENABLED); } static u8 ixgbe_dcbnl_set_state(struct net_device *netdev, u8 state) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); /* Fail command if not in CEE mode */ if (!(adapter->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) @@ -142,7 +142,7 @@ static u8 ixgbe_dcbnl_set_state(struct net_device *netdev, u8 state) static void ixgbe_dcbnl_get_perm_hw_addr(struct net_device *netdev, u8 *perm_addr) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); int i, j; memset(perm_addr, 0xff, MAX_ADDR_LEN); @@ -167,7 +167,7 @@ static void ixgbe_dcbnl_set_pg_tc_cfg_tx(struct net_device *netdev, int tc, u8 prio, u8 bwg_id, u8 bw_pct, u8 up_map) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (prio != DCB_ATTR_VALUE_UNDEFINED) adapter->temp_dcb_cfg.tc_config[tc].path[0].prio_type = prio; @@ -184,7 +184,7 @@ static void ixgbe_dcbnl_set_pg_tc_cfg_tx(struct net_device *netdev, int tc, static void ixgbe_dcbnl_set_pg_bwg_cfg_tx(struct net_device *netdev, int bwg_id, u8 bw_pct) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); adapter->temp_dcb_cfg.bw_percentage[0][bwg_id] = bw_pct; } @@ -193,7 +193,7 @@ static void ixgbe_dcbnl_set_pg_tc_cfg_rx(struct net_device *netdev, int tc, u8 prio, u8 bwg_id, u8 bw_pct, u8 up_map) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (prio != DCB_ATTR_VALUE_UNDEFINED) adapter->temp_dcb_cfg.tc_config[tc].path[1].prio_type = prio; @@ -210,7 +210,7 @@ static void ixgbe_dcbnl_set_pg_tc_cfg_rx(struct net_device *netdev, int tc, static void ixgbe_dcbnl_set_pg_bwg_cfg_rx(struct net_device *netdev, int bwg_id, u8 bw_pct) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); adapter->temp_dcb_cfg.bw_percentage[1][bwg_id] = bw_pct; } @@ -219,7 +219,7 @@ static void ixgbe_dcbnl_get_pg_tc_cfg_tx(struct net_device *netdev, int tc, u8 *prio, u8 *bwg_id, u8 *bw_pct, u8 *up_map) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); *prio = adapter->dcb_cfg.tc_config[tc].path[0].prio_type; *bwg_id = adapter->dcb_cfg.tc_config[tc].path[0].bwg_id; @@ -230,7 +230,7 @@ static void ixgbe_dcbnl_get_pg_tc_cfg_tx(struct net_device *netdev, int tc, static void ixgbe_dcbnl_get_pg_bwg_cfg_tx(struct net_device *netdev, int bwg_id, u8 *bw_pct) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); *bw_pct = adapter->dcb_cfg.bw_percentage[0][bwg_id]; } @@ -239,7 +239,7 @@ static void ixgbe_dcbnl_get_pg_tc_cfg_rx(struct net_device *netdev, int tc, u8 *prio, u8 *bwg_id, u8 *bw_pct, u8 *up_map) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); *prio = adapter->dcb_cfg.tc_config[tc].path[1].prio_type; *bwg_id = adapter->dcb_cfg.tc_config[tc].path[1].bwg_id; @@ -250,7 +250,7 @@ static void ixgbe_dcbnl_get_pg_tc_cfg_rx(struct net_device *netdev, int tc, static void ixgbe_dcbnl_get_pg_bwg_cfg_rx(struct net_device *netdev, int bwg_id, u8 *bw_pct) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); *bw_pct = adapter->dcb_cfg.bw_percentage[1][bwg_id]; } @@ -258,7 +258,7 @@ static void ixgbe_dcbnl_get_pg_bwg_cfg_rx(struct net_device *netdev, int bwg_id, static void ixgbe_dcbnl_set_pfc_cfg(struct net_device *netdev, int priority, u8 setting) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); adapter->temp_dcb_cfg.tc_config[priority].dcb_pfc = setting; if (adapter->temp_dcb_cfg.tc_config[priority].dcb_pfc != @@ -269,14 +269,14 @@ static void ixgbe_dcbnl_set_pfc_cfg(struct net_device *netdev, int priority, static void ixgbe_dcbnl_get_pfc_cfg(struct net_device *netdev, int priority, u8 *setting) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); *setting = adapter->dcb_cfg.tc_config[priority].dcb_pfc; } static void ixgbe_dcbnl_devreset(struct net_device *dev) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); while (test_and_set_bit(__IXGBE_RESETTING, &adapter->state)) usleep_range(1000, 2000); @@ -295,7 +295,7 @@ static void ixgbe_dcbnl_devreset(struct net_device *dev) static u8 ixgbe_dcbnl_set_all(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_dcb_config *dcb_cfg = &adapter->dcb_cfg; struct ixgbe_hw *hw = &adapter->hw; int ret = DCB_NO_HW_CHG; @@ -383,7 +383,7 @@ static u8 ixgbe_dcbnl_set_all(struct net_device *netdev) static u8 ixgbe_dcbnl_getcap(struct net_device *netdev, int capid, u8 *cap) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); switch (capid) { case DCB_CAP_ATTR_PG: @@ -420,7 +420,7 @@ static u8 ixgbe_dcbnl_getcap(struct net_device *netdev, int capid, u8 *cap) static int ixgbe_dcbnl_getnumtcs(struct net_device *netdev, int tcid, u8 *num) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) { switch (tcid) { @@ -447,14 +447,14 @@ static int ixgbe_dcbnl_setnumtcs(struct net_device *netdev, int tcid, u8 num) static u8 ixgbe_dcbnl_getpfcstate(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); return adapter->dcb_cfg.pfc_mode_enable; } static void ixgbe_dcbnl_setpfcstate(struct net_device *netdev, u8 state) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); adapter->temp_dcb_cfg.pfc_mode_enable = state; } @@ -471,7 +471,7 @@ static void ixgbe_dcbnl_setpfcstate(struct net_device *netdev, u8 state) */ static int ixgbe_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct dcb_app app = { .selector = idtype, .protocol = id, @@ -486,7 +486,7 @@ static int ixgbe_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id) static int ixgbe_dcbnl_ieee_getets(struct net_device *dev, struct ieee_ets *ets) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ieee_ets *my_ets = adapter->ixgbe_ieee_ets; ets->ets_cap = adapter->dcb_cfg.num_tcs.pg_tcs; @@ -506,7 +506,7 @@ static int ixgbe_dcbnl_ieee_getets(struct net_device *dev, static int ixgbe_dcbnl_ieee_setets(struct net_device *dev, struct ieee_ets *ets) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); int max_frame = dev->mtu + ETH_HLEN + ETH_FCS_LEN; int i, err; __u8 max_tc = 0; @@ -559,7 +559,7 @@ static int ixgbe_dcbnl_ieee_setets(struct net_device *dev, static int ixgbe_dcbnl_ieee_getpfc(struct net_device *dev, struct ieee_pfc *pfc) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ieee_pfc *my_pfc = adapter->ixgbe_ieee_pfc; int i; @@ -584,7 +584,7 @@ static int ixgbe_dcbnl_ieee_getpfc(struct net_device *dev, static int ixgbe_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_hw *hw = &adapter->hw; u8 *prio_tc; int err; @@ -616,7 +616,7 @@ static int ixgbe_dcbnl_ieee_setpfc(struct net_device *dev, static int ixgbe_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); int err; if (!(adapter->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) @@ -661,7 +661,7 @@ static int ixgbe_dcbnl_ieee_setapp(struct net_device *dev, static int ixgbe_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); int err; if (!(adapter->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) @@ -705,13 +705,13 @@ static int ixgbe_dcbnl_ieee_delapp(struct net_device *dev, static u8 ixgbe_dcbnl_getdcbx(struct net_device *dev) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); return adapter->dcbx_cap; } static u8 ixgbe_dcbnl_setdcbx(struct net_device *dev, u8 mode) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ieee_ets ets = {0}; struct ieee_pfc pfc = {0}; int err = 0; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 00935747c8c55..9ada35f7d8f76 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -343,6 +343,40 @@ void ixgbe_fill_dflt_direct_cmd_desc(struct ixgbe_aci_desc *desc, u16 opcode) desc->flags = cpu_to_le16(IXGBE_ACI_FLAG_SI); } +/** + * ixgbe_aci_get_fw_ver - Get the firmware version + * @hw: pointer to the HW struct + * + * Get the firmware version using ACI command (0x0001). + * + * Return: the exit code of the operation. + */ +static int ixgbe_aci_get_fw_ver(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_get_ver *resp; + struct ixgbe_aci_desc desc; + int err; + + resp = &desc.params.get_ver; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_get_ver); + + err = ixgbe_aci_send_cmd(hw, &desc, NULL, 0); + if (!err) { + hw->fw_branch = resp->fw_branch; + hw->fw_maj_ver = resp->fw_major; + hw->fw_min_ver = resp->fw_minor; + hw->fw_patch = resp->fw_patch; + hw->fw_build = le32_to_cpu(resp->fw_build); + hw->api_branch = resp->api_branch; + hw->api_maj_ver = resp->api_major; + hw->api_min_ver = resp->api_minor; + hw->api_patch = resp->api_patch; + } + + return err; +} + /** * ixgbe_aci_req_res - request a common resource * @hw: pointer to the HW struct @@ -554,6 +588,20 @@ static bool ixgbe_parse_e610_caps(struct ixgbe_hw *hw, break; case IXGBE_ACI_CAPS_NVM_VER: break; + case IXGBE_ACI_CAPS_PENDING_NVM_VER: + caps->nvm_update_pending_nvm = true; + break; + case IXGBE_ACI_CAPS_PENDING_OROM_VER: + caps->nvm_update_pending_orom = true; + break; + case IXGBE_ACI_CAPS_PENDING_NET_VER: + caps->nvm_update_pending_netlist = true; + break; + case IXGBE_ACI_CAPS_NVM_MGMT: + caps->nvm_unified_update = + (number & IXGBE_NVM_MGMT_UNIFIED_UPD_SUPPORT) ? + true : false; + break; case IXGBE_ACI_CAPS_MAX_MTU: caps->max_mtu = number; break; @@ -1410,6 +1458,32 @@ int ixgbe_configure_lse(struct ixgbe_hw *hw, bool activate, u16 mask) return ixgbe_aci_get_link_info(hw, activate, NULL); } +/** + * ixgbe_start_hw_e610 - Prepare hardware for Tx/Rx + * @hw: pointer to hardware structure + * + * Get firmware version and start the hardware using the generic + * start_hw() and ixgbe_start_hw_gen2() functions. + * + * Return: the exit code of the operation. + */ +static int ixgbe_start_hw_e610(struct ixgbe_hw *hw) +{ + int err; + + err = ixgbe_aci_get_fw_ver(hw); + if (err) + return err; + + err = ixgbe_start_hw_generic(hw); + if (err) + return err; + + ixgbe_start_hw_gen2(hw); + + return 0; +} + /** * ixgbe_get_media_type_e610 - Gets media type * @hw: pointer to the HW struct @@ -1742,6 +1816,38 @@ void ixgbe_disable_rx_e610(struct ixgbe_hw *hw) } } +/** + * ixgbe_fw_recovery_mode_e610 - Check FW NVM recovery mode + * @hw: pointer to hardware structure + * + * Check FW NVM recovery mode by reading the value of + * the dedicated register. + * + * Return: true if FW is in recovery mode, otherwise false. + */ +static bool ixgbe_fw_recovery_mode_e610(struct ixgbe_hw *hw) +{ + u32 fwsm = IXGBE_READ_REG(hw, IXGBE_GL_MNG_FWSM); + + return !!(fwsm & IXGBE_GL_MNG_FWSM_RECOVERY_M); +} + +/** + * ixgbe_fw_rollback_mode_e610 - Check FW NVM rollback mode + * @hw: pointer to hardware structure + * + * Check FW NVM rollback mode by reading the value of + * the dedicated register. + * + * Return: true if FW is in rollback mode, otherwise false. + */ +static bool ixgbe_fw_rollback_mode_e610(struct ixgbe_hw *hw) +{ + u32 fwsm = IXGBE_READ_REG(hw, IXGBE_GL_MNG_FWSM); + + return !!(fwsm & IXGBE_GL_MNG_FWSM_ROLLBACK_M); +} + /** * ixgbe_init_phy_ops_e610 - PHY specific init * @hw: pointer to hardware structure @@ -2225,6 +2331,131 @@ int ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 module_typeid, u32 offset, return ixgbe_aci_send_cmd(hw, &desc, data, length); } +/** + * ixgbe_aci_erase_nvm - erase NVM sector + * @hw: pointer to the HW struct + * @module_typeid: module pointer location in words from the NVM beginning + * + * Erase the NVM sector using the ACI command (0x0702). + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_erase_nvm(struct ixgbe_hw *hw, u16 module_typeid) +{ + struct ixgbe_aci_cmd_nvm *cmd; + struct ixgbe_aci_desc desc; + __le16 len; + int err; + + /* Read a length value from SR, so module_typeid is equal to 0, + * calculate offset where module size is placed from bytes to words + * set last command and read from SR values to true. + */ + err = ixgbe_aci_read_nvm(hw, 0, 2 * module_typeid + 2, 2, &len, true, + true); + if (err) + return err; + + cmd = &desc.params.nvm; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_erase); + + cmd->module_typeid = cpu_to_le16(module_typeid); + cmd->length = len; + cmd->offset_low = 0; + cmd->offset_high = 0; + + return ixgbe_aci_send_cmd(hw, &desc, NULL, 0); +} + +/** + * ixgbe_aci_update_nvm - update NVM + * @hw: pointer to the HW struct + * @module_typeid: module pointer location in words from the NVM beginning + * @offset: byte offset from the module beginning + * @length: length of the section to be written (in bytes from the offset) + * @data: command buffer (size [bytes] = length) + * @last_command: tells if this is the last command in a series + * @command_flags: command parameters + * + * Update the NVM using the ACI command (0x0703). + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_update_nvm(struct ixgbe_hw *hw, u16 module_typeid, + u32 offset, u16 length, void *data, + bool last_command, u8 command_flags) +{ + struct ixgbe_aci_cmd_nvm *cmd; + struct ixgbe_aci_desc desc; + + cmd = &desc.params.nvm; + + /* In offset the highest byte must be zeroed. */ + if (offset & 0xFF000000) + return -EINVAL; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_write); + + cmd->cmd_flags |= command_flags; + + /* If this is the last command in a series, set the proper flag. */ + if (last_command) + cmd->cmd_flags |= IXGBE_ACI_NVM_LAST_CMD; + cmd->module_typeid = cpu_to_le16(module_typeid); + cmd->offset_low = cpu_to_le16(offset & 0xFFFF); + cmd->offset_high = FIELD_GET(IXGBE_ACI_NVM_OFFSET_HI_U_MASK, offset); + cmd->length = cpu_to_le16(length); + + desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD); + + return ixgbe_aci_send_cmd(hw, &desc, data, length); +} + +/** + * ixgbe_nvm_write_activate - NVM activate write + * @hw: pointer to the HW struct + * @cmd_flags: flags for write activate command + * @response_flags: response indicators from firmware + * + * Update the control word with the required banks' validity bits + * and dumps the Shadow RAM to flash using ACI command (0x0707). + * + * cmd_flags controls which banks to activate, the preservation level to use + * when activating the NVM bank, and whether an EMP reset is required for + * activation. + * + * Note that the 16bit cmd_flags value is split between two separate 1 byte + * flag values in the descriptor. + * + * On successful return of the firmware command, the response_flags variable + * is updated with the flags reported by firmware indicating certain status, + * such as whether EMP reset is enabled. + * + * Return: the exit code of the operation. + */ +int ixgbe_nvm_write_activate(struct ixgbe_hw *hw, u16 cmd_flags, + u8 *response_flags) +{ + struct ixgbe_aci_cmd_nvm *cmd; + struct ixgbe_aci_desc desc; + s32 err; + + cmd = &desc.params.nvm; + ixgbe_fill_dflt_direct_cmd_desc(&desc, + ixgbe_aci_opc_nvm_write_activate); + + cmd->cmd_flags = (u8)(cmd_flags & 0xFF); + cmd->offset_high = (u8)FIELD_GET(IXGBE_ACI_NVM_OFFSET_HI_A_MASK, + cmd_flags); + + err = ixgbe_aci_send_cmd(hw, &desc, NULL, 0); + if (!err && response_flags) + *response_flags = cmd->cmd_flags; + + return err; +} + /** * ixgbe_nvm_validate_checksum - validate checksum * @hw: pointer to the HW struct @@ -2267,160 +2498,1109 @@ int ixgbe_nvm_validate_checksum(struct ixgbe_hw *hw) } /** - * ixgbe_read_sr_word_aci - Reads Shadow RAM via ACI - * @hw: pointer to the HW structure - * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) - * @data: word read from the Shadow RAM + * ixgbe_discover_flash_size - Discover the available flash size + * @hw: pointer to the HW struct * - * Reads one 16 bit word from the Shadow RAM using ixgbe_read_flat_nvm. + * The device flash could be up to 16MB in size. However, it is possible that + * the actual size is smaller. Use bisection to determine the accessible size + * of flash memory. * * Return: the exit code of the operation. */ -int ixgbe_read_sr_word_aci(struct ixgbe_hw *hw, u16 offset, u16 *data) +static int ixgbe_discover_flash_size(struct ixgbe_hw *hw) { - u32 bytes = sizeof(u16); - u16 data_local; + u32 min_size = 0, max_size = IXGBE_ACI_NVM_MAX_OFFSET + 1; int err; - err = ixgbe_read_flat_nvm(hw, offset * sizeof(u16), &bytes, - (u8 *)&data_local, true); + err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); if (err) return err; - *data = data_local; - return 0; + while ((max_size - min_size) > 1) { + u32 offset = (max_size + min_size) / 2; + u32 len = 1; + u8 data; + + err = ixgbe_read_flat_nvm(hw, offset, &len, &data, false); + if (err == -EIO && + hw->aci.last_status == IXGBE_ACI_RC_EINVAL) { + err = 0; + max_size = offset; + } else if (!err) { + min_size = offset; + } else { + /* an unexpected error occurred */ + goto err_read_flat_nvm; + } + } + + hw->flash.flash_size = max_size; + +err_read_flat_nvm: + ixgbe_release_nvm(hw); + + return err; } /** - * ixgbe_read_flat_nvm - Read portion of NVM by flat offset - * @hw: pointer to the HW struct - * @offset: offset from beginning of NVM - * @length: (in) number of bytes to read; (out) number of bytes actually read - * @data: buffer to return data in (sized to fit the specified length) - * @read_shadow_ram: if true, read from shadow RAM instead of NVM + * ixgbe_read_sr_base_address - Read the value of a Shadow RAM pointer word + * @hw: pointer to the HW structure + * @offset: the word offset of the Shadow RAM word to read + * @pointer: pointer value read from Shadow RAM * - * Reads a portion of the NVM, as a flat memory space. This function correctly - * breaks read requests across Shadow RAM sectors, prevents Shadow RAM size - * from being exceeded in case of Shadow RAM read requests and ensures that no - * single read request exceeds the maximum 4KB read for a single admin command. + * Read the given Shadow RAM word, and convert it to a pointer value specified + * in bytes. This function assumes the specified offset is a valid pointer + * word. * - * Returns an error code on failure. Note that the data pointer may be - * partially updated if some reads succeed before a failure. + * Each pointer word specifies whether it is stored in word size or 4KB + * sector size by using the highest bit. The reported pointer value will be in + * bytes, intended for flat NVM reads. * * Return: the exit code of the operation. */ -int ixgbe_read_flat_nvm(struct ixgbe_hw *hw, u32 offset, u32 *length, - u8 *data, bool read_shadow_ram) +static int ixgbe_read_sr_base_address(struct ixgbe_hw *hw, u16 offset, + u32 *pointer) { - u32 inlen = *length; - u32 bytes_read = 0; - bool last_cmd; + u16 value; int err; - /* Verify the length of the read if this is for the Shadow RAM */ - if (read_shadow_ram && ((offset + inlen) > - (hw->eeprom.word_size * 2u))) - return -EINVAL; - - do { - u32 read_size, sector_offset; - - /* ixgbe_aci_read_nvm cannot read more than 4KB at a time. - * Additionally, a read from the Shadow RAM may not cross over - * a sector boundary. Conveniently, the sector size is also 4KB. - */ - sector_offset = offset % IXGBE_ACI_MAX_BUFFER_SIZE; - read_size = min_t(u32, - IXGBE_ACI_MAX_BUFFER_SIZE - sector_offset, - inlen - bytes_read); - - last_cmd = !(bytes_read + read_size < inlen); - - /* ixgbe_aci_read_nvm takes the length as a u16. Our read_size - * is calculated using a u32, but the IXGBE_ACI_MAX_BUFFER_SIZE - * maximum size guarantees that it will fit within the 2 bytes. - */ - err = ixgbe_aci_read_nvm(hw, IXGBE_ACI_NVM_START_POINT, - offset, (u16)read_size, - data + bytes_read, last_cmd, - read_shadow_ram); - if (err) - break; + err = ixgbe_read_ee_aci_e610(hw, offset, &value); + if (err) + return err; - bytes_read += read_size; - offset += read_size; - } while (!last_cmd); + /* Determine if the pointer is in 4KB or word units */ + if (value & IXGBE_SR_NVM_PTR_4KB_UNITS) + *pointer = (value & ~IXGBE_SR_NVM_PTR_4KB_UNITS) * SZ_4K; + else + *pointer = value * sizeof(u16); - *length = bytes_read; - return err; + return 0; } /** - * ixgbe_read_sr_buf_aci - Read Shadow RAM buffer via ACI + * ixgbe_read_sr_area_size - Read an area size from a Shadow RAM word * @hw: pointer to the HW structure - * @offset: offset of the Shadow RAM words to read (0x000000 - 0x001FFF) - * @words: (in) number of words to read; (out) number of words actually read - * @data: words read from the Shadow RAM + * @offset: the word offset of the Shadow RAM to read + * @size: size value read from the Shadow RAM * - * Read 16 bit words (data buf) from the Shadow RAM. Acquire/release the NVM - * ownership. + * Read the given Shadow RAM word, and convert it to an area size value + * specified in bytes. This function assumes the specified offset is a valid + * area size word. * - * Return: the operation exit code. + * Each area size word is specified in 4KB sector units. This function reports + * the size in bytes, intended for flat NVM reads. + * + * Return: the exit code of the operation. */ -int ixgbe_read_sr_buf_aci(struct ixgbe_hw *hw, u16 offset, u16 *words, - u16 *data) +static int ixgbe_read_sr_area_size(struct ixgbe_hw *hw, u16 offset, u32 *size) { - u32 bytes = *words * 2; + u16 value; int err; - err = ixgbe_read_flat_nvm(hw, offset * 2, &bytes, (u8 *)data, true); + err = ixgbe_read_ee_aci_e610(hw, offset, &value); if (err) return err; - *words = bytes / 2; - - for (int i = 0; i < *words; i++) - data[i] = le16_to_cpu(((__le16 *)data)[i]); + /* Area sizes are always specified in 4KB units */ + *size = value * SZ_4K; return 0; } /** - * ixgbe_read_ee_aci_e610 - Read EEPROM word using the admin command. - * @hw: pointer to hardware structure - * @offset: offset of word in the EEPROM to read - * @data: word read from the EEPROM + * ixgbe_determine_active_flash_banks - Discover active bank for each module + * @hw: pointer to the HW struct * - * Reads a 16 bit word from the EEPROM using the ACI. - * If the EEPROM params are not initialized, the function - * initialize them before proceeding with reading. - * The function acquires and then releases the NVM ownership. + * Read the Shadow RAM control word and determine which banks are active for + * the NVM, OROM, and Netlist modules. Also read and calculate the associated + * pointer and size. These values are then cached into the ixgbe_flash_info + * structure for later use in order to calculate the correct offset to read + * from the active module. * * Return: the exit code of the operation. */ -int ixgbe_read_ee_aci_e610(struct ixgbe_hw *hw, u16 offset, u16 *data) +static int ixgbe_determine_active_flash_banks(struct ixgbe_hw *hw) { + struct ixgbe_bank_info *banks = &hw->flash.banks; + u16 ctrl_word; int err; - if (hw->eeprom.type == ixgbe_eeprom_uninitialized) { - err = hw->eeprom.ops.init_params(hw); - if (err) - return err; - } - - err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); + err = ixgbe_read_ee_aci_e610(hw, IXGBE_E610_SR_NVM_CTRL_WORD, + &ctrl_word); if (err) return err; - err = ixgbe_read_sr_word_aci(hw, offset, data); - ixgbe_release_nvm(hw); + if (FIELD_GET(IXGBE_SR_CTRL_WORD_1_M, ctrl_word) != + IXGBE_SR_CTRL_WORD_VALID) + return -ENODATA; - return err; -} + if (!(ctrl_word & IXGBE_SR_CTRL_WORD_NVM_BANK)) + banks->nvm_bank = IXGBE_1ST_FLASH_BANK; + else + banks->nvm_bank = IXGBE_2ND_FLASH_BANK; -/** - * ixgbe_read_ee_aci_buffer_e610 - Read EEPROM words via ACI - * @hw: pointer to hardware structure + if (!(ctrl_word & IXGBE_SR_CTRL_WORD_OROM_BANK)) + banks->orom_bank = IXGBE_1ST_FLASH_BANK; + else + banks->orom_bank = IXGBE_2ND_FLASH_BANK; + + if (!(ctrl_word & IXGBE_SR_CTRL_WORD_NETLIST_BANK)) + banks->netlist_bank = IXGBE_1ST_FLASH_BANK; + else + banks->netlist_bank = IXGBE_2ND_FLASH_BANK; + + err = ixgbe_read_sr_base_address(hw, IXGBE_E610_SR_1ST_NVM_BANK_PTR, + &banks->nvm_ptr); + if (err) + return err; + + err = ixgbe_read_sr_area_size(hw, IXGBE_E610_SR_NVM_BANK_SIZE, + &banks->nvm_size); + if (err) + return err; + + err = ixgbe_read_sr_base_address(hw, IXGBE_E610_SR_1ST_OROM_BANK_PTR, + &banks->orom_ptr); + if (err) + return err; + + err = ixgbe_read_sr_area_size(hw, IXGBE_E610_SR_OROM_BANK_SIZE, + &banks->orom_size); + if (err) + return err; + + err = ixgbe_read_sr_base_address(hw, IXGBE_E610_SR_NETLIST_BANK_PTR, + &banks->netlist_ptr); + if (err) + return err; + + err = ixgbe_read_sr_area_size(hw, IXGBE_E610_SR_NETLIST_BANK_SIZE, + &banks->netlist_size); + + return err; +} + +/** + * ixgbe_get_flash_bank_offset - Get offset into requested flash bank + * @hw: pointer to the HW structure + * @bank: whether to read from the active or inactive flash bank + * @module: the module to read from + * + * Based on the module, lookup the module offset from the beginning of the + * flash. + * + * Return: the flash offset. Note that a value of zero is invalid and must be + * treated as an error. + */ +static int ixgbe_get_flash_bank_offset(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u16 module) +{ + struct ixgbe_bank_info *banks = &hw->flash.banks; + enum ixgbe_flash_bank active_bank; + bool second_bank_active; + u32 offset, size; + + switch (module) { + case IXGBE_E610_SR_1ST_NVM_BANK_PTR: + offset = banks->nvm_ptr; + size = banks->nvm_size; + active_bank = banks->nvm_bank; + break; + case IXGBE_E610_SR_1ST_OROM_BANK_PTR: + offset = banks->orom_ptr; + size = banks->orom_size; + active_bank = banks->orom_bank; + break; + case IXGBE_E610_SR_NETLIST_BANK_PTR: + offset = banks->netlist_ptr; + size = banks->netlist_size; + active_bank = banks->netlist_bank; + break; + default: + return 0; + } + + switch (active_bank) { + case IXGBE_1ST_FLASH_BANK: + second_bank_active = false; + break; + case IXGBE_2ND_FLASH_BANK: + second_bank_active = true; + break; + default: + return 0; + } + + /* The second flash bank is stored immediately following the first + * bank. Based on whether the 1st or 2nd bank is active, and whether + * we want the active or inactive bank, calculate the desired offset. + */ + switch (bank) { + case IXGBE_ACTIVE_FLASH_BANK: + return offset + (second_bank_active ? size : 0); + case IXGBE_INACTIVE_FLASH_BANK: + return offset + (second_bank_active ? 0 : size); + } + + return 0; +} + +/** + * ixgbe_read_flash_module - Read a word from one of the main NVM modules + * @hw: pointer to the HW structure + * @bank: which bank of the module to read + * @module: the module to read + * @offset: the offset into the module in bytes + * @data: storage for the word read from the flash + * @length: bytes of data to read + * + * Read data from the specified flash module. The bank parameter indicates + * whether or not to read from the active bank or the inactive bank of that + * module. + * + * The word will be read using flat NVM access, and relies on the + * hw->flash.banks data being setup by ixgbe_determine_active_flash_banks() + * during initialization. + * + * Return: the exit code of the operation. + */ +static int ixgbe_read_flash_module(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u16 module, u32 offset, u8 *data, u32 length) +{ + u32 start; + int err; + + start = ixgbe_get_flash_bank_offset(hw, bank, module); + if (!start) + return -EINVAL; + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); + if (err) + return err; + + err = ixgbe_read_flat_nvm(hw, start + offset, &length, data, false); + + ixgbe_release_nvm(hw); + + return err; +} + +/** + * ixgbe_read_nvm_module - Read from the active main NVM module + * @hw: pointer to the HW structure + * @bank: whether to read from active or inactive NVM module + * @offset: offset into the NVM module to read, in words + * @data: storage for returned word value + * + * Read the specified word from the active NVM module. This includes the CSS + * header at the start of the NVM module. + * + * Return: the exit code of the operation. + */ +static int ixgbe_read_nvm_module(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u32 offset, u16 *data) +{ + __le16 data_local; + int err; + + err = ixgbe_read_flash_module(hw, bank, IXGBE_E610_SR_1ST_NVM_BANK_PTR, + offset * sizeof(data_local), + (u8 *)&data_local, + sizeof(data_local)); + if (!err) + *data = le16_to_cpu(data_local); + + return err; +} + +/** + * ixgbe_read_netlist_module - Read data from the netlist module area + * @hw: pointer to the HW structure + * @bank: whether to read from the active or inactive module + * @offset: offset into the netlist to read from + * @data: storage for returned word value + * + * Read a word from the specified netlist bank. + * + * Return: the exit code of the operation. + */ +static int ixgbe_read_netlist_module(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u32 offset, u16 *data) +{ + __le16 data_local; + int err; + + err = ixgbe_read_flash_module(hw, bank, IXGBE_E610_SR_NETLIST_BANK_PTR, + offset * sizeof(data_local), + (u8 *)&data_local, sizeof(data_local)); + if (!err) + *data = le16_to_cpu(data_local); + + return err; +} + +/** + * ixgbe_read_orom_module - Read from the active Option ROM module + * @hw: pointer to the HW structure + * @bank: whether to read from active or inactive OROM module + * @offset: offset into the OROM module to read, in words + * @data: storage for returned word value + * + * Read the specified word from the active Option ROM module of the flash. + * Note that unlike the NVM module, the CSS data is stored at the end of the + * module instead of at the beginning. + * + * Return: the exit code of the operation. + */ +static int ixgbe_read_orom_module(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u32 offset, u16 *data) +{ + __le16 data_local; + int err; + + err = ixgbe_read_flash_module(hw, bank, IXGBE_E610_SR_1ST_OROM_BANK_PTR, + offset * sizeof(data_local), + (u8 *)&data_local, sizeof(data_local)); + if (!err) + *data = le16_to_cpu(data_local); + + return err; +} + +/** + * ixgbe_get_nvm_css_hdr_len - Read the CSS header length + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @hdr_len: storage for header length in words + * + * Read the CSS header length from the NVM CSS header and add the + * Authentication header size, and then convert to words. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_nvm_css_hdr_len(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u32 *hdr_len) +{ + u16 hdr_len_l, hdr_len_h; + u32 hdr_len_dword; + int err; + + err = ixgbe_read_nvm_module(hw, bank, IXGBE_NVM_CSS_HDR_LEN_L, + &hdr_len_l); + if (err) + return err; + + err = ixgbe_read_nvm_module(hw, bank, IXGBE_NVM_CSS_HDR_LEN_H, + &hdr_len_h); + if (err) + return err; + + /* CSS header length is in DWORD, so convert to words and add + * authentication header size. + */ + hdr_len_dword = (hdr_len_h << 16) | hdr_len_l; + *hdr_len = hdr_len_dword * 2 + IXGBE_NVM_AUTH_HEADER_LEN; + + return 0; +} + +/** + * ixgbe_read_nvm_sr_copy - Read a word from the Shadow RAM copy + * @hw: pointer to the HW structure + * @bank: whether to read from the active or inactive NVM module + * @offset: offset into the Shadow RAM copy to read, in words + * @data: storage for returned word value + * + * Read the specified word from the copy of the Shadow RAM found in the + * specified NVM module. + * + * Return: the exit code of the operation. + */ +static int ixgbe_read_nvm_sr_copy(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u32 offset, u16 *data) +{ + u32 hdr_len; + int err; + + err = ixgbe_get_nvm_css_hdr_len(hw, bank, &hdr_len); + if (err) + return err; + + hdr_len = round_up(hdr_len, IXGBE_HDR_LEN_ROUNDUP); + + return ixgbe_read_nvm_module(hw, bank, hdr_len + offset, data); +} + +/** + * ixgbe_get_nvm_srev - Read the security revision from the NVM CSS header + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @srev: storage for security revision + * + * Read the security revision out of the CSS header of the active NVM module + * bank. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_nvm_srev(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, u32 *srev) +{ + u16 srev_l, srev_h; + int err; + + err = ixgbe_read_nvm_module(hw, bank, IXGBE_NVM_CSS_SREV_L, &srev_l); + if (err) + return err; + + err = ixgbe_read_nvm_module(hw, bank, IXGBE_NVM_CSS_SREV_H, &srev_h); + if (err) + return err; + + *srev = (srev_h << 16) | srev_l; + + return 0; +} + +/** + * ixgbe_get_orom_civd_data - Get the combo version information from Option ROM + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash module + * @civd: storage for the Option ROM CIVD data. + * + * Searches through the Option ROM flash contents to locate the CIVD data for + * the image. + * + * Return: the exit code of the operation. + */ +static int +ixgbe_get_orom_civd_data(struct ixgbe_hw *hw, enum ixgbe_bank_select bank, + struct ixgbe_orom_civd_info *civd) +{ + struct ixgbe_orom_civd_info tmp; + u32 offset; + int err; + + /* The CIVD section is located in the Option ROM aligned to 512 bytes. + * The first 4 bytes must contain the ASCII characters "$CIV". + * A simple modulo 256 sum of all of the bytes of the structure must + * equal 0. + */ + for (offset = 0; (offset + SZ_512) <= hw->flash.banks.orom_size; + offset += SZ_512) { + u8 sum = 0; + u32 i; + + err = ixgbe_read_flash_module(hw, bank, + IXGBE_E610_SR_1ST_OROM_BANK_PTR, + offset, + (u8 *)&tmp, sizeof(tmp)); + if (err) + return err; + + /* Skip forward until we find a matching signature */ + if (memcmp(IXGBE_OROM_CIV_SIGNATURE, tmp.signature, + sizeof(tmp.signature))) + continue; + + /* Verify that the simple checksum is zero */ + for (i = 0; i < sizeof(tmp); i++) + sum += ((u8 *)&tmp)[i]; + + if (sum) + return -EDOM; + + *civd = tmp; + return 0; + } + + return -ENODATA; +} + +/** + * ixgbe_get_orom_srev - Read the security revision from the OROM CSS header + * @hw: pointer to the HW struct + * @bank: whether to read from active or inactive flash module + * @srev: storage for security revision + * + * Read the security revision out of the CSS header of the active OROM module + * bank. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_orom_srev(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u32 *srev) +{ + u32 orom_size_word = hw->flash.banks.orom_size / 2; + u32 css_start, hdr_len; + u16 srev_l, srev_h; + int err; + + err = ixgbe_get_nvm_css_hdr_len(hw, bank, &hdr_len); + if (err) + return err; + + if (orom_size_word < hdr_len) + return -EINVAL; + + /* Calculate how far into the Option ROM the CSS header starts. Note + * that ixgbe_read_orom_module takes a word offset. + */ + css_start = orom_size_word - hdr_len; + err = ixgbe_read_orom_module(hw, bank, + css_start + IXGBE_NVM_CSS_SREV_L, + &srev_l); + if (err) + return err; + + err = ixgbe_read_orom_module(hw, bank, + css_start + IXGBE_NVM_CSS_SREV_H, + &srev_h); + if (err) + return err; + + *srev = srev_h << 16 | srev_l; + + return 0; +} + +/** + * ixgbe_get_orom_ver_info - Read Option ROM version information + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash module + * @orom: pointer to Option ROM info structure + * + * Read Option ROM version and security revision from the Option ROM flash + * section. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_orom_ver_info(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + struct ixgbe_orom_info *orom) +{ + struct ixgbe_orom_civd_info civd; + u32 combo_ver; + int err; + + err = ixgbe_get_orom_civd_data(hw, bank, &civd); + if (err) + return err; + + combo_ver = le32_to_cpu(civd.combo_ver); + + orom->major = (u8)FIELD_GET(IXGBE_OROM_VER_MASK, combo_ver); + orom->patch = (u8)FIELD_GET(IXGBE_OROM_VER_PATCH_MASK, combo_ver); + orom->build = (u16)FIELD_GET(IXGBE_OROM_VER_BUILD_MASK, combo_ver); + + return ixgbe_get_orom_srev(hw, bank, &orom->srev); +} + +/** + * ixgbe_get_inactive_orom_ver - Read Option ROM version from the inactive bank + * @hw: pointer to the HW structure + * @orom: storage for Option ROM version information + * + * Read the Option ROM version and security revision data for the inactive + * section of flash. Used to access version data for a pending update that has + * not yet been activated. + * + * Return: the exit code of the operation. + */ +int ixgbe_get_inactive_orom_ver(struct ixgbe_hw *hw, + struct ixgbe_orom_info *orom) +{ + return ixgbe_get_orom_ver_info(hw, IXGBE_INACTIVE_FLASH_BANK, orom); +} + +/** + * ixgbe_get_nvm_ver_info - Read NVM version information + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @nvm: pointer to NVM info structure + * + * Read the NVM EETRACK ID and map version of the main NVM image bank, filling + * in the nvm info structure. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_nvm_ver_info(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + struct ixgbe_nvm_info *nvm) +{ + u16 eetrack_lo, eetrack_hi, ver; + int err; + + err = ixgbe_read_nvm_sr_copy(hw, bank, + IXGBE_E610_SR_NVM_DEV_STARTER_VER, &ver); + if (err) + return err; + + nvm->major = FIELD_GET(IXGBE_E610_NVM_VER_HI_MASK, ver); + nvm->minor = FIELD_GET(IXGBE_E610_NVM_VER_LO_MASK, ver); + + err = ixgbe_read_nvm_sr_copy(hw, bank, IXGBE_E610_SR_NVM_EETRACK_LO, + &eetrack_lo); + if (err) + return err; + + err = ixgbe_read_nvm_sr_copy(hw, bank, IXGBE_E610_SR_NVM_EETRACK_HI, + &eetrack_hi); + if (err) + return err; + + nvm->eetrack = (eetrack_hi << 16) | eetrack_lo; + + ixgbe_get_nvm_srev(hw, bank, &nvm->srev); + + return 0; +} + +/** + * ixgbe_get_inactive_nvm_ver - Read Option ROM version from the inactive bank + * @hw: pointer to the HW structure + * @nvm: storage for Option ROM version information + * + * Read the NVM EETRACK ID, Map version, and security revision of the + * inactive NVM bank. Used to access version data for a pending update that + * has not yet been activated. + * + * Return: the exit code of the operation. + */ +int ixgbe_get_inactive_nvm_ver(struct ixgbe_hw *hw, struct ixgbe_nvm_info *nvm) +{ + return ixgbe_get_nvm_ver_info(hw, IXGBE_INACTIVE_FLASH_BANK, nvm); +} + +/** + * ixgbe_get_active_nvm_ver - Read Option ROM version from the active bank + * @hw: pointer to the HW structure + * @nvm: storage for Option ROM version information + * + * Reads the NVM EETRACK ID, Map version, and security revision of the + * active NVM bank. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_active_nvm_ver(struct ixgbe_hw *hw, + struct ixgbe_nvm_info *nvm) +{ + return ixgbe_get_nvm_ver_info(hw, IXGBE_ACTIVE_FLASH_BANK, nvm); +} + +/** + * ixgbe_get_netlist_info - Read the netlist version information + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @netlist: pointer to netlist version info structure + * + * Get the netlist version information from the requested bank. Reads the Link + * Topology section to find the Netlist ID block and extract the relevant + * information into the netlist version structure. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_netlist_info(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + struct ixgbe_netlist_info *netlist) +{ + u16 module_id, length, node_count, i; + u16 *id_blk; + int err; + + err = ixgbe_read_netlist_module(hw, bank, IXGBE_NETLIST_TYPE_OFFSET, + &module_id); + if (err) + return err; + + if (module_id != IXGBE_NETLIST_LINK_TOPO_MOD_ID) + return -EIO; + + err = ixgbe_read_netlist_module(hw, bank, IXGBE_LINK_TOPO_MODULE_LEN, + &length); + if (err) + return err; + + /* Sanity check that we have at least enough words to store the + * netlist ID block. + */ + if (length < IXGBE_NETLIST_ID_BLK_SIZE) + return -EIO; + + err = ixgbe_read_netlist_module(hw, bank, IXGBE_LINK_TOPO_NODE_COUNT, + &node_count); + if (err) + return err; + + node_count &= IXGBE_LINK_TOPO_NODE_COUNT_M; + + id_blk = kcalloc(IXGBE_NETLIST_ID_BLK_SIZE, sizeof(*id_blk), GFP_KERNEL); + if (!id_blk) + return -ENOMEM; + + /* Read out the entire Netlist ID Block at once. */ + err = ixgbe_read_flash_module(hw, bank, IXGBE_E610_SR_NETLIST_BANK_PTR, + IXGBE_NETLIST_ID_BLK_OFFSET(node_count) * + sizeof(*id_blk), (u8 *)id_blk, + IXGBE_NETLIST_ID_BLK_SIZE * + sizeof(*id_blk)); + if (err) + goto free_id_blk; + + for (i = 0; i < IXGBE_NETLIST_ID_BLK_SIZE; i++) + id_blk[i] = le16_to_cpu(((__le16 *)id_blk)[i]); + + netlist->major = id_blk[IXGBE_NETLIST_ID_BLK_MAJOR_VER_HIGH] << 16 | + id_blk[IXGBE_NETLIST_ID_BLK_MAJOR_VER_LOW]; + netlist->minor = id_blk[IXGBE_NETLIST_ID_BLK_MINOR_VER_HIGH] << 16 | + id_blk[IXGBE_NETLIST_ID_BLK_MINOR_VER_LOW]; + netlist->type = id_blk[IXGBE_NETLIST_ID_BLK_TYPE_HIGH] << 16 | + id_blk[IXGBE_NETLIST_ID_BLK_TYPE_LOW]; + netlist->rev = id_blk[IXGBE_NETLIST_ID_BLK_REV_HIGH] << 16 | + id_blk[IXGBE_NETLIST_ID_BLK_REV_LOW]; + netlist->cust_ver = id_blk[IXGBE_NETLIST_ID_BLK_CUST_VER]; + /* Read the left most 4 bytes of SHA */ + netlist->hash = id_blk[IXGBE_NETLIST_ID_BLK_SHA_HASH_WORD(15)] << 16 | + id_blk[IXGBE_NETLIST_ID_BLK_SHA_HASH_WORD(14)]; + +free_id_blk: + kfree(id_blk); + return err; +} + +/** + * ixgbe_get_inactive_netlist_ver - Read netlist version from the inactive bank + * @hw: pointer to the HW struct + * @netlist: pointer to netlist version info structure + * + * Read the netlist version data from the inactive netlist bank. Used to + * extract version data of a pending flash update in order to display the + * version data. + * + * Return: the exit code of the operation. + */ +int ixgbe_get_inactive_netlist_ver(struct ixgbe_hw *hw, + struct ixgbe_netlist_info *netlist) +{ + return ixgbe_get_netlist_info(hw, IXGBE_INACTIVE_FLASH_BANK, netlist); +} + +/** + * ixgbe_get_flash_data - get flash data + * @hw: pointer to the HW struct + * + * Read and populate flash data such as Shadow RAM size, + * max_timeout and blank_nvm_mode + * + * Return: the exit code of the operation. + */ +int ixgbe_get_flash_data(struct ixgbe_hw *hw) +{ + struct ixgbe_flash_info *flash = &hw->flash; + u32 fla, gens_stat; + u8 sr_size; + int err; + + /* The SR size is stored regardless of the NVM programming mode + * as the blank mode may be used in the factory line. + */ + gens_stat = IXGBE_READ_REG(hw, GLNVM_GENS); + sr_size = FIELD_GET(GLNVM_GENS_SR_SIZE_M, gens_stat); + + /* Switching to words (sr_size contains power of 2) */ + flash->sr_words = BIT(sr_size) * (SZ_1K / sizeof(u16)); + + /* Check if we are in the normal or blank NVM programming mode */ + fla = IXGBE_READ_REG(hw, IXGBE_GLNVM_FLA); + if (fla & IXGBE_GLNVM_FLA_LOCKED_M) { + flash->blank_nvm_mode = false; + } else { + flash->blank_nvm_mode = true; + return -EIO; + } + + err = ixgbe_discover_flash_size(hw); + if (err) + return err; + + err = ixgbe_determine_active_flash_banks(hw); + if (err) + return err; + + err = ixgbe_get_nvm_ver_info(hw, IXGBE_ACTIVE_FLASH_BANK, + &flash->nvm); + if (err) + return err; + + err = ixgbe_get_orom_ver_info(hw, IXGBE_ACTIVE_FLASH_BANK, + &flash->orom); + if (err) + return err; + + err = ixgbe_get_netlist_info(hw, IXGBE_ACTIVE_FLASH_BANK, + &flash->netlist); + return err; +} + +/** + * ixgbe_aci_nvm_update_empr - update NVM using EMPR + * @hw: pointer to the HW struct + * + * Force EMP reset using ACI command (0x0709). This command allows SW to + * request an EMPR to activate new FW. + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_nvm_update_empr(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_desc desc; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_update_empr); + + return ixgbe_aci_send_cmd(hw, &desc, NULL, 0); +} + +/* ixgbe_nvm_set_pkg_data - NVM set package data + * @hw: pointer to the HW struct + * @del_pkg_data_flag: If is set then the current pkg_data store by FW + * is deleted. + * If bit is set to 1, then buffer should be size 0. + * @data: pointer to buffer + * @length: length of the buffer + * + * Set package data using ACI command (0x070A). + * This command is equivalent to the reception of + * a PLDM FW Update GetPackageData cmd. This command should be sent + * as part of the NVM update as the first cmd in the flow. + * + * Return: the exit code of the operation. + */ +int ixgbe_nvm_set_pkg_data(struct ixgbe_hw *hw, bool del_pkg_data_flag, + u8 *data, u16 length) +{ + struct ixgbe_aci_cmd_nvm_pkg_data *cmd; + struct ixgbe_aci_desc desc; + + if (length != 0 && !data) + return -EINVAL; + + cmd = &desc.params.pkg_data; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_pkg_data); + desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD); + + if (del_pkg_data_flag) + cmd->cmd_flags |= IXGBE_ACI_NVM_PKG_DELETE; + + return ixgbe_aci_send_cmd(hw, &desc, data, length); +} + +/* ixgbe_nvm_pass_component_tbl - NVM pass component table + * @hw: pointer to the HW struct + * @data: pointer to buffer + * @length: length of the buffer + * @transfer_flag: parameter for determining stage of the update + * @comp_response: a pointer to the response from the 0x070B ACI. + * @comp_response_code: a pointer to the response code from the 0x070B ACI. + * + * Pass component table using ACI command (0x070B). This command is equivalent + * to the reception of a PLDM FW Update PassComponentTable cmd. + * This command should be sent once per component. It can be only sent after + * Set Package Data cmd and before actual update. FW will assume these + * commands are going to be sent until the TransferFlag is set to End or + * StartAndEnd. + * + * Return: the exit code of the operation. + */ +int ixgbe_nvm_pass_component_tbl(struct ixgbe_hw *hw, u8 *data, u16 length, + u8 transfer_flag, u8 *comp_response, + u8 *comp_response_code) +{ + struct ixgbe_aci_cmd_nvm_pass_comp_tbl *cmd; + struct ixgbe_aci_desc desc; + int err; + + if (!data || !comp_response || !comp_response_code) + return -EINVAL; + + cmd = &desc.params.pass_comp_tbl; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, + ixgbe_aci_opc_nvm_pass_component_tbl); + desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD); + + cmd->transfer_flag = transfer_flag; + err = ixgbe_aci_send_cmd(hw, &desc, data, length); + if (!err) { + *comp_response = cmd->component_response; + *comp_response_code = cmd->component_response_code; + } + + return err; +} + +/** + * ixgbe_read_sr_word_aci - Reads Shadow RAM via ACI + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF) + * @data: word read from the Shadow RAM + * + * Reads one 16 bit word from the Shadow RAM using ixgbe_read_flat_nvm. + * + * Return: the exit code of the operation. + */ +int ixgbe_read_sr_word_aci(struct ixgbe_hw *hw, u16 offset, u16 *data) +{ + u32 bytes = sizeof(u16); + u16 data_local; + int err; + + err = ixgbe_read_flat_nvm(hw, offset * sizeof(u16), &bytes, + (u8 *)&data_local, true); + if (err) + return err; + + *data = data_local; + return 0; +} + +/** + * ixgbe_read_flat_nvm - Read portion of NVM by flat offset + * @hw: pointer to the HW struct + * @offset: offset from beginning of NVM + * @length: (in) number of bytes to read; (out) number of bytes actually read + * @data: buffer to return data in (sized to fit the specified length) + * @read_shadow_ram: if true, read from shadow RAM instead of NVM + * + * Reads a portion of the NVM, as a flat memory space. This function correctly + * breaks read requests across Shadow RAM sectors, prevents Shadow RAM size + * from being exceeded in case of Shadow RAM read requests and ensures that no + * single read request exceeds the maximum 4KB read for a single admin command. + * + * Returns an error code on failure. Note that the data pointer may be + * partially updated if some reads succeed before a failure. + * + * Return: the exit code of the operation. + */ +int ixgbe_read_flat_nvm(struct ixgbe_hw *hw, u32 offset, u32 *length, + u8 *data, bool read_shadow_ram) +{ + u32 inlen = *length; + u32 bytes_read = 0; + bool last_cmd; + int err; + + /* Verify the length of the read if this is for the Shadow RAM */ + if (read_shadow_ram && ((offset + inlen) > + (hw->eeprom.word_size * 2u))) + return -EINVAL; + + do { + u32 read_size, sector_offset; + + /* ixgbe_aci_read_nvm cannot read more than 4KB at a time. + * Additionally, a read from the Shadow RAM may not cross over + * a sector boundary. Conveniently, the sector size is also 4KB. + */ + sector_offset = offset % IXGBE_ACI_MAX_BUFFER_SIZE; + read_size = min_t(u32, + IXGBE_ACI_MAX_BUFFER_SIZE - sector_offset, + inlen - bytes_read); + + last_cmd = !(bytes_read + read_size < inlen); + + /* ixgbe_aci_read_nvm takes the length as a u16. Our read_size + * is calculated using a u32, but the IXGBE_ACI_MAX_BUFFER_SIZE + * maximum size guarantees that it will fit within the 2 bytes. + */ + err = ixgbe_aci_read_nvm(hw, IXGBE_ACI_NVM_START_POINT, + offset, (u16)read_size, + data + bytes_read, last_cmd, + read_shadow_ram); + if (err) + break; + + bytes_read += read_size; + offset += read_size; + } while (!last_cmd); + + *length = bytes_read; + return err; +} + +/** + * ixgbe_read_sr_buf_aci - Read Shadow RAM buffer via ACI + * @hw: pointer to the HW structure + * @offset: offset of the Shadow RAM words to read (0x000000 - 0x001FFF) + * @words: (in) number of words to read; (out) number of words actually read + * @data: words read from the Shadow RAM + * + * Read 16 bit words (data buf) from the Shadow RAM. Acquire/release the NVM + * ownership. + * + * Return: the operation exit code. + */ +int ixgbe_read_sr_buf_aci(struct ixgbe_hw *hw, u16 offset, u16 *words, + u16 *data) +{ + u32 bytes = *words * 2; + int err; + + err = ixgbe_read_flat_nvm(hw, offset * 2, &bytes, (u8 *)data, true); + if (err) + return err; + + *words = bytes / 2; + + for (int i = 0; i < *words; i++) + data[i] = le16_to_cpu(((__le16 *)data)[i]); + + return 0; +} + +/** + * ixgbe_read_ee_aci_e610 - Read EEPROM word using the admin command. + * @hw: pointer to hardware structure + * @offset: offset of word in the EEPROM to read + * @data: word read from the EEPROM + * + * Reads a 16 bit word from the EEPROM using the ACI. + * If the EEPROM params are not initialized, the function + * initialize them before proceeding with reading. + * The function acquires and then releases the NVM ownership. + * + * Return: the exit code of the operation. + */ +int ixgbe_read_ee_aci_e610(struct ixgbe_hw *hw, u16 offset, u16 *data) +{ + int err; + + if (hw->eeprom.type == ixgbe_eeprom_uninitialized) { + err = hw->eeprom.ops.init_params(hw); + if (err) + return err; + } + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); + if (err) + return err; + + err = ixgbe_read_sr_word_aci(hw, offset, data); + ixgbe_release_nvm(hw); + + return err; +} + +/** + * ixgbe_read_ee_aci_buffer_e610 - Read EEPROM words via ACI + * @hw: pointer to hardware structure * @offset: offset of words in the EEPROM to read * @words: number of words to read * @data: words to read from the EEPROM @@ -2485,7 +3665,7 @@ int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val) if (err) return err; - err = ixgbe_read_sr_word_aci(hw, E610_SR_SW_CHECKSUM_WORD, + err = ixgbe_read_sr_word_aci(hw, IXGBE_E610_SR_SW_CHECKSUM_WORD, &tmp_checksum); ixgbe_release_nvm(hw); @@ -2580,9 +3760,129 @@ int ixgbe_reset_hw_e610(struct ixgbe_hw *hw) return err; } +/** + * ixgbe_get_pfa_module_tlv - Read sub module TLV from NVM PFA + * @hw: pointer to hardware structure + * @module_tlv: pointer to module TLV to return + * @module_tlv_len: pointer to module TLV length to return + * @module_type: module type requested + * + * Find the requested sub module TLV type from the Preserved Field + * Area (PFA) and returns the TLV pointer and length. The caller can + * use these to read the variable length TLV value. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_pfa_module_tlv(struct ixgbe_hw *hw, u16 *module_tlv, + u16 *module_tlv_len, u16 module_type) +{ + u16 pfa_len, pfa_ptr, pfa_end_ptr; + u16 next_tlv; + int err; + + err = ixgbe_read_ee_aci_e610(hw, IXGBE_E610_SR_PFA_PTR, &pfa_ptr); + if (err) + return err; + + err = ixgbe_read_ee_aci_e610(hw, pfa_ptr, &pfa_len); + if (err) + return err; + + /* Starting with first TLV after PFA length, iterate through the list + * of TLVs to find the requested one. + */ + next_tlv = pfa_ptr + 1; + pfa_end_ptr = pfa_ptr + pfa_len; + while (next_tlv < pfa_end_ptr) { + u16 tlv_sub_module_type, tlv_len; + + /* Read TLV type */ + err = ixgbe_read_ee_aci_e610(hw, next_tlv, + &tlv_sub_module_type); + if (err) + break; + + /* Read TLV length */ + err = ixgbe_read_ee_aci_e610(hw, next_tlv + 1, &tlv_len); + if (err) + break; + + if (tlv_sub_module_type == module_type) { + if (tlv_len) { + *module_tlv = next_tlv; + *module_tlv_len = tlv_len; + return 0; + } + return -EIO; + } + /* Check next TLV, i.e. current TLV pointer + length + 2 words + * (for current TLV's type and length). + */ + next_tlv = next_tlv + tlv_len + 2; + } + /* Module does not exist */ + return -ENODATA; +} + +/** + * ixgbe_read_pba_string_e610 - Read PBA string from NVM + * @hw: pointer to hardware structure + * @pba_num: stores the part number string from the NVM + * @pba_num_size: part number string buffer length + * + * Read the part number string from the NVM. + * + * Return: the exit code of the operation. + */ +static int ixgbe_read_pba_string_e610(struct ixgbe_hw *hw, u8 *pba_num, + u32 pba_num_size) +{ + u16 pba_tlv, pba_tlv_len; + u16 pba_word, pba_size; + int err; + + *pba_num = '\0'; + + err = ixgbe_get_pfa_module_tlv(hw, &pba_tlv, &pba_tlv_len, + IXGBE_E610_SR_PBA_BLOCK_PTR); + if (err) + return err; + + /* pba_size is the next word */ + err = ixgbe_read_ee_aci_e610(hw, (pba_tlv + 2), &pba_size); + if (err) + return err; + + if (pba_tlv_len < pba_size) + return -EINVAL; + + /* Subtract one to get PBA word count (PBA Size word is included in + * total size). + */ + pba_size--; + + if (pba_num_size < (((u32)pba_size * 2) + 1)) + return -EINVAL; + + for (u16 i = 0; i < pba_size; i++) { + err = ixgbe_read_ee_aci_e610(hw, (pba_tlv + 2 + 1) + i, + &pba_word); + if (err) + return err; + + pba_num[(i * 2)] = FIELD_GET(IXGBE_E610_SR_PBA_BLOCK_MASK, + pba_word); + pba_num[(i * 2) + 1] = pba_word & 0xFF; + } + + pba_num[(pba_size * 2)] = '\0'; + + return err; +} + static const struct ixgbe_mac_operations mac_ops_e610 = { .init_hw = ixgbe_init_hw_generic, - .start_hw = ixgbe_start_hw_X540, + .start_hw = ixgbe_start_hw_e610, .clear_hw_cntrs = ixgbe_clear_hw_cntrs_generic, .enable_rx_dma = ixgbe_enable_rx_dma_generic, .get_mac_addr = ixgbe_get_mac_addr_generic, @@ -2621,8 +3921,12 @@ static const struct ixgbe_mac_operations mac_ops_e610 = { .led_off = ixgbe_led_off_generic, .init_led_link_act = ixgbe_init_led_link_act_generic, .reset_hw = ixgbe_reset_hw_e610, + .get_fw_ver = ixgbe_aci_get_fw_ver, .get_media_type = ixgbe_get_media_type_e610, .setup_link = ixgbe_setup_link_e610, + .fw_recovery_mode = ixgbe_fw_recovery_mode_e610, + .fw_rollback_mode = ixgbe_fw_rollback_mode_e610, + .get_nvm_ver = ixgbe_get_active_nvm_ver, .get_link_capabilities = ixgbe_get_link_capabilities_e610, .get_bus_info = ixgbe_get_bus_info_generic, .acquire_swfw_sync = ixgbe_acquire_swfw_sync_X540, @@ -2647,6 +3951,8 @@ static const struct ixgbe_eeprom_operations eeprom_ops_e610 = { .read = ixgbe_read_ee_aci_e610, .read_buffer = ixgbe_read_ee_aci_buffer_e610, .validate_checksum = ixgbe_validate_eeprom_checksum_e610, + .read_pba_string = ixgbe_read_pba_string_e610, + .init_params = ixgbe_init_eeprom_params_e610, }; const struct ixgbe_info ixgbe_e610_info = { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index ba8c06b738105..30bc1f1b2549b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -67,6 +67,11 @@ int ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 module_typeid, u32 offset, u16 length, void *data, bool last_command, bool read_shadow_ram); int ixgbe_nvm_validate_checksum(struct ixgbe_hw *hw); +int ixgbe_get_inactive_orom_ver(struct ixgbe_hw *hw, + struct ixgbe_orom_info *orom); +int ixgbe_get_inactive_nvm_ver(struct ixgbe_hw *hw, struct ixgbe_nvm_info *nvm); +int ixgbe_get_inactive_netlist_ver(struct ixgbe_hw *hw, + struct ixgbe_netlist_info *netlist); int ixgbe_read_sr_word_aci(struct ixgbe_hw *hw, u16 offset, u16 *data); int ixgbe_read_flat_nvm(struct ixgbe_hw *hw, u32 offset, u32 *length, u8 *data, bool read_shadow_ram); @@ -77,5 +82,18 @@ int ixgbe_read_ee_aci_buffer_e610(struct ixgbe_hw *hw, u16 offset, u16 words, u16 *data); int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val); int ixgbe_reset_hw_e610(struct ixgbe_hw *hw); +int ixgbe_get_flash_data(struct ixgbe_hw *hw); +int ixgbe_aci_nvm_update_empr(struct ixgbe_hw *hw); +int ixgbe_nvm_set_pkg_data(struct ixgbe_hw *hw, bool del_pkg_data_flag, + u8 *data, u16 length); +int ixgbe_nvm_pass_component_tbl(struct ixgbe_hw *hw, u8 *data, u16 length, + u8 transfer_flag, u8 *comp_response, + u8 *comp_response_code); +int ixgbe_aci_erase_nvm(struct ixgbe_hw *hw, u16 module_typeid); +int ixgbe_aci_update_nvm(struct ixgbe_hw *hw, u16 module_typeid, + u32 offset, u16 length, void *data, + bool last_command, u8 command_flags); +int ixgbe_nvm_write_activate(struct ixgbe_hw *hw, u16 cmd_flags, + u8 *response_flags); #endif /* _IXGBE_E610_H_ */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index f03925c1f521a..c86103eccc8aa 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -213,7 +213,7 @@ static void ixgbe_set_advertising_10gtypes(struct ixgbe_hw *hw, static int ixgbe_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; ixgbe_link_speed supported_link; bool autoneg = false; @@ -458,7 +458,7 @@ static int ixgbe_get_link_ksettings(struct net_device *netdev, static int ixgbe_set_link_ksettings(struct net_device *netdev, const struct ethtool_link_ksettings *cmd) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; u32 advertised, old; int err = 0; @@ -535,7 +535,7 @@ static int ixgbe_set_link_ksettings(struct net_device *netdev, static void ixgbe_get_pause_stats(struct net_device *netdev, struct ethtool_pause_stats *stats) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw_stats *hwstats = &adapter->stats; stats->tx_pause_frames = hwstats->lxontxc + hwstats->lxofftxc; @@ -545,7 +545,7 @@ static void ixgbe_get_pause_stats(struct net_device *netdev, static void ixgbe_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; if (ixgbe_device_supports_autoneg_fc(hw) && @@ -567,7 +567,7 @@ static void ixgbe_get_pauseparam(struct net_device *netdev, static int ixgbe_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; struct ixgbe_fc_info fc = hw->fc; @@ -606,13 +606,13 @@ static int ixgbe_set_pauseparam(struct net_device *netdev, static u32 ixgbe_get_msglevel(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); return adapter->msg_enable; } static void ixgbe_set_msglevel(struct net_device *netdev, u32 data) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); adapter->msg_enable = data; } @@ -627,7 +627,7 @@ static int ixgbe_get_regs_len(struct net_device *netdev) static void ixgbe_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *p) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; u32 *regs_buff = p; u8 i; @@ -994,14 +994,14 @@ static void ixgbe_get_regs(struct net_device *netdev, static int ixgbe_get_eeprom_len(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); return adapter->hw.eeprom.word_size * 2; } static int ixgbe_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom, u8 *bytes) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; u16 *eeprom_buff; int first_word, last_word, eeprom_len; @@ -1037,7 +1037,7 @@ static int ixgbe_get_eeprom(struct net_device *netdev, static int ixgbe_set_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom, u8 *bytes) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; u16 *eeprom_buff; void *ptr; @@ -1104,10 +1104,22 @@ static int ixgbe_set_eeprom(struct net_device *netdev, return ret_val; } +void ixgbe_refresh_fw_version(struct ixgbe_adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + + ixgbe_get_flash_data(hw); + ixgbe_set_fw_version_e610(adapter); +} + static void ixgbe_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); + + /* need to refresh info for e610 in case fw reloads in runtime */ + if (adapter->hw.mac.type == ixgbe_mac_e610) + ixgbe_refresh_fw_version(adapter); strscpy(drvinfo->driver, ixgbe_driver_name, sizeof(drvinfo->driver)); @@ -1161,7 +1173,7 @@ static void ixgbe_get_ringparam(struct net_device *netdev, struct kernel_ethtool_ringparam *kernel_ring, struct netlink_ext_ack *extack) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_ring *tx_ring = adapter->tx_ring[0]; struct ixgbe_ring *rx_ring = adapter->rx_ring[0]; @@ -1176,7 +1188,7 @@ static int ixgbe_set_ringparam(struct net_device *netdev, struct kernel_ethtool_ringparam *kernel_ring, struct netlink_ext_ack *extack) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_ring *temp_ring; int i, j, err = 0; u32 new_rx_count, new_tx_count; @@ -1336,7 +1348,7 @@ static int ixgbe_get_sset_count(struct net_device *netdev, int sset) static void ixgbe_get_ethtool_stats(struct net_device *netdev, struct ethtool_stats *stats, u64 *data) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *net_stats; unsigned int start; @@ -1710,7 +1722,7 @@ static int ixgbe_eeprom_test(struct ixgbe_adapter *adapter, u64 *data) static irqreturn_t ixgbe_test_intr(int irq, void *data) { struct net_device *netdev = (struct net_device *) data; - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); adapter->test_icr |= IXGBE_READ_REG(&adapter->hw, IXGBE_EICR); @@ -2183,7 +2195,7 @@ static int ixgbe_loopback_test(struct ixgbe_adapter *adapter, u64 *data) static void ixgbe_diag_test(struct net_device *netdev, struct ethtool_test *eth_test, u64 *data) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); bool if_running = netif_running(netdev); if (ixgbe_removed(adapter->hw.hw_addr)) { @@ -2306,7 +2318,7 @@ static int ixgbe_wol_exclusion(struct ixgbe_adapter *adapter, static void ixgbe_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); wol->supported = WAKE_UCAST | WAKE_MCAST | WAKE_BCAST | WAKE_MAGIC; @@ -2328,7 +2340,7 @@ static void ixgbe_get_wol(struct net_device *netdev, static int ixgbe_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (wol->wolopts & (WAKE_PHY | WAKE_ARP | WAKE_MAGICSECURE | WAKE_FILTER)) @@ -2355,7 +2367,7 @@ static int ixgbe_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) static int ixgbe_nway_reset(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (netif_running(netdev)) ixgbe_reinit_locked(adapter); @@ -2366,7 +2378,7 @@ static int ixgbe_nway_reset(struct net_device *netdev) static int ixgbe_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; if (!hw->mac.ops.led_on || !hw->mac.ops.led_off) @@ -2399,7 +2411,7 @@ static int ixgbe_get_coalesce(struct net_device *netdev, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); /* only valid if in constant ITR mode */ if (adapter->rx_itr_setting <= 1) @@ -2455,7 +2467,7 @@ static int ixgbe_set_coalesce(struct net_device *netdev, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_q_vector *q_vector; int i; u16 tx_itr_param, rx_itr_param, tx_itr_prev; @@ -2681,7 +2693,7 @@ static int ixgbe_rss_indir_tbl_max(struct ixgbe_adapter *adapter) static int ixgbe_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, u32 *rule_locs) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); int ret = -EOPNOTSUPP; switch (cmd->cmd) { @@ -3069,7 +3081,7 @@ static int ixgbe_set_rss_hash_opt(struct ixgbe_adapter *adapter, static int ixgbe_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); int ret = -EOPNOTSUPP; switch (cmd->cmd) { @@ -3096,7 +3108,7 @@ static u32 ixgbe_get_rxfh_key_size(struct net_device *netdev) static u32 ixgbe_rss_indir_size(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); return ixgbe_rss_indir_tbl_entries(adapter); } @@ -3116,7 +3128,7 @@ static void ixgbe_get_reta(struct ixgbe_adapter *adapter, u32 *indir) static int ixgbe_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); rxfh->hfunc = ETH_RSS_HASH_TOP; @@ -3134,7 +3146,7 @@ static int ixgbe_set_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); int i; u32 reta_entries = ixgbe_rss_indir_tbl_entries(adapter); @@ -3176,7 +3188,7 @@ static int ixgbe_set_rxfh(struct net_device *netdev, static int ixgbe_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); /* we always support timestamping disabled */ info->rx_filters = BIT(HWTSTAMP_FILTER_NONE); @@ -3252,7 +3264,7 @@ static unsigned int ixgbe_max_channels(struct ixgbe_adapter *adapter) static void ixgbe_get_channels(struct net_device *dev, struct ethtool_channels *ch) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); /* report maximum channels */ ch->max_combined = ixgbe_max_channels(adapter); @@ -3289,7 +3301,7 @@ static void ixgbe_get_channels(struct net_device *dev, static int ixgbe_set_channels(struct net_device *dev, struct ethtool_channels *ch) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); unsigned int count = ch->combined_count; u8 max_rss_indices = ixgbe_max_rss_indices(adapter); @@ -3327,7 +3339,7 @@ static int ixgbe_set_channels(struct net_device *dev, static int ixgbe_get_module_info(struct net_device *dev, struct ethtool_modinfo *modinfo) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_hw *hw = &adapter->hw; u8 sff8472_rev, addr_mode; bool page_swap = false; @@ -3373,7 +3385,7 @@ static int ixgbe_get_module_eeprom(struct net_device *dev, struct ethtool_eeprom *ee, u8 *data) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_hw *hw = &adapter->hw; int status = -EFAULT; u8 databyte = 0xFF; @@ -3469,7 +3481,7 @@ ixgbe_get_eee_fw(struct ixgbe_adapter *adapter, struct ethtool_keee *edata) static int ixgbe_get_eee(struct net_device *netdev, struct ethtool_keee *edata) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; if (!(adapter->flags2 & IXGBE_FLAG2_EEE_CAPABLE)) @@ -3483,7 +3495,7 @@ static int ixgbe_get_eee(struct net_device *netdev, struct ethtool_keee *edata) static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_keee *edata) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; struct ethtool_keee eee_data; int ret_val; @@ -3538,7 +3550,7 @@ static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_keee *edata) static u32 ixgbe_get_priv_flags(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); u32 priv_flags = 0; if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY) @@ -3555,7 +3567,7 @@ static u32 ixgbe_get_priv_flags(struct net_device *netdev) static int ixgbe_set_priv_flags(struct net_device *netdev, u32 priv_flags) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); unsigned int flags2 = adapter->flags2; unsigned int i; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c index 955dced844a98..7dcf6ecd157b1 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c @@ -56,7 +56,7 @@ int ixgbe_fcoe_ddp_put(struct net_device *netdev, u16 xid) if (xid >= netdev->fcoe_ddp_xid) return 0; - adapter = netdev_priv(netdev); + adapter = ixgbe_from_netdev(netdev); fcoe = &adapter->fcoe; ddp = &fcoe->ddp[xid]; if (!ddp->udl) @@ -153,7 +153,7 @@ static int ixgbe_fcoe_ddp_setup(struct net_device *netdev, u16 xid, if (!netdev || !sgl) return 0; - adapter = netdev_priv(netdev); + adapter = ixgbe_from_netdev(netdev); if (xid >= netdev->fcoe_ddp_xid) { e_warn(drv, "xid=0x%x out-of-range\n", xid); return 0; @@ -834,7 +834,7 @@ static void ixgbe_fcoe_ddp_disable(struct ixgbe_adapter *adapter) */ int ixgbe_fcoe_enable(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_fcoe *fcoe = &adapter->fcoe; atomic_inc(&fcoe->refcnt); @@ -881,7 +881,7 @@ int ixgbe_fcoe_enable(struct net_device *netdev) */ int ixgbe_fcoe_disable(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (!atomic_dec_and_test(&adapter->fcoe.refcnt)) return -EINVAL; @@ -927,7 +927,7 @@ int ixgbe_fcoe_disable(struct net_device *netdev) int ixgbe_fcoe_get_wwn(struct net_device *netdev, u64 *wwn, int type) { u16 prefix = 0xffff; - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_mac_info *mac = &adapter->hw.mac; switch (type) { @@ -967,7 +967,7 @@ int ixgbe_fcoe_get_wwn(struct net_device *netdev, u64 *wwn, int type) int ixgbe_fcoe_get_hbainfo(struct net_device *netdev, struct netdev_fcoe_hbainfo *info) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; u64 dsn; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c new file mode 100644 index 0000000000000..49d3b66add7ee --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c @@ -0,0 +1,707 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2025 Intel Corporation. */ + +#include +#include +#include + +#include "ixgbe.h" +#include "ixgbe_fw_update.h" + +struct ixgbe_fwu_priv { + struct pldmfw context; + + struct ixgbe_adapter *adapter; + struct netlink_ext_ack *extack; + + /* Track which NVM banks to activate at the end of the update */ + u8 activate_flags; + bool emp_reset_available; +}; + +/** + * ixgbe_send_package_data - Send record package data to firmware + * @context: PLDM fw update structure + * @data: pointer to the package data + * @length: length of the package data + * + * Send a copy of the package data associated with the PLDM record matching + * this device to the firmware. + * + * Note that this function sends an AdminQ command that will fail unless the + * NVM resource has been acquired. + * + * Return: zero on success, or a negative error code on failure. + */ +static int ixgbe_send_package_data(struct pldmfw *context, + const u8 *data, u16 length) +{ + struct ixgbe_fwu_priv *priv = container_of(context, + struct ixgbe_fwu_priv, + context); + struct ixgbe_adapter *adapter = priv->adapter; + struct ixgbe_hw *hw = &adapter->hw; + u8 *package_data; + int err; + + package_data = kmemdup(data, length, GFP_KERNEL); + if (!package_data) + return -ENOMEM; + + err = ixgbe_nvm_set_pkg_data(hw, false, package_data, length); + + kfree(package_data); + + return err; +} + +/** + * ixgbe_check_component_response - Report firmware response to a component + * @adapter: device private data structure + * @response: indicates whether this component can be updated + * @code: code indicating reason for response + * @extack: netlink extended ACK structure + * + * Check whether firmware indicates if this component can be updated. Report + * a suitable error message over the netlink extended ACK if the component + * cannot be updated. + * + * Return: 0 if the component can be updated, or -ECANCELED if the + * firmware indicates the component cannot be updated. + */ +static int ixgbe_check_component_response(struct ixgbe_adapter *adapter, + u8 response, u8 code, + struct netlink_ext_ack *extack) +{ + struct ixgbe_hw *hw = &adapter->hw; + + switch (response) { + case IXGBE_ACI_NVM_PASS_COMP_CAN_BE_UPDATED: + /* Firmware indicated this update is good to proceed. */ + return 0; + case IXGBE_ACI_NVM_PASS_COMP_CAN_MAY_BE_UPDATEABLE: + NL_SET_ERR_MSG_MOD(extack, + "Firmware recommends not updating, as it may result in a downgrade. Continuing anyways"); + return 0; + case IXGBE_ACI_NVM_PASS_COMP_CAN_NOT_BE_UPDATED: + NL_SET_ERR_MSG_MOD(extack, "Firmware has rejected updating."); + break; + case IXGBE_ACI_NVM_PASS_COMP_PARTIAL_CHECK: + if (hw->mac.ops.fw_recovery_mode && + hw->mac.ops.fw_recovery_mode(hw)) + return 0; + break; + } + + switch (code) { + case IXGBE_ACI_NVM_PASS_COMP_STAMP_IDENTICAL_CODE: + NL_SET_ERR_MSG_MOD(extack, + "Component comparison stamp is identical to running image"); + break; + case IXGBE_ACI_NVM_PASS_COMP_STAMP_LOWER: + NL_SET_ERR_MSG_MOD(extack, + "Component comparison stamp is lower than running image"); + break; + case IXGBE_ACI_NVM_PASS_COMP_INVALID_STAMP_CODE: + NL_SET_ERR_MSG_MOD(extack, + "Component comparison stamp is invalid"); + break; + case IXGBE_ACI_NVM_PASS_COMP_CONFLICT_CODE: + NL_SET_ERR_MSG_MOD(extack, + "Component table conflict occurred"); + break; + case IXGBE_ACI_NVM_PASS_COMP_PRE_REQ_NOT_MET_CODE: + NL_SET_ERR_MSG_MOD(extack, "Component pre-requisites not met"); + break; + case IXGBE_ACI_NVM_PASS_COMP_NOT_SUPPORTED_CODE: + NL_SET_ERR_MSG_MOD(extack, "Component not supported"); + break; + case IXGBE_ACI_NVM_PASS_COMP_CANNOT_DOWNGRADE_CODE: + NL_SET_ERR_MSG_MOD(extack, "Component cannot be downgraded"); + break; + case IXGBE_ACI_NVM_PASS_COMP_INCOMPLETE_IMAGE_CODE: + NL_SET_ERR_MSG_MOD(extack, "Incomplete component image"); + break; + case IXGBE_ACI_NVM_PASS_COMP_VER_STR_IDENTICAL_CODE: + NL_SET_ERR_MSG_MOD(extack, + "Component version is identical to running image"); + break; + case IXGBE_ACI_NVM_PASS_COMP_VER_STR_LOWER_CODE: + NL_SET_ERR_MSG_MOD(extack, + "Component version is lower than the running image"); + break; + default: + NL_SET_ERR_MSG_MOD(extack, + "Received unexpected response code from firmware"); + break; + } + + return -ECANCELED; +} + +/** + * ixgbe_send_component_table - Send PLDM component table to firmware + * @context: PLDM fw update structure + * @component: the component to process + * @transfer_flag: relative transfer order of this component + * + * Read relevant data from the component and forward it to the device + * firmware. Check the response to determine if the firmware indicates that + * the update can proceed. + * + * This function sends ACI commands related to the NVM, and assumes that + * the NVM resource has been acquired. + * + * Return: 0 on success, or a negative error code on failure. + */ +static int ixgbe_send_component_table(struct pldmfw *context, + struct pldmfw_component *component, + u8 transfer_flag) +{ + struct ixgbe_fwu_priv *priv = container_of(context, + struct ixgbe_fwu_priv, + context); + struct ixgbe_adapter *adapter = priv->adapter; + struct netlink_ext_ack *extack = priv->extack; + struct ixgbe_aci_cmd_nvm_comp_tbl *comp_tbl; + u8 comp_response, comp_response_code; + struct ixgbe_hw *hw = &adapter->hw; + size_t length; + int err; + + switch (component->identifier) { + case NVM_COMP_ID_OROM: + case NVM_COMP_ID_NVM: + case NVM_COMP_ID_NETLIST: + break; + default: + NL_SET_ERR_MSG_MOD(extack, + "Unable to update due to unknown firmware component"); + return -EOPNOTSUPP; + } + + length = struct_size(comp_tbl, cvs, component->version_len); + comp_tbl = kzalloc(length, GFP_KERNEL); + if (!comp_tbl) + return -ENOMEM; + + comp_tbl->comp_class = cpu_to_le16(component->classification); + comp_tbl->comp_id = cpu_to_le16(component->identifier); + comp_tbl->comp_class_idx = FWU_COMP_CLASS_IDX_NOT_USE; + comp_tbl->comp_cmp_stamp = cpu_to_le32(component->comparison_stamp); + comp_tbl->cvs_type = component->version_type; + comp_tbl->cvs_len = component->version_len; + + memcpy(comp_tbl->cvs, component->version_string, + component->version_len); + + err = ixgbe_nvm_pass_component_tbl(hw, (u8 *)comp_tbl, length, + transfer_flag, &comp_response, + &comp_response_code); + + kfree(comp_tbl); + + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to transfer component table to firmware"); + return -EIO; + } + + return ixgbe_check_component_response(adapter, + comp_response, + comp_response_code, extack); +} + +/** + * ixgbe_write_one_nvm_block - Write an NVM block and await completion response + * @adapter: the PF data structure + * @module: the module to write to + * @offset: offset in bytes + * @block_size: size of the block to write, up to 4k + * @block: pointer to block of data to write + * @last_cmd: whether this is the last command + * @extack: netlink extended ACK structure + * + * Write a block of data to a flash module, and await for the completion + * response message from firmware. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * On successful return, reset level indicates the device reset required to + * complete the update. + * + * 0 - IXGBE_ACI_NVM_POR_FLAG - A full power on is required + * 1 - IXGBE_ACI_NVM_PERST_FLAG - A cold PCIe reset is required + * 2 - IXGBE_ACI_NVM_EMPR_FLAG - An EMP reset is required + * + * Return: 0 on success, or a negative error code on failure. + */ +static int ixgbe_write_one_nvm_block(struct ixgbe_adapter *adapter, + u16 module, u32 offset, + u16 block_size, u8 *block, bool last_cmd, + struct netlink_ext_ack *extack) +{ + struct ixgbe_hw *hw = &adapter->hw; + + return ixgbe_aci_update_nvm(hw, module, offset, block_size, block, + last_cmd, 0); +} + +/** + * ixgbe_write_nvm_module - Write data to an NVM module + * @adapter: the PF driver structure + * @module: the module id to program + * @component: the name of the component being updated + * @image: buffer of image data to write to the NVM + * @length: length of the buffer + * @extack: netlink extended ACK structure + * + * Loop over the data for a given NVM module and program it in 4 Kb + * blocks. Notify devlink core of progress after each block is programmed. + * Loops over a block of data and programs the NVM in 4k block chunks. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * Return: 0 on success, or a negative error code on failure. + */ +static int ixgbe_write_nvm_module(struct ixgbe_adapter *adapter, u16 module, + const char *component, const u8 *image, + u32 length, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink = adapter->devlink; + u32 offset = 0; + bool last_cmd; + u8 *block; + int err; + + devlink_flash_update_status_notify(devlink, "Flashing", + component, 0, length); + + block = kzalloc(IXGBE_ACI_MAX_BUFFER_SIZE, GFP_KERNEL); + if (!block) + return -ENOMEM; + + do { + u32 block_size; + + block_size = min_t(u32, IXGBE_ACI_MAX_BUFFER_SIZE, + length - offset); + last_cmd = !(offset + block_size < length); + + memcpy(block, image + offset, block_size); + + err = ixgbe_write_one_nvm_block(adapter, module, offset, + block_size, block, last_cmd, + extack); + if (err) + break; + + offset += block_size; + + devlink_flash_update_status_notify(devlink, "Flashing", + component, offset, length); + } while (!last_cmd); + + if (err) + devlink_flash_update_status_notify(devlink, "Flashing failed", + component, length, length); + else + devlink_flash_update_status_notify(devlink, "Flashing done", + component, length, length); + + kfree(block); + + return err; +} + +/* Length in seconds to wait before timing out when erasing a flash module. + * Yes, erasing really can take minutes to complete. + */ +#define IXGBE_FW_ERASE_TIMEOUT 300 + +/** + * ixgbe_erase_nvm_module - Erase an NVM module and await firmware completion + * @adapter: the PF data structure + * @module: the module to erase + * @component: name of the component being updated + * @extack: netlink extended ACK structure + * + * Erase the inactive NVM bank associated with this module, and await for + * a completion response message from firmware. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * Return: 0 on success, or a negative error code on failure. + */ +static int ixgbe_erase_nvm_module(struct ixgbe_adapter *adapter, u16 module, + const char *component, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink = adapter->devlink; + struct ixgbe_hw *hw = &adapter->hw; + int err; + + devlink_flash_update_timeout_notify(devlink, "Erasing", component, + IXGBE_FW_ERASE_TIMEOUT); + + err = ixgbe_aci_erase_nvm(hw, module); + if (err) + devlink_flash_update_status_notify(devlink, "Erasing failed", + component, 0, 0); + else + devlink_flash_update_status_notify(devlink, "Erasing done", + component, 0, 0); + + return err; +} + +/** + * ixgbe_switch_flash_banks - Tell firmware to switch NVM banks + * @adapter: Pointer to the PF data structure + * @activate_flags: flags used for the activation command + * @emp_reset_available: on return, indicates if EMP reset is available + * @extack: netlink extended ACK structure + * + * Notify firmware to activate the newly written flash banks, and wait for the + * firmware response. + * + * Return: 0 on success or an error code on failure. + */ +static int ixgbe_switch_flash_banks(struct ixgbe_adapter *adapter, + u8 activate_flags, + bool *emp_reset_available, + struct netlink_ext_ack *extack) +{ + struct ixgbe_hw *hw = &adapter->hw; + u8 response_flags; + int err; + + err = ixgbe_nvm_write_activate(hw, activate_flags, &response_flags); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to switch active flash banks"); + return err; + } + + if (emp_reset_available) { + if (hw->dev_caps.common_cap.reset_restrict_support) + *emp_reset_available = + response_flags & IXGBE_ACI_NVM_EMPR_ENA; + else + *emp_reset_available = true; + } + + return 0; +} + +/** + * ixgbe_flash_component - Flash a component of the NVM + * @context: PLDM fw update structure + * @component: the component table to program + * + * Program the flash contents for a given component. First, determine the + * module id. Then, erase the secondary bank for this module. Finally, write + * the contents of the component to the NVM. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * Return: 0 on success, or a negative error code on failure. + */ +static int ixgbe_flash_component(struct pldmfw *context, + struct pldmfw_component *component) +{ + struct ixgbe_fwu_priv *priv = container_of(context, + struct ixgbe_fwu_priv, + context); + struct netlink_ext_ack *extack = priv->extack; + struct ixgbe_adapter *adapter = priv->adapter; + const char *name; + u16 module; + int err; + u8 flag; + + switch (component->identifier) { + case NVM_COMP_ID_OROM: + module = IXGBE_E610_SR_1ST_OROM_BANK_PTR; + flag = IXGBE_ACI_NVM_ACTIV_SEL_OROM; + name = "fw.undi"; + break; + case NVM_COMP_ID_NVM: + module = IXGBE_E610_SR_1ST_NVM_BANK_PTR; + flag = IXGBE_ACI_NVM_ACTIV_SEL_NVM; + name = "fw.mgmt"; + break; + case NVM_COMP_ID_NETLIST: + module = IXGBE_E610_SR_NETLIST_BANK_PTR; + flag = IXGBE_ACI_NVM_ACTIV_SEL_NETLIST; + name = "fw.netlist"; + break; + + default: + return -EOPNOTSUPP; + } + + /* Mark this component for activating at the end. */ + priv->activate_flags |= flag; + + err = ixgbe_erase_nvm_module(adapter, module, name, extack); + if (err) + return err; + + return ixgbe_write_nvm_module(adapter, module, name, + component->component_data, + component->component_size, extack); +} + +/** + * ixgbe_finalize_update - Perform last steps to complete device update + * @context: PLDM fw update structure + * + * Called as the last step of the update process. Complete the update by + * telling the firmware to switch active banks, and perform a reset of + * configured. + * + * Return: 0 on success, or an error code on failure. + */ +static int ixgbe_finalize_update(struct pldmfw *context) +{ + struct ixgbe_fwu_priv *priv = container_of(context, + struct ixgbe_fwu_priv, + context); + struct ixgbe_adapter *adapter = priv->adapter; + struct netlink_ext_ack *extack = priv->extack; + struct devlink *devlink = adapter->devlink; + int err; + + /* Finally, notify firmware to activate the written NVM banks */ + err = ixgbe_switch_flash_banks(adapter, priv->activate_flags, + &priv->emp_reset_available, extack); + if (err) + return err; + + adapter->fw_emp_reset_disabled = !priv->emp_reset_available; + + if (!adapter->fw_emp_reset_disabled) + devlink_flash_update_status_notify(devlink, + "Suggested is to activate new firmware by devlink reload, if it doesn't work then a power cycle is required", + NULL, 0, 0); + + return 0; +} + +static const struct pldmfw_ops ixgbe_fwu_ops_e610 = { + .match_record = &pldmfw_op_pci_match_record, + .send_package_data = &ixgbe_send_package_data, + .send_component_table = &ixgbe_send_component_table, + .flash_component = &ixgbe_flash_component, + .finalize_update = &ixgbe_finalize_update, +}; + +/** + * ixgbe_get_pending_updates - Check if the component has a pending update + * @adapter: the PF driver structure + * @pending: on return, bitmap of updates pending + * @extack: Netlink extended ACK + * + * Check if the device has any pending updates on any flash components. + * + * Return: 0 on success, or a negative error code on failure. Update + * pending with the bitmap of pending updates. + */ +int ixgbe_get_pending_updates(struct ixgbe_adapter *adapter, u8 *pending, + struct netlink_ext_ack *extack) +{ + struct ixgbe_hw_dev_caps *dev_caps; + struct ixgbe_hw *hw = &adapter->hw; + int err; + + dev_caps = kzalloc(sizeof(*dev_caps), GFP_KERNEL); + if (!dev_caps) + return -ENOMEM; + + err = ixgbe_discover_dev_caps(hw, dev_caps); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Unable to read device capabilities"); + kfree(dev_caps); + return -EIO; + } + + *pending = 0; + + if (dev_caps->common_cap.nvm_update_pending_nvm) + *pending |= IXGBE_ACI_NVM_ACTIV_SEL_NVM; + + if (dev_caps->common_cap.nvm_update_pending_orom) + *pending |= IXGBE_ACI_NVM_ACTIV_SEL_OROM; + + if (dev_caps->common_cap.nvm_update_pending_netlist) + *pending |= IXGBE_ACI_NVM_ACTIV_SEL_NETLIST; + + kfree(dev_caps); + + return 0; +} + +/** + * ixgbe_cancel_pending_update - Cancel any pending update for a component + * @adapter: the PF driver structure + * @component: if not NULL, the name of the component being updated + * @extack: Netlink extended ACK structure + * + * Cancel any pending update for the specified component. If component is + * NULL, all device updates will be canceled. + * + * Return: 0 on success, or a negative error code on failure. + */ +static int ixgbe_cancel_pending_update(struct ixgbe_adapter *adapter, + const char *component, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink = adapter->devlink; + struct ixgbe_hw *hw = &adapter->hw; + u8 pending; + int err; + + err = ixgbe_get_pending_updates(adapter, &pending, extack); + if (err) + return err; + + /* If the flash_update request is for a specific component, ignore all + * of the other components. + */ + if (component) { + if (strcmp(component, "fw.mgmt") == 0) + pending &= IXGBE_ACI_NVM_ACTIV_SEL_NVM; + else if (strcmp(component, "fw.undi") == 0) + pending &= IXGBE_ACI_NVM_ACTIV_SEL_OROM; + else if (strcmp(component, "fw.netlist") == 0) + pending &= IXGBE_ACI_NVM_ACTIV_SEL_NETLIST; + else + return -EINVAL; + } + + /* There is no previous pending update, so this request may continue */ + if (!pending) + return 0; + + /* In order to allow overwriting a previous pending update, notify + * firmware to cancel that update by issuing the appropriate command. + */ + devlink_flash_update_status_notify(devlink, + "Canceling previous pending update", + component, 0, 0); + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_WRITE); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to acquire device flash lock"); + return -EIO; + } + + pending |= IXGBE_ACI_NVM_REVERT_LAST_ACTIV; + err = ixgbe_switch_flash_banks(adapter, pending, NULL, extack); + + ixgbe_release_nvm(hw); + + return err; +} + +/** + * ixgbe_flash_pldm_image - Write a PLDM-formatted firmware image to the device + * @devlink: pointer to devlink associated with the device to update + * @params: devlink flash update parameters + * @extack: netlink extended ACK structure + * + * Parse the data for a given firmware file, verifying that it is a valid PLDM + * formatted image that matches this device. + * + * Extract the device record Package Data and Component Tables and send them + * to the firmware. Extract and write the flash data for each of the three + * main flash components, "fw.mgmt", "fw.undi", and "fw.netlist". Notify + * firmware once the data is written to the inactive banks. + * + * Return: 0 on success or a negative error code on failure. + */ +int ixgbe_flash_pldm_image(struct devlink *devlink, + struct devlink_flash_update_params *params, + struct netlink_ext_ack *extack) +{ + struct ixgbe_adapter *adapter = devlink_priv(devlink); + struct device *dev = &adapter->pdev->dev; + struct ixgbe_hw *hw = &adapter->hw; + struct ixgbe_fwu_priv priv; + u8 preservation; + int err; + + if (hw->mac.type != ixgbe_mac_e610) + return -EOPNOTSUPP; + + switch (params->overwrite_mask) { + case 0: + /* preserve all settings and identifiers */ + preservation = IXGBE_ACI_NVM_PRESERVE_ALL; + break; + case DEVLINK_FLASH_OVERWRITE_SETTINGS: + /* Overwrite settings, but preserve vital information such as + * device identifiers. + */ + preservation = IXGBE_ACI_NVM_PRESERVE_SELECTED; + break; + case (DEVLINK_FLASH_OVERWRITE_SETTINGS | + DEVLINK_FLASH_OVERWRITE_IDENTIFIERS): + /* overwrite both settings and identifiers, preserve nothing */ + preservation = IXGBE_ACI_NVM_NO_PRESERVATION; + break; + default: + NL_SET_ERR_MSG_MOD(extack, + "Requested overwrite mask is not supported"); + return -EOPNOTSUPP; + } + + /* Cannot get caps in recovery mode, so lack of nvm_unified_update bit + * cannot lead to error + */ + if (!hw->dev_caps.common_cap.nvm_unified_update && + (hw->mac.ops.fw_recovery_mode && + !hw->mac.ops.fw_recovery_mode(hw))) { + NL_SET_ERR_MSG_MOD(extack, + "Current firmware does not support unified update"); + return -EOPNOTSUPP; + } + + memset(&priv, 0, sizeof(priv)); + + priv.context.ops = &ixgbe_fwu_ops_e610; + priv.context.dev = dev; + priv.extack = extack; + priv.adapter = adapter; + priv.activate_flags = preservation; + + devlink_flash_update_status_notify(devlink, + "Preparing to flash", NULL, 0, 0); + + err = ixgbe_cancel_pending_update(adapter, NULL, extack); + if (err) + return err; + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_WRITE); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to acquire device flash lock"); + return -EIO; + } + + err = pldmfw_flash_image(&priv.context, params->fw); + if (err == -ENOENT) { + NL_SET_ERR_MSG_MOD(extack, + "Firmware image has no record matching this device"); + } else if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to flash PLDM image"); + } + + ixgbe_release_nvm(hw); + + return err; +} diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.h new file mode 100644 index 0000000000000..abdd708c93dff --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2025 Intel Corporation. */ + +#ifndef _IXGBE_FW_UPDATE_H_ +#define _IXGBE_FW_UPDATE_H_ + +int ixgbe_flash_pldm_image(struct devlink *devlink, + struct devlink_flash_update_params *params, + struct netlink_ext_ack *extack); +int ixgbe_get_pending_updates(struct ixgbe_adapter *adapter, u8 *pending, + struct netlink_ext_ack *extack); +#endif diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 07ea1954a276e..648a7c618cd18 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -478,7 +478,7 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state *xs, static int ixgbe_ipsec_check_mgmt_ip(struct xfrm_state *xs) { struct net_device *dev = xs->xso.real_dev; - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_hw *hw = &adapter->hw; u32 mfval, manc, reg; int num_filters = 4; @@ -563,7 +563,7 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs, struct netlink_ext_ack *extack) { struct net_device *dev = xs->xso.real_dev; - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_ipsec *ipsec = adapter->ipsec; struct ixgbe_hw *hw = &adapter->hw; int checked, match, first; @@ -757,7 +757,7 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs, static void ixgbe_ipsec_del_sa(struct xfrm_state *xs) { struct net_device *dev = xs->xso.real_dev; - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_ipsec *ipsec = adapter->ipsec; struct ixgbe_hw *hw = &adapter->hw; u32 zerobuf[4] = {0, 0, 0, 0}; @@ -1052,7 +1052,7 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring, struct ixgbe_tx_buffer *first, struct ixgbe_ipsec_tx_data *itd) { - struct ixgbe_adapter *adapter = netdev_priv(tx_ring->netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(tx_ring->netdev); struct ixgbe_ipsec *ipsec = adapter->ipsec; struct xfrm_state *xs; struct sec_path *sp; @@ -1142,7 +1142,7 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring, union ixgbe_adv_rx_desc *rx_desc, struct sk_buff *skb) { - struct ixgbe_adapter *adapter = netdev_priv(rx_ring->netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(rx_ring->netdev); __le16 pkt_info = rx_desc->wb.lower.lo_dword.hs_rss.pkt_info; __le16 ipsec_pkt_types = cpu_to_le16(IXGBE_RXDADV_PKTTYPE_IPSEC_AH | IXGBE_RXDADV_PKTTYPE_IPSEC_ESP); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index a2718218963ed..cdfafc477ee0d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -49,6 +49,7 @@ #include "ixgbe_sriov.h" #include "ixgbe_model.h" #include "ixgbe_txrx_common.h" +#include "devlink/devlink.h" char ixgbe_driver_name[] = "ixgbe"; static const char ixgbe_driver_string[] = @@ -1095,7 +1096,7 @@ static void ixgbe_tx_timeout_reset(struct ixgbe_adapter *adapter) static int ixgbe_tx_maxrate(struct net_device *netdev, int queue_index, u32 maxrate) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; u32 bcnrc_val = ixgbe_link_mbps(adapter); @@ -4678,7 +4679,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter) static int ixgbe_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; /* add VID to filter table */ @@ -4737,7 +4738,7 @@ void ixgbe_update_pf_promisc_vlvf(struct ixgbe_adapter *adapter, u32 vid) static int ixgbe_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; /* remove VID from filter table */ @@ -4962,7 +4963,7 @@ static void ixgbe_restore_vlan(struct ixgbe_adapter *adapter) **/ static int ixgbe_write_mc_addr_list(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; if (!netif_running(netdev)) @@ -5138,7 +5139,7 @@ int ixgbe_del_mac_filter(struct ixgbe_adapter *adapter, static int ixgbe_uc_sync(struct net_device *netdev, const unsigned char *addr) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); int ret; ret = ixgbe_add_mac_filter(adapter, addr, VMDQ_P(0)); @@ -5148,7 +5149,7 @@ static int ixgbe_uc_sync(struct net_device *netdev, const unsigned char *addr) static int ixgbe_uc_unsync(struct net_device *netdev, const unsigned char *addr) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); ixgbe_del_mac_filter(adapter, addr, VMDQ_P(0)); @@ -5166,7 +5167,7 @@ static int ixgbe_uc_unsync(struct net_device *netdev, const unsigned char *addr) **/ void ixgbe_set_rx_mode(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; u32 fctrl, vmolr = IXGBE_VMOLR_BAM | IXGBE_VMOLR_AUPE; netdev_features_t features = netdev->features; @@ -5268,7 +5269,7 @@ static void ixgbe_napi_disable_all(struct ixgbe_adapter *adapter) static int ixgbe_udp_tunnel_sync(struct net_device *dev, unsigned int table) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_hw *hw = &adapter->hw; struct udp_tunnel_info ti; @@ -6600,7 +6601,7 @@ static void ixgbe_set_eee_capable(struct ixgbe_adapter *adapter) **/ static void ixgbe_tx_timeout(struct net_device *netdev, unsigned int __always_unused txqueue) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); /* Do the reset outside of interrupt context */ ixgbe_tx_timeout_reset(adapter); @@ -6849,7 +6850,7 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter, adapter->tx_work_limit = IXGBE_DEFAULT_TX_WORK; /* initialize eeprom parameters */ - if (ixgbe_init_eeprom_params_generic(hw)) { + if (hw->eeprom.ops.init_params(hw)) { e_dev_err("EEPROM initialization failed\n"); return -EIO; } @@ -7165,7 +7166,7 @@ static int ixgbe_max_xdp_frame_size(struct ixgbe_adapter *adapter) **/ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (ixgbe_enabled_xdp_adapter(adapter)) { int new_frame_size = new_mtu + IXGBE_PKT_HDR_PAD; @@ -7212,7 +7213,7 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu) **/ int ixgbe_open(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; int err, queues; @@ -7316,7 +7317,7 @@ static void ixgbe_close_suspend(struct ixgbe_adapter *adapter) **/ int ixgbe_close(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); ixgbe_ptp_stop(adapter); @@ -8364,6 +8365,34 @@ static void ixgbe_reset_subtask(struct ixgbe_adapter *adapter) rtnl_unlock(); } +static int ixgbe_check_fw_api_mismatch(struct ixgbe_adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + + if (hw->mac.type != ixgbe_mac_e610) + return 0; + + if (hw->mac.ops.get_fw_ver && hw->mac.ops.get_fw_ver(hw)) + return 0; + + if (hw->api_maj_ver > IXGBE_FW_API_VER_MAJOR) { + e_dev_err("The driver for the device stopped because the NVM image is newer than expected. You must install the most recent version of the network driver.\n"); + + adapter->flags2 |= IXGBE_FLAG2_API_MISMATCH; + return -EOPNOTSUPP; + } else if (hw->api_maj_ver == IXGBE_FW_API_VER_MAJOR && + hw->api_min_ver > IXGBE_FW_API_VER_MINOR + IXGBE_FW_API_VER_DIFF_ALLOWED) { + e_dev_info("The driver for the device detected a newer version of the NVM image than expected. Please install the most recent version of the network driver.\n"); + adapter->flags2 |= IXGBE_FLAG2_API_MISMATCH; + } else if (hw->api_maj_ver < IXGBE_FW_API_VER_MAJOR || + hw->api_min_ver < IXGBE_FW_API_VER_MINOR - IXGBE_FW_API_VER_DIFF_ALLOWED) { + e_dev_info("The driver for the device detected an older version of the NVM image than expected. Please update the NVM image.\n"); + adapter->flags2 |= IXGBE_FLAG2_API_MISMATCH; + } + + return 0; +} + /** * ixgbe_check_fw_error - Check firmware for errors * @adapter: the adapter private structure @@ -8374,12 +8403,14 @@ static bool ixgbe_check_fw_error(struct ixgbe_adapter *adapter) { struct ixgbe_hw *hw = &adapter->hw; u32 fwsm; + int err; /* read fwsm.ext_err_ind register and log errors */ fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM(hw)); + /* skip if E610's FW is reloading, warning in that case may be misleading */ if (fwsm & IXGBE_FWSM_EXT_ERR_IND_MASK || - !(fwsm & IXGBE_FWSM_FW_VAL_BIT)) + (!(fwsm & IXGBE_FWSM_FW_VAL_BIT) && !(hw->mac.type == ixgbe_mac_e610))) e_dev_warn("Warning firmware error detected FWSM: 0x%08X\n", fwsm); @@ -8387,10 +8418,53 @@ static bool ixgbe_check_fw_error(struct ixgbe_adapter *adapter) e_dev_err("Firmware recovery mode detected. Limiting functionality. Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode.\n"); return true; } + if (!(adapter->flags2 & IXGBE_FLAG2_API_MISMATCH)) { + err = ixgbe_check_fw_api_mismatch(adapter); + if (err) + return true; + } + + /* return here if FW rollback mode has been already detected */ + if (adapter->flags2 & IXGBE_FLAG2_FW_ROLLBACK) + return false; + + if (hw->mac.ops.fw_rollback_mode && hw->mac.ops.fw_rollback_mode(hw)) { + struct ixgbe_nvm_info *nvm_info = &adapter->hw.flash.nvm; + char ver_buff[64] = ""; + + if (hw->mac.ops.get_fw_ver && hw->mac.ops.get_fw_ver(hw)) + goto no_version; + + if (hw->mac.ops.get_nvm_ver && + hw->mac.ops.get_nvm_ver(hw, nvm_info)) + goto no_version; + + snprintf(ver_buff, sizeof(ver_buff), + "Current version is NVM:%x.%x.%x, FW:%d.%d. ", + nvm_info->major, nvm_info->minor, nvm_info->eetrack, + hw->fw_maj_ver, hw->fw_maj_ver); +no_version: + e_dev_warn("Firmware rollback mode detected. %sDevice may exhibit limited functionality. Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware rollback mode.", + ver_buff); + + adapter->flags2 |= IXGBE_FLAG2_FW_ROLLBACK; + } return false; } +static void ixgbe_recovery_service_task(struct work_struct *work) +{ + struct ixgbe_adapter *adapter = container_of(work, + struct ixgbe_adapter, + service_task); + + ixgbe_handle_fw_event(adapter); + ixgbe_service_event_complete(adapter); + + mod_timer(&adapter->service_timer, jiffies + msecs_to_jiffies(100)); +} + /** * ixgbe_service_task - manages and runs subtasks * @work: pointer to work_struct containing our data @@ -8410,8 +8484,13 @@ static void ixgbe_service_task(struct work_struct *work) return; } if (ixgbe_check_fw_error(adapter)) { - if (!test_bit(__IXGBE_DOWN, &adapter->state)) + if (!test_bit(__IXGBE_DOWN, &adapter->state)) { + if (adapter->mii_bus) { + mdiobus_unregister(adapter->mii_bus); + adapter->mii_bus = NULL; + } unregister_netdev(adapter->netdev); + } ixgbe_service_event_complete(adapter); return; } @@ -9001,7 +9080,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, switch (vlan_get_protocol(skb)) { case htons(ETH_P_FCOE): case htons(ETH_P_FIP): - adapter = netdev_priv(dev); + adapter = ixgbe_from_netdev(dev); if (!sb_dev && (adapter->flags & IXGBE_FLAG_FCOE_ENABLED)) break; @@ -9260,7 +9339,7 @@ static netdev_tx_t __ixgbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev, struct ixgbe_ring *ring) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_ring *tx_ring; /* @@ -9292,7 +9371,7 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb, **/ static int ixgbe_set_mac(struct net_device *netdev, void *p) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; struct sockaddr *addr = p; @@ -9310,7 +9389,7 @@ static int ixgbe_set_mac(struct net_device *netdev, void *p) static int ixgbe_mdio_read(struct net_device *netdev, int prtad, int devad, u16 addr) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; u16 value; int rc; @@ -9336,7 +9415,7 @@ ixgbe_mdio_read(struct net_device *netdev, int prtad, int devad, u16 addr) static int ixgbe_mdio_write(struct net_device *netdev, int prtad, int devad, u16 addr, u16 value) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; if (adapter->mii_bus) { @@ -9356,7 +9435,7 @@ static int ixgbe_mdio_write(struct net_device *netdev, int prtad, int devad, static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *req, int cmd) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); switch (cmd) { case SIOCSHWTSTAMP: @@ -9382,7 +9461,7 @@ static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *req, int cmd) static int ixgbe_add_sanmac_netdev(struct net_device *dev) { int err = 0; - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_hw *hw = &adapter->hw; if (is_valid_ether_addr(hw->mac.san_addr)) { @@ -9406,7 +9485,7 @@ static int ixgbe_add_sanmac_netdev(struct net_device *dev) static int ixgbe_del_sanmac_netdev(struct net_device *dev) { int err = 0; - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_mac_info *mac = &adapter->hw.mac; if (is_valid_ether_addr(mac->san_addr)) { @@ -9437,7 +9516,7 @@ static void ixgbe_get_ring_stats64(struct rtnl_link_stats64 *stats, static void ixgbe_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); int i; rcu_read_lock(); @@ -9480,7 +9559,7 @@ static void ixgbe_get_stats64(struct net_device *netdev, static int ixgbe_ndo_get_vf_stats(struct net_device *netdev, int vf, struct ifla_vf_stats *vf_stats) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (vf < 0 || vf >= adapter->num_vfs) return -EINVAL; @@ -9597,7 +9676,7 @@ static int ixgbe_reassign_macvlan_pool(struct net_device *vdev, static void ixgbe_defrag_macvlan_pools(struct net_device *dev) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct netdev_nested_priv priv = { .data = (void *)adapter, }; @@ -9618,7 +9697,7 @@ static void ixgbe_defrag_macvlan_pools(struct net_device *dev) */ int ixgbe_setup_tc(struct net_device *dev, u8 tc) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_hw *hw = &adapter->hw; /* Hardware supports up to 8 traffic classes */ @@ -10176,7 +10255,7 @@ static LIST_HEAD(ixgbe_block_cb_list); static int __ixgbe_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); switch (type) { case TC_SETUP_BLOCK: @@ -10204,7 +10283,7 @@ void ixgbe_sriov_reinit(struct ixgbe_adapter *adapter) #endif void ixgbe_do_reset(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (netif_running(netdev)) ixgbe_reinit_locked(adapter); @@ -10215,7 +10294,7 @@ void ixgbe_do_reset(struct net_device *netdev) static netdev_features_t ixgbe_fix_features(struct net_device *netdev, netdev_features_t features) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); /* If Rx checksum is disabled, then RSC/LRO should also be disabled */ if (!(features & NETIF_F_RXCSUM)) @@ -10252,7 +10331,7 @@ static void ixgbe_reset_l2fw_offload(struct ixgbe_adapter *adapter) static int ixgbe_set_features(struct net_device *netdev, netdev_features_t features) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); netdev_features_t changed = netdev->features ^ features; bool need_reset = false; @@ -10328,7 +10407,7 @@ static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], { /* guarantee we can provide a unique filter for the unicast address */ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); u16 pool = VMDQ_P(0); if (netdev_uc_count(dev) >= ixgbe_available_rars(adapter, pool)) @@ -10416,7 +10495,7 @@ static int ixgbe_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, struct netlink_ext_ack *extack) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct nlattr *attr, *br_spec; int rem; @@ -10444,7 +10523,7 @@ static int ixgbe_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED)) return 0; @@ -10456,7 +10535,7 @@ static int ixgbe_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev) { - struct ixgbe_adapter *adapter = netdev_priv(pdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(pdev); struct ixgbe_fwd_adapter *accel; int tcs = adapter->hw_tcs ? : 1; int pool, err; @@ -10553,7 +10632,7 @@ static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev) static void ixgbe_fwd_del(struct net_device *pdev, void *priv) { struct ixgbe_fwd_adapter *accel = priv; - struct ixgbe_adapter *adapter = netdev_priv(pdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(pdev); unsigned int rxbase = accel->rx_base_queue; unsigned int i; @@ -10631,7 +10710,7 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev, static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) { int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct bpf_prog *old_prog; bool need_reset; int num_queues; @@ -10703,7 +10782,7 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); switch (xdp->command) { case XDP_SETUP_PROG: @@ -10738,7 +10817,7 @@ void ixgbe_xdp_ring_update_tail_locked(struct ixgbe_ring *ring) static int ixgbe_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_ring *ring; int nxmit = 0; int i; @@ -11146,7 +11225,7 @@ bool ixgbe_wol_supported(struct ixgbe_adapter *adapter, u16 device_id, * format to display. The FW version is taken from the EEPROM/NVM. * */ -static void ixgbe_set_fw_version_e610(struct ixgbe_adapter *adapter) +void ixgbe_set_fw_version_e610(struct ixgbe_adapter *adapter) { struct ixgbe_orom_info *orom = &adapter->hw.flash.orom; struct ixgbe_nvm_info *nvm = &adapter->hw.flash.nvm; @@ -11196,6 +11275,65 @@ static void ixgbe_set_fw_version(struct ixgbe_adapter *adapter) "0x%08x", nvm_ver.etk_id); } +/** + * ixgbe_recovery_probe - Handle FW recovery mode during probe + * @adapter: the adapter private structure + * + * Perform limited driver initialization when FW error is detected. + * + * Return: 0 on successful probe for E610, -EIO if recovery mode is detected + * for non-E610 adapter, error status code on any other case. + */ +static int ixgbe_recovery_probe(struct ixgbe_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; + struct ixgbe_hw *hw = &adapter->hw; + bool disable_dev; + int err = -EIO; + + if (hw->mac.type != ixgbe_mac_e610) + goto clean_up_probe; + + ixgbe_get_hw_control(adapter); + mutex_init(&hw->aci.lock); + err = ixgbe_get_flash_data(&adapter->hw); + if (err) + goto shutdown_aci; + + timer_setup(&adapter->service_timer, ixgbe_service_timer, 0); + INIT_WORK(&adapter->service_task, ixgbe_recovery_service_task); + set_bit(__IXGBE_SERVICE_INITED, &adapter->state); + clear_bit(__IXGBE_SERVICE_SCHED, &adapter->state); + + if (hw->mac.ops.get_bus_info) + hw->mac.ops.get_bus_info(hw); + + pci_set_drvdata(pdev, adapter); + /* We are creating devlink interface so NIC can be managed, + * e.g. new NVM image loaded + */ + devl_lock(adapter->devlink); + ixgbe_devlink_register_port(adapter); + SET_NETDEV_DEVLINK_PORT(adapter->netdev, + &adapter->devlink_port); + devl_register(adapter->devlink); + devl_unlock(adapter->devlink); + + return 0; +shutdown_aci: + mutex_destroy(&adapter->hw.aci.lock); + ixgbe_release_hw_control(adapter); + devlink_free(adapter->devlink); +clean_up_probe: + disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state); + free_netdev(netdev); + pci_release_mem_regions(pdev); + if (disable_dev) + pci_disable_device(pdev); + return err; +} + /** * ixgbe_probe - Device Initialization Routine * @pdev: PCI device information struct @@ -11210,6 +11348,7 @@ static void ixgbe_set_fw_version(struct ixgbe_adapter *adapter) static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { struct net_device *netdev; + struct ixgbe_netdevice_priv *netdev_priv_wrapper; struct ixgbe_adapter *adapter = NULL; struct ixgbe_hw *hw; const struct ixgbe_info *ii = ixgbe_info_tbl[ent->driver_data]; @@ -11263,7 +11402,13 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) indices = IXGBE_MAX_RSS_INDICES_X550; } - netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), indices); + adapter = ixgbe_allocate_devlink(&pdev->dev); + if (IS_ERR(adapter)) { + err = PTR_ERR(adapter); + goto err_devlink; + } + + netdev = alloc_etherdev_mq(sizeof(*netdev_priv_wrapper), indices); if (!netdev) { err = -ENOMEM; goto err_alloc_etherdev; @@ -11271,7 +11416,8 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) SET_NETDEV_DEV(netdev, &pdev->dev); - adapter = netdev_priv(netdev); + netdev_priv_wrapper = netdev_priv(netdev); + netdev_priv_wrapper->adapter = adapter; adapter->netdev = netdev; adapter->pdev = pdev; @@ -11326,10 +11472,21 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_sw_init; + /* Make sure the SWFW semaphore is in a valid state */ + if (hw->mac.ops.init_swfw_sync) + hw->mac.ops.init_swfw_sync(hw); + + if (ixgbe_check_fw_error(adapter)) + return ixgbe_recovery_probe(adapter); + if (adapter->hw.mac.type == ixgbe_mac_e610) { err = ixgbe_get_caps(&adapter->hw); if (err) dev_err(&pdev->dev, "ixgbe_get_caps failed %d\n", err); + + err = ixgbe_get_flash_data(&adapter->hw); + if (err) + goto err_sw_init; } if (adapter->hw.mac.type == ixgbe_mac_82599EB) @@ -11348,10 +11505,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) break; } - /* Make sure the SWFW semaphore is in a valid state */ - if (hw->mac.ops.init_swfw_sync) - hw->mac.ops.init_swfw_sync(hw); - /* Make it possible the adapter to be woken up via WOL */ switch (adapter->hw.mac.type) { case ixgbe_mac_82599EB: @@ -11504,11 +11657,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) netdev->features |= NETIF_F_LRO; - if (ixgbe_check_fw_error(adapter)) { - err = -EIO; - goto err_sw_init; - } - /* make sure the EEPROM is good */ if (hw->eeprom.ops.validate_checksum(hw, NULL) < 0) { e_dev_err("The EEPROM Checksum Is Not Valid\n"); @@ -11591,7 +11739,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (expected_gts > 0) ixgbe_check_minimum_link(adapter, expected_gts); - err = ixgbe_read_pba_string_generic(hw, part_str, sizeof(part_str)); + err = hw->eeprom.ops.read_pba_string(hw, part_str, sizeof(part_str)); if (err) strscpy(part_str, "Unknown", sizeof(part_str)); if (ixgbe_is_sfp(hw) && hw->phy.sfp_type != ixgbe_sfp_type_not_present) @@ -11617,6 +11765,11 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } strcpy(netdev->name, "eth%d"); pci_set_drvdata(pdev, adapter); + + devl_lock(adapter->devlink); + ixgbe_devlink_register_port(adapter); + SET_NETDEV_DEVLINK_PORT(adapter->netdev, &adapter->devlink_port); + err = register_netdev(netdev); if (err) goto err_register; @@ -11671,11 +11824,15 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_netdev; + devl_register(adapter->devlink); + devl_unlock(adapter->devlink); return 0; err_netdev: unregister_netdev(netdev); err_register: + devl_port_unregister(&adapter->devlink_port); + devl_unlock(adapter->devlink); ixgbe_release_hw_control(adapter); ixgbe_clear_interrupt_scheme(adapter); if (hw->mac.type == ixgbe_mac_e610) @@ -11692,7 +11849,9 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state); free_netdev(netdev); err_alloc_etherdev: + devlink_free(adapter->devlink); pci_release_mem_regions(pdev); +err_devlink: err_pci_reg: err_dma: if (!adapter || disable_dev) @@ -11721,6 +11880,8 @@ static void ixgbe_remove(struct pci_dev *pdev) return; netdev = adapter->netdev; + devl_lock(adapter->devlink); + devl_unregister(adapter->devlink); ixgbe_dbg_adapter_exit(adapter); set_bit(__IXGBE_REMOVING, &adapter->state); @@ -11756,6 +11917,10 @@ static void ixgbe_remove(struct pci_dev *pdev) if (netdev->reg_state == NETREG_REGISTERED) unregister_netdev(netdev); + devl_port_unregister(&adapter->devlink_port); + devl_unlock(adapter->devlink); + devlink_free(adapter->devlink); + ixgbe_stop_ipsec_offload(adapter); ixgbe_clear_interrupt_scheme(adapter); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c index 0a03a8bb5f886..2d54828bdfbbc 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c @@ -167,7 +167,7 @@ int ixgbe_write_i2c_combined_generic_int(struct ixgbe_hw *hw, u8 addr, u16 reg, u16 val, bool lock) { u32 swfw_mask = hw->phy.phy_semaphore_mask; - int max_retry = 1; + int max_retry = 3; int retry = 0; u8 reg_high; u8 csum; @@ -2285,7 +2285,7 @@ static int ixgbe_write_i2c_byte_generic_int(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr, u8 data, bool lock) { u32 swfw_mask = hw->phy.phy_semaphore_mask; - u32 max_retry = 1; + u32 max_retry = 3; u32 retry = 0; int status; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index ccdce80edd142..0dbbd2befd4df 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -1418,7 +1418,7 @@ void ixgbe_set_all_vfs(struct ixgbe_adapter *adapter) int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); int retval; if (vf >= adapter->num_vfs) @@ -1526,7 +1526,7 @@ int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, __be16 vlan_proto) { int err = 0; - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if ((vf >= adapter->num_vfs) || (vlan > 4095) || (qos > 7)) return -EINVAL; @@ -1644,7 +1644,7 @@ void ixgbe_check_vf_rate_limit(struct ixgbe_adapter *adapter) int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate, int max_tx_rate) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); int link_speed; /* verify VF is active */ @@ -1679,7 +1679,7 @@ int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate, int ixgbe_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_hw *hw = &adapter->hw; if (vf >= adapter->num_vfs) @@ -1757,7 +1757,7 @@ void ixgbe_set_vf_link_state(struct ixgbe_adapter *adapter, int vf, int state) **/ int ixgbe_ndo_set_vf_link_state(struct net_device *netdev, int vf, int state) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); int ret = 0; if (vf < 0 || vf >= adapter->num_vfs) { @@ -1794,7 +1794,7 @@ int ixgbe_ndo_set_vf_link_state(struct net_device *netdev, int vf, int state) int ixgbe_ndo_set_vf_rss_query_en(struct net_device *netdev, int vf, bool setting) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); /* This operation is currently supported only for 82599 and x540 * devices. @@ -1813,7 +1813,7 @@ int ixgbe_ndo_set_vf_rss_query_en(struct net_device *netdev, int vf, int ixgbe_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (vf >= adapter->num_vfs) return -EINVAL; @@ -1836,7 +1836,7 @@ int ixgbe_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting) int ixgbe_ndo_get_vf_config(struct net_device *netdev, int vf, struct ifla_vf_info *ivi) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (vf >= adapter->num_vfs) return -EINVAL; ivi->vf = vf; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h index 5fdf32d79d822..892fa6c1f8797 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h @@ -3446,6 +3446,8 @@ struct ixgbe_eeprom_operations { int (*validate_checksum)(struct ixgbe_hw *, u16 *); int (*update_checksum)(struct ixgbe_hw *); int (*calc_checksum)(struct ixgbe_hw *); + int (*read_pba_string)(struct ixgbe_hw *hw, u8 *pba_num, + u32 pba_num_size); }; struct ixgbe_mac_operations { @@ -3454,6 +3456,7 @@ struct ixgbe_mac_operations { int (*start_hw)(struct ixgbe_hw *); int (*clear_hw_cntrs)(struct ixgbe_hw *); enum ixgbe_media_type (*get_media_type)(struct ixgbe_hw *); + int (*get_fw_ver)(struct ixgbe_hw *hw); int (*get_mac_addr)(struct ixgbe_hw *, u8 *); int (*get_san_mac_addr)(struct ixgbe_hw *, u8 *); int (*get_device_caps)(struct ixgbe_hw *, u16 *); @@ -3522,6 +3525,8 @@ struct ixgbe_mac_operations { int (*get_thermal_sensor_data)(struct ixgbe_hw *); int (*init_thermal_sensor_thresh)(struct ixgbe_hw *hw); bool (*fw_recovery_mode)(struct ixgbe_hw *hw); + bool (*fw_rollback_mode)(struct ixgbe_hw *hw); + int (*get_nvm_ver)(struct ixgbe_hw *hw, struct ixgbe_nvm_info *nvm); void (*disable_rx)(struct ixgbe_hw *hw); void (*enable_rx)(struct ixgbe_hw *hw); void (*set_source_address_pruning)(struct ixgbe_hw *, bool, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index 617e07878e4f7..bea94e5ccb730 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -10,11 +10,75 @@ #define IXGBE_MAX_VSI 768 /* Checksum and Shadow RAM pointers */ -#define E610_SR_SW_CHECKSUM_WORD 0x3F +#define IXGBE_E610_SR_NVM_CTRL_WORD 0x00 +#define IXGBE_E610_SR_PBA_BLOCK_PTR 0x16 +#define IXGBE_E610_SR_PBA_BLOCK_MASK GENMASK(15, 8) +#define IXGBE_E610_SR_NVM_DEV_STARTER_VER 0x18 +#define IXGBE_E610_SR_NVM_EETRACK_LO 0x2D +#define IXGBE_E610_SR_NVM_EETRACK_HI 0x2E +#define IXGBE_E610_NVM_VER_LO_MASK GENMASK(7, 0) +#define IXGBE_E610_NVM_VER_HI_MASK GENMASK(15, 12) +#define IXGBE_E610_SR_SW_CHECKSUM_WORD 0x3F +#define IXGBE_E610_SR_PFA_PTR 0x40 +#define IXGBE_E610_SR_1ST_NVM_BANK_PTR 0x42 +#define IXGBE_E610_SR_NVM_BANK_SIZE 0x43 +#define IXGBE_E610_SR_1ST_OROM_BANK_PTR 0x44 +#define IXGBE_E610_SR_OROM_BANK_SIZE 0x45 +#define IXGBE_E610_SR_NETLIST_BANK_PTR 0x46 +#define IXGBE_E610_SR_NETLIST_BANK_SIZE 0x47 + +/* The OROM version topology */ +#define IXGBE_OROM_VER_PATCH_MASK GENMASK_ULL(7, 0) +#define IXGBE_OROM_VER_BUILD_MASK GENMASK_ULL(23, 8) +#define IXGBE_OROM_VER_MASK GENMASK_ULL(31, 24) + +/* CSS Header words */ +#define IXGBE_NVM_CSS_HDR_LEN_L 0x02 +#define IXGBE_NVM_CSS_HDR_LEN_H 0x03 +#define IXGBE_NVM_CSS_SREV_L 0x14 +#define IXGBE_NVM_CSS_SREV_H 0x15 + +#define IXGBE_HDR_LEN_ROUNDUP 32 + +/* Length of Authentication header section in words */ +#define IXGBE_NVM_AUTH_HEADER_LEN 0x08 /* Shadow RAM related */ #define IXGBE_SR_WORDS_IN_1KB 512 +/* The Netlist ID Block is located after all of the Link Topology nodes. */ +#define IXGBE_NETLIST_ID_BLK_SIZE 0x30 +#define IXGBE_NETLIST_ID_BLK_OFFSET(n) IXGBE_NETLIST_LINK_TOPO_OFFSET(0x0004 + 2 * (n)) + +/* netlist ID block field offsets (word offsets) */ +#define IXGBE_NETLIST_ID_BLK_MAJOR_VER_LOW 0x02 +#define IXGBE_NETLIST_ID_BLK_MAJOR_VER_HIGH 0x03 +#define IXGBE_NETLIST_ID_BLK_MINOR_VER_LOW 0x04 +#define IXGBE_NETLIST_ID_BLK_MINOR_VER_HIGH 0x05 +#define IXGBE_NETLIST_ID_BLK_TYPE_LOW 0x06 +#define IXGBE_NETLIST_ID_BLK_TYPE_HIGH 0x07 +#define IXGBE_NETLIST_ID_BLK_REV_LOW 0x08 +#define IXGBE_NETLIST_ID_BLK_REV_HIGH 0x09 +#define IXGBE_NETLIST_ID_BLK_SHA_HASH_WORD(n) (0x0A + (n)) +#define IXGBE_NETLIST_ID_BLK_CUST_VER 0x2F + +/* The Link Topology Netlist section is stored as a series of words. It is + * stored in the NVM as a TLV, with the first two words containing the type + * and length. + */ +#define IXGBE_NETLIST_LINK_TOPO_MOD_ID 0x011B +#define IXGBE_NETLIST_TYPE_OFFSET 0x0000 +#define IXGBE_NETLIST_LEN_OFFSET 0x0001 + +/* The Link Topology section follows the TLV header. When reading the netlist + * using ixgbe_read_netlist_module, we need to account for the 2-word TLV + * header. + */ +#define IXGBE_NETLIST_LINK_TOPO_OFFSET(n) ((n) + 2) +#define IXGBE_LINK_TOPO_MODULE_LEN IXGBE_NETLIST_LINK_TOPO_OFFSET(0x0000) +#define IXGBE_LINK_TOPO_NODE_COUNT IXGBE_NETLIST_LINK_TOPO_OFFSET(0x0001) +#define IXGBE_LINK_TOPO_NODE_COUNT_M GENMASK_ULL(9, 0) + /* Firmware Status Register (GL_FWSTS) */ #define GL_FWSTS 0x00083048 /* Reset Source: POR */ #define GL_FWSTS_EP_PF0 BIT(24) @@ -24,11 +88,23 @@ #define GLNVM_GENS 0x000B6100 /* Reset Source: POR */ #define GLNVM_GENS_SR_SIZE_M GENMASK(7, 5) +#define IXGBE_GL_MNG_FWSM 0x000B6134 /* Reset Source: POR */ +#define IXGBE_GL_MNG_FWSM_RECOVERY_M BIT(1) +#define IXGBE_GL_MNG_FWSM_ROLLBACK_M BIT(2) + /* Flash Access Register */ #define IXGBE_GLNVM_FLA 0x000B6108 /* Reset Source: POR */ #define IXGBE_GLNVM_FLA_LOCKED_S 6 #define IXGBE_GLNVM_FLA_LOCKED_M BIT(6) +/* Auxiliary field, mask and shift definition for Shadow RAM and NVM Flash */ +#define IXGBE_SR_CTRL_WORD_1_M GENMASK(7, 6) +#define IXGBE_SR_CTRL_WORD_VALID BIT(0) +#define IXGBE_SR_CTRL_WORD_OROM_BANK BIT(3) +#define IXGBE_SR_CTRL_WORD_NETLIST_BANK BIT(4) +#define IXGBE_SR_CTRL_WORD_NVM_BANK BIT(5) +#define IXGBE_SR_NVM_PTR_4KB_UNITS BIT(15) + /* Admin Command Interface (ACI) registers */ #define IXGBE_PF_HIDA(_i) (0x00085000 + ((_i) * 4)) #define IXGBE_PF_HIDA_2(_i) (0x00085020 + ((_i) * 4)) @@ -40,6 +116,10 @@ #define IXGBE_PF_HICR_SV BIT(2) #define IXGBE_PF_HICR_EV BIT(3) +#define IXGBE_FW_API_VER_MAJOR 0x01 +#define IXGBE_FW_API_VER_MINOR 0x07 +#define IXGBE_FW_API_VER_DIFF_ALLOWED 0x02 + #define IXGBE_ACI_DESC_SIZE 32 #define IXGBE_ACI_DESC_SIZE_IN_DWORDS (IXGBE_ACI_DESC_SIZE / BYTES_PER_DWORD) @@ -761,6 +841,8 @@ struct ixgbe_aci_cmd_nvm { #define IXGBE_ACI_NVM_MAX_OFFSET 0xFFFFFF __le16 offset_low; u8 offset_high; /* For Write Activate offset_high is used as flags2 */ +#define IXGBE_ACI_NVM_OFFSET_HI_A_MASK GENMASK(15, 8) +#define IXGBE_ACI_NVM_OFFSET_HI_U_MASK GENMASK(23, 16) u8 cmd_flags; #define IXGBE_ACI_NVM_LAST_CMD BIT(0) #define IXGBE_ACI_NVM_PCIR_REQ BIT(0) /* Used by NVM Write reply */ @@ -776,6 +858,9 @@ struct ixgbe_aci_cmd_nvm { #define IXGBE_ACI_NVM_PERST_FLAG 1 #define IXGBE_ACI_NVM_EMPR_FLAG 2 #define IXGBE_ACI_NVM_EMPR_ENA BIT(0) /* Write Activate reply only */ +#define IXGBE_ACI_NVM_NO_PRESERVATION 0x0 +#define IXGBE_ACI_NVM_PRESERVE_SELECTED 0x6 + /* For Write Activate, several flags are sent as part of a separate * flags2 field using a separate byte. For simplicity of the software * interface, we pass the flags as a 16 bit value so these flags are @@ -805,6 +890,63 @@ struct ixgbe_aci_cmd_nvm_checksum { u8 rsvd2[12]; }; +/* Used for NVM Set Package Data command - 0x070A */ +struct ixgbe_aci_cmd_nvm_pkg_data { + u8 reserved[3]; + u8 cmd_flags; +#define IXGBE_ACI_NVM_PKG_DELETE BIT(0) /* used for command call */ + + u32 reserved1; + __le32 addr_high; + __le32 addr_low; +}; + +/* Used for Pass Component Table command - 0x070B */ +struct ixgbe_aci_cmd_nvm_pass_comp_tbl { + u8 component_response; /* Response only */ +#define IXGBE_ACI_NVM_PASS_COMP_CAN_BE_UPDATED 0x0 +#define IXGBE_ACI_NVM_PASS_COMP_CAN_MAY_BE_UPDATEABLE 0x1 +#define IXGBE_ACI_NVM_PASS_COMP_CAN_NOT_BE_UPDATED 0x2 +#define IXGBE_ACI_NVM_PASS_COMP_PARTIAL_CHECK 0x3 + u8 component_response_code; /* Response only */ +#define IXGBE_ACI_NVM_PASS_COMP_CAN_BE_UPDATED_CODE 0x0 +#define IXGBE_ACI_NVM_PASS_COMP_STAMP_IDENTICAL_CODE 0x1 +#define IXGBE_ACI_NVM_PASS_COMP_STAMP_LOWER 0x2 +#define IXGBE_ACI_NVM_PASS_COMP_INVALID_STAMP_CODE 0x3 +#define IXGBE_ACI_NVM_PASS_COMP_CONFLICT_CODE 0x4 +#define IXGBE_ACI_NVM_PASS_COMP_PRE_REQ_NOT_MET_CODE 0x5 +#define IXGBE_ACI_NVM_PASS_COMP_NOT_SUPPORTED_CODE 0x6 +#define IXGBE_ACI_NVM_PASS_COMP_CANNOT_DOWNGRADE_CODE 0x7 +#define IXGBE_ACI_NVM_PASS_COMP_INCOMPLETE_IMAGE_CODE 0x8 +#define IXGBE_ACI_NVM_PASS_COMP_VER_STR_IDENTICAL_CODE 0xA +#define IXGBE_ACI_NVM_PASS_COMP_VER_STR_LOWER_CODE 0xB + u8 reserved; + u8 transfer_flag; + __le32 reserved1; + __le32 addr_high; + __le32 addr_low; +}; + +struct ixgbe_aci_cmd_nvm_comp_tbl { + __le16 comp_class; +#define NVM_COMP_CLASS_ALL_FW 0x000A + + __le16 comp_id; +#define NVM_COMP_ID_OROM 0x5 +#define NVM_COMP_ID_NVM 0x6 +#define NVM_COMP_ID_NETLIST 0x8 + + u8 comp_class_idx; +#define FWU_COMP_CLASS_IDX_NOT_USE 0x0 + + __le32 comp_cmp_stamp; + u8 cvs_type; +#define NVM_CVS_TYPE_ASCII 0x1 + + u8 cvs_len; + u8 cvs[]; /* Component Version String */ +} __packed; + /** * struct ixgbe_aci_desc - Admin Command (AC) descriptor * @flags: IXGBE_ACI_FLAG_* flags @@ -848,6 +990,8 @@ struct ixgbe_aci_desc { struct ixgbe_aci_cmd_sff_eeprom read_write_sff_param; struct ixgbe_aci_cmd_nvm nvm; struct ixgbe_aci_cmd_nvm_checksum nvm_checksum; + struct ixgbe_aci_cmd_nvm_pkg_data pkg_data; + struct ixgbe_aci_cmd_nvm_pass_comp_tbl pass_comp_tbl; } params; }; @@ -984,6 +1128,16 @@ struct ixgbe_hw_caps { #define IXGBE_EXT_TOPO_DEV_IMG_PROG_EN BIT(1) } __packed; +#define IXGBE_OROM_CIV_SIGNATURE "$CIV" + +struct ixgbe_orom_civd_info { + u8 signature[4]; /* Must match ASCII '$CIV' characters */ + u8 checksum; /* Simple modulo 256 sum of all structure bytes must equal 0 */ + __le32 combo_ver; /* Combo Image Version number */ + u8 combo_name_len; /* Length of the unicode combo image version string, max of 32 */ + __le16 combo_name[32]; /* Unicode string representing the Combo Image version */ +}; + /* Function specific capabilities */ struct ixgbe_hw_func_caps { u32 num_allocd_vfs; /* Number of allocated VFs */ @@ -1015,6 +1169,11 @@ struct ixgbe_aci_info { enum ixgbe_aci_err last_status; /* last status of sent admin command */ }; +enum ixgbe_bank_select { + IXGBE_ACTIVE_FLASH_BANK, + IXGBE_INACTIVE_FLASH_BANK, +}; + /* Option ROM version information */ struct ixgbe_orom_info { u8 major; /* Major version of OROM */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c index 1fc821fb351a5..f1ab95aa8c83b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c @@ -894,6 +894,7 @@ static const struct ixgbe_eeprom_operations eeprom_ops_X540 = { .calc_checksum = &ixgbe_calc_eeprom_checksum_X540, .validate_checksum = &ixgbe_validate_eeprom_checksum_X540, .update_checksum = &ixgbe_update_eeprom_checksum_X540, + .read_pba_string = &ixgbe_read_pba_string_generic, }; static const struct ixgbe_phy_operations phy_ops_X540 = { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c index 277ceaf8a7939..1d2acdb64f459 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c @@ -3959,6 +3959,7 @@ static const struct ixgbe_mac_operations mac_ops_x550em_a_fw = { .validate_checksum = &ixgbe_validate_eeprom_checksum_X550, \ .update_checksum = &ixgbe_update_eeprom_checksum_X550, \ .calc_checksum = &ixgbe_calc_eeprom_checksum_X550, \ + .read_pba_string = &ixgbe_read_pba_string_generic, \ static const struct ixgbe_eeprom_operations eeprom_ops_X550 = { X550_COMMON_EEP diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 3e3b471e53f06..ac58964b2f087 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -508,7 +508,7 @@ bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector, int ixgbe_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); struct ixgbe_ring *ring; if (test_bit(__IXGBE_DOWN, &adapter->state)) diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c index 18c922dd5fc64..5841e30dff2a9 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c @@ -18,8 +18,6 @@ #include "octep_vf_config.h" #include "octep_vf_main.h" -struct workqueue_struct *octep_vf_wq; - /* Supported Devices */ static const struct pci_device_id octep_vf_pci_id_tbl[] = { {PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_PCI_DEVICE_ID_CN93_VF)}, diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h index 1a352f41f823c..b9f13506f4620 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h @@ -320,8 +320,6 @@ static inline u16 OCTEP_VF_MINOR_REV(struct octep_vf_device *oct) #define octep_vf_read_csr64(octep_vf_dev, reg_off) \ readq((octep_vf_dev)->mmio.hw_addr + (reg_off)) -extern struct workqueue_struct *octep_vf_wq; - int octep_vf_device_setup(struct octep_vf_device *oct); int octep_vf_setup_iqs(struct octep_vf_device *oct); void octep_vf_free_iqs(struct octep_vf_device *oct); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 60f085b00a8cc..147d7f5c1fcc0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -969,8 +969,6 @@ void rvu_npc_enable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf, bool enable); void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc, int nixlf, u64 chan); -void rvu_npc_enable_bcast_entry(struct rvu *rvu, u16 pcifunc, int nixlf, - bool enable); void rvu_npc_install_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf, u64 chan); void rvu_npc_enable_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf, diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index 821fe242f821f..6296a3cdabbb1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -820,24 +820,6 @@ void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc, rvu_mbox_handler_npc_install_flow(rvu, &req, &rsp); } -void rvu_npc_enable_bcast_entry(struct rvu *rvu, u16 pcifunc, int nixlf, - bool enable) -{ - struct npc_mcam *mcam = &rvu->hw->mcam; - int blkaddr, index; - - blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0); - if (blkaddr < 0) - return; - - /* Get 'pcifunc' of PF device */ - pcifunc = pcifunc & ~RVU_PFVF_FUNC_MASK; - - index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, - NIXLF_BCAST_ENTRY); - npc_enable_mcam_entry(rvu, mcam, blkaddr, index, enable); -} - void rvu_npc_install_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf, u64 chan) { diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c index c3b6e0f60a799..7f6a435ac6806 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c @@ -357,9 +357,12 @@ int cn10k_free_matchall_ipolicer(struct otx2_nic *pfvf) mutex_lock(&pfvf->mbox.lock); /* Remove RQ's policer mapping */ - for (qidx = 0; qidx < hw->rx_queues; qidx++) - cn10k_map_unmap_rq_policer(pfvf, qidx, - hw->matchall_ipolicer, false); + for (qidx = 0; qidx < hw->rx_queues; qidx++) { + rc = cn10k_map_unmap_rq_policer(pfvf, qidx, hw->matchall_ipolicer, false); + if (rc) + dev_warn(pfvf->dev, "Failed to unmap RQ %d's policer (error %d).", + qidx, rc); + } rc = cn10k_free_leaf_profile(pfvf, hw->matchall_ipolicer); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 568bbe5f83f57..d292e6a9e22c3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -154,7 +154,8 @@ mlx5_core-$(CONFIG_MLX5_HW_STEERING) += steering/hws/cmd.o \ steering/hws/vport.o \ steering/hws/bwc_complex.o \ steering/hws/fs_hws_pools.o \ - steering/hws/fs_hws.o + steering/hws/fs_hws.o \ + steering/hws/action_ste_pool.o # # SF device diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index f803e1c935900..5ce1b463b7a8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -707,8 +707,8 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); page = xdpi.page.page; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as we know this is a page_pool page. + /* No need to check page_pool_page_is_pp() as we + * know this is a page_pool page. */ page_pool_recycle_direct(page->pp, page); } while (++n < num); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 1c121b435016d..19664fa7f2171 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -2424,8 +2424,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ptp) } if (priv->rx_ptp_opened) { for (i = 0; i < NUM_PTP_RQ_STATS; i++) - ethtool_sprintf(data, ptp_rq_stats_desc[i].format, - MLX5E_PTP_CHANNEL_IX); + ethtool_puts(data, ptp_rq_stats_desc[i].format); } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 8de6fcbd3a033..def5dea1463db 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -54,7 +54,7 @@ #define MLX5E_DECLARE_PTP_TX_STAT(type, fld) "ptp_tx%d_"#fld, offsetof(type, fld) #define MLX5E_DECLARE_PTP_CH_STAT(type, fld) "ptp_ch_"#fld, offsetof(type, fld) #define MLX5E_DECLARE_PTP_CQ_STAT(type, fld) "ptp_cq%d_"#fld, offsetof(type, fld) -#define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq%d_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq0_"#fld, offsetof(type, fld) #define MLX5E_DECLARE_QOS_TX_STAT(type, fld) "qos_tx%d_"#fld, offsetof(type, fld) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 65a94e46edcf1..cec18efadc733 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -643,13 +643,6 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, int pin = -1; int err = 0; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests to enable time stamping on both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -820,12 +813,6 @@ static int perout_conf_npps_real_time(struct mlx5_core_dev *mdev, struct ptp_clo return 0; } -static bool mlx5_perout_verify_flags(struct mlx5_core_dev *mdev, unsigned int flags) -{ - return ((!mlx5_npps_real_time_supported(mdev) && flags) || - (mlx5_npps_real_time_supported(mdev) && flags & ~PTP_PEROUT_DUTY_CYCLE)); -} - static int mlx5_perout_configure(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on) @@ -861,12 +848,6 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, goto unlock; } - /* Reject requests with unsupported flags */ - if (mlx5_perout_verify_flags(mdev, rq->perout.flags)) { - err = -EOPNOTSUPP; - goto unlock; - } - if (on) { pin_mode = MLX5_PIN_MODE_OUT; pattern = MLX5_OUT_PATTERN_PERIODIC; @@ -1034,6 +1015,13 @@ static void mlx5_init_pin_config(struct mlx5_core_dev *mdev) clock->ptp_info.verify = mlx5_ptp_verify; clock->ptp_info.pps = 1; + clock->ptp_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + + if (mlx5_npps_real_time_supported(mdev)) + clock->ptp_info.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; + for (i = 0; i < clock->ptp_info.n_pins; i++) { snprintf(clock->ptp_info.pin_config[i].name, sizeof(clock->ptp_info.pin_config[i].name), diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 2c5f850c31f68..40024cfa30998 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -148,7 +148,7 @@ int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id, * Free the IRQ and other resources such as rmap from the system. * BUT doesn't free or remove reference from mlx5. * This function is very important for the shutdown flow, where we need to - * cleanup system resoruces but keep mlx5 objects alive, + * cleanup system resources but keep mlx5 objects alive, * see mlx5_irq_table_free_irqs(). */ static void mlx5_system_free_irq(struct mlx5_irq *irq) @@ -588,7 +588,7 @@ static void irq_pool_free(struct mlx5_irq_pool *pool) struct mlx5_irq *irq; unsigned long index; - /* There are cases in which we are destrying the irq_table before + /* There are cases in which we are destroying the irq_table before * freeing all the IRQs, fast teardown for example. Hence, free the irqs * which might not have been freed. */ @@ -617,7 +617,7 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec, if (!mlx5_sf_max_functions(dev)) return 0; if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) { - mlx5_core_dbg(dev, "Not enught IRQs for SFs. SF may run at lower performance\n"); + mlx5_core_dbg(dev, "Not enough IRQs for SFs. SF may run at lower performance\n"); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index b5332c54d4fb0..bef4d25c1a2a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -238,6 +238,7 @@ hws_action_fixup_stc_attr(struct mlx5hws_context *ctx, enum mlx5hws_table_type table_type, bool is_mirror) { + struct mlx5hws_pool *pool; bool use_fixup = false; u32 fw_tbl_type; u32 base_id; @@ -253,13 +254,11 @@ hws_action_fixup_stc_attr(struct mlx5hws_context *ctx, use_fixup = true; break; } + pool = stc_attr->ste_table.ste_pool; if (!is_mirror) - base_id = mlx5hws_pool_chunk_get_base_id(stc_attr->ste_table.ste_pool, - &stc_attr->ste_table.ste); + base_id = mlx5hws_pool_get_base_id(pool); else - base_id = - mlx5hws_pool_chunk_get_base_mirror_id(stc_attr->ste_table.ste_pool, - &stc_attr->ste_table.ste); + base_id = mlx5hws_pool_get_base_mirror_id(pool); *fixup_stc_attr = *stc_attr; fixup_stc_attr->ste_table.ste_obj_id = base_id; @@ -337,7 +336,7 @@ __must_hold(&ctx->ctrl_lock) if (!mlx5hws_context_cap_dynamic_reparse(ctx)) stc_attr->reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; - obj_0_id = mlx5hws_pool_chunk_get_base_id(stc_pool, stc); + obj_0_id = mlx5hws_pool_get_base_id(stc_pool); /* According to table/action limitation change the stc_attr */ use_fixup = hws_action_fixup_stc_attr(ctx, stc_attr, &fixup_stc_attr, table_type, false); @@ -353,7 +352,7 @@ __must_hold(&ctx->ctrl_lock) if (table_type == MLX5HWS_TABLE_TYPE_FDB) { u32 obj_1_id; - obj_1_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, stc); + obj_1_id = mlx5hws_pool_get_base_mirror_id(stc_pool); use_fixup = hws_action_fixup_stc_attr(ctx, stc_attr, &fixup_stc_attr, @@ -393,11 +392,11 @@ __must_hold(&ctx->ctrl_lock) stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_DROP; stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; stc_attr.stc_offset = stc->offset; - obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, stc); + obj_id = mlx5hws_pool_get_base_id(stc_pool); mlx5hws_cmd_stc_modify(ctx->mdev, obj_id, &stc_attr); if (table_type == MLX5HWS_TABLE_TYPE_FDB) { - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, stc); + obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool); mlx5hws_cmd_stc_modify(ctx->mdev, obj_id, &stc_attr); } @@ -1575,17 +1574,15 @@ hws_action_create_dest_match_range_definer(struct mlx5hws_context *ctx) return definer; } -static struct mlx5hws_matcher_action_ste * +static struct mlx5hws_range_action_table * hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, struct mlx5hws_definer *definer, u32 miss_ft_id) { struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0}; - struct mlx5hws_action_default_stc *default_stc; - struct mlx5hws_matcher_action_ste *table_ste; + struct mlx5hws_range_action_table *table_ste; struct mlx5hws_pool_attr pool_attr = {0}; struct mlx5hws_pool *ste_pool, *stc_pool; - struct mlx5hws_pool_chunk *ste; u32 *rtc_0_id, *rtc_1_id; u32 obj_id; int ret; @@ -1604,7 +1601,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB; pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL; pool_attr.alloc_log_sz = 1; table_ste->pool = mlx5hws_pool_create(ctx, &pool_attr); if (!table_ste->pool) { @@ -1616,8 +1612,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, rtc_0_id = &table_ste->rtc_0_id; rtc_1_id = &table_ste->rtc_1_id; ste_pool = table_ste->pool; - ste = &table_ste->ste; - ste->order = 1; rtc_attr.log_size = 0; rtc_attr.log_depth = 0; @@ -1629,18 +1623,16 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, rtc_attr.fw_gen_wqe = true; rtc_attr.is_scnd_range = true; - obj_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); + obj_id = mlx5hws_pool_get_base_id(ste_pool); rtc_attr.pd = ctx->pd_num; rtc_attr.ste_base = obj_id; - rtc_attr.ste_offset = ste->offset; rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx); rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, false); /* STC is a single resource (obj_id), use any STC for the ID */ stc_pool = ctx->stc_pool; - default_stc = ctx->common_res.default_stc; - obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_id(stc_pool); rtc_attr.stc_base = obj_id; ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id); @@ -1650,11 +1642,11 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, } /* Create mirror RTC */ - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); + obj_id = mlx5hws_pool_get_base_mirror_id(ste_pool); rtc_attr.ste_base = obj_id; rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, true); - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool); rtc_attr.stc_base = obj_id; ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_1_id); @@ -1677,9 +1669,9 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, return NULL; } -static void -hws_action_destroy_dest_match_range_table(struct mlx5hws_context *ctx, - struct mlx5hws_matcher_action_ste *table_ste) +static void hws_action_destroy_dest_match_range_table( + struct mlx5hws_context *ctx, + struct mlx5hws_range_action_table *table_ste) { mutex_lock(&ctx->ctrl_lock); @@ -1691,12 +1683,11 @@ hws_action_destroy_dest_match_range_table(struct mlx5hws_context *ctx, mutex_unlock(&ctx->ctrl_lock); } -static int -hws_action_create_dest_match_range_fill_table(struct mlx5hws_context *ctx, - struct mlx5hws_matcher_action_ste *table_ste, - struct mlx5hws_action *hit_ft_action, - struct mlx5hws_definer *range_definer, - u32 min, u32 max) +static int hws_action_create_dest_match_range_fill_table( + struct mlx5hws_context *ctx, + struct mlx5hws_range_action_table *table_ste, + struct mlx5hws_action *hit_ft_action, + struct mlx5hws_definer *range_definer, u32 min, u32 max) { struct mlx5hws_wqe_gta_data_seg_ste match_wqe_data = {0}; struct mlx5hws_wqe_gta_data_seg_ste range_wqe_data = {0}; @@ -1792,7 +1783,7 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx, u32 min, u32 max, u32 flags) { struct mlx5hws_cmd_stc_modify_attr stc_attr = {0}; - struct mlx5hws_matcher_action_ste *table_ste; + struct mlx5hws_range_action_table *table_ste; struct mlx5hws_action *hit_ft_action; struct mlx5hws_definer *definer; struct mlx5hws_action *action; @@ -1837,7 +1828,6 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx, stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE; stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; - stc_attr.ste_table.ste = table_ste->ste; stc_attr.ste_table.ste_pool = table_ste->pool; stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h index 64b76075f7f8f..25fa0d4c92216 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h @@ -118,6 +118,12 @@ struct mlx5hws_action_template { u8 only_term; }; +struct mlx5hws_range_action_table { + struct mlx5hws_pool *pool; + u32 rtc_0_id; + u32 rtc_1_id; +}; + struct mlx5hws_action { u8 type; u8 flags; @@ -186,7 +192,7 @@ struct mlx5hws_action { size_t size; } remove_header; struct { - struct mlx5hws_matcher_action_ste *table_ste; + struct mlx5hws_range_action_table *table_ste; struct mlx5hws_action *hit_ft_action; struct mlx5hws_definer *definer; } range; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c new file mode 100644 index 0000000000000..5766a9c82f966 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */ + +#include "internal.h" + +static const char * +hws_pool_opt_to_str(enum mlx5hws_pool_optimize opt) +{ + switch (opt) { + case MLX5HWS_POOL_OPTIMIZE_NONE: + return "rx-and-tx"; + case MLX5HWS_POOL_OPTIMIZE_ORIG: + return "rx-only"; + case MLX5HWS_POOL_OPTIMIZE_MIRROR: + return "tx-only"; + default: + return "unknown"; + } +} + +static int +hws_action_ste_table_create_pool(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl, + enum mlx5hws_pool_optimize opt, size_t log_sz) +{ + struct mlx5hws_pool_attr pool_attr = { 0 }; + + pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; + pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB; + pool_attr.flags = MLX5HWS_POOL_FLAG_BUDDY; + pool_attr.opt_type = opt; + pool_attr.alloc_log_sz = log_sz; + + action_tbl->pool = mlx5hws_pool_create(ctx, &pool_attr); + if (!action_tbl->pool) { + mlx5hws_err(ctx, "Failed to allocate STE pool\n"); + return -EINVAL; + } + + return 0; +} + +static int hws_action_ste_table_create_single_rtc( + struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl, + enum mlx5hws_pool_optimize opt, size_t log_sz, bool tx) +{ + struct mlx5hws_cmd_rtc_create_attr rtc_attr = { 0 }; + u32 *rtc_id; + + rtc_attr.log_depth = 0; + rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; + /* Action STEs use the default always hit definer. */ + rtc_attr.match_definer_0 = ctx->caps->trivial_match_definer; + rtc_attr.is_frst_jumbo = false; + rtc_attr.miss_ft_id = 0; + rtc_attr.pd = ctx->pd_num; + rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx); + + if (tx) { + rtc_attr.table_type = FS_FT_FDB_TX; + rtc_attr.ste_base = + mlx5hws_pool_get_base_mirror_id(action_tbl->pool); + rtc_attr.stc_base = + mlx5hws_pool_get_base_mirror_id(ctx->stc_pool); + rtc_attr.log_size = + opt == MLX5HWS_POOL_OPTIMIZE_ORIG ? 0 : log_sz; + rtc_id = &action_tbl->rtc_1_id; + } else { + rtc_attr.table_type = FS_FT_FDB_RX; + rtc_attr.ste_base = mlx5hws_pool_get_base_id(action_tbl->pool); + rtc_attr.stc_base = mlx5hws_pool_get_base_id(ctx->stc_pool); + rtc_attr.log_size = + opt == MLX5HWS_POOL_OPTIMIZE_MIRROR ? 0 : log_sz; + rtc_id = &action_tbl->rtc_0_id; + } + + return mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_id); +} + +static int +hws_action_ste_table_create_rtcs(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl, + enum mlx5hws_pool_optimize opt, size_t log_sz) +{ + int err; + + err = hws_action_ste_table_create_single_rtc(ctx, action_tbl, opt, + log_sz, false); + if (err) + return err; + + err = hws_action_ste_table_create_single_rtc(ctx, action_tbl, opt, + log_sz, true); + if (err) { + mlx5hws_cmd_rtc_destroy(ctx->mdev, action_tbl->rtc_0_id); + return err; + } + + return 0; +} + +static void +hws_action_ste_table_destroy_rtcs(struct mlx5hws_action_ste_table *action_tbl) +{ + mlx5hws_cmd_rtc_destroy(action_tbl->pool->ctx->mdev, + action_tbl->rtc_1_id); + mlx5hws_cmd_rtc_destroy(action_tbl->pool->ctx->mdev, + action_tbl->rtc_0_id); +} + +static int +hws_action_ste_table_create_stc(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl) +{ + struct mlx5hws_cmd_stc_modify_attr stc_attr = { 0 }; + + stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; + stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE; + stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; + stc_attr.ste_table.ste_pool = action_tbl->pool; + stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; + + return mlx5hws_action_alloc_single_stc(ctx, &stc_attr, + MLX5HWS_TABLE_TYPE_FDB, + &action_tbl->stc); +} + +static struct mlx5hws_action_ste_table * +hws_action_ste_table_alloc(struct mlx5hws_action_ste_pool_element *parent_elem) +{ + enum mlx5hws_pool_optimize opt = parent_elem->opt; + struct mlx5hws_context *ctx = parent_elem->ctx; + struct mlx5hws_action_ste_table *action_tbl; + size_t log_sz; + int err; + + log_sz = min(parent_elem->log_sz ? + parent_elem->log_sz + + MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ : + MLX5HWS_ACTION_STE_TABLE_INIT_LOG_SZ, + MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ); + + action_tbl = kzalloc(sizeof(*action_tbl), GFP_KERNEL); + if (!action_tbl) + return ERR_PTR(-ENOMEM); + + err = hws_action_ste_table_create_pool(ctx, action_tbl, opt, log_sz); + if (err) + goto free_tbl; + + err = hws_action_ste_table_create_rtcs(ctx, action_tbl, opt, log_sz); + if (err) + goto destroy_pool; + + err = hws_action_ste_table_create_stc(ctx, action_tbl); + if (err) + goto destroy_rtcs; + + action_tbl->parent_elem = parent_elem; + INIT_LIST_HEAD(&action_tbl->list_node); + action_tbl->last_used = jiffies; + list_add(&action_tbl->list_node, &parent_elem->available); + parent_elem->log_sz = log_sz; + + mlx5hws_dbg(ctx, + "Allocated %s action STE table log_sz %zu; STEs (%d, %d); RTCs (%d, %d); STC %d\n", + hws_pool_opt_to_str(opt), log_sz, + mlx5hws_pool_get_base_id(action_tbl->pool), + mlx5hws_pool_get_base_mirror_id(action_tbl->pool), + action_tbl->rtc_0_id, action_tbl->rtc_1_id, + action_tbl->stc.offset); + + return action_tbl; + +destroy_rtcs: + hws_action_ste_table_destroy_rtcs(action_tbl); +destroy_pool: + mlx5hws_pool_destroy(action_tbl->pool); +free_tbl: + kfree(action_tbl); + + return ERR_PTR(err); +} + +static void +hws_action_ste_table_destroy(struct mlx5hws_action_ste_table *action_tbl) +{ + struct mlx5hws_context *ctx = action_tbl->parent_elem->ctx; + + mlx5hws_dbg(ctx, + "Destroying %s action STE table: STEs (%d, %d); RTCs (%d, %d); STC %d\n", + hws_pool_opt_to_str(action_tbl->parent_elem->opt), + mlx5hws_pool_get_base_id(action_tbl->pool), + mlx5hws_pool_get_base_mirror_id(action_tbl->pool), + action_tbl->rtc_0_id, action_tbl->rtc_1_id, + action_tbl->stc.offset); + + mlx5hws_action_free_single_stc(ctx, MLX5HWS_TABLE_TYPE_FDB, + &action_tbl->stc); + hws_action_ste_table_destroy_rtcs(action_tbl); + mlx5hws_pool_destroy(action_tbl->pool); + + list_del(&action_tbl->list_node); + kfree(action_tbl); +} + +static int +hws_action_ste_pool_element_init(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_pool_element *elem, + enum mlx5hws_pool_optimize opt) +{ + elem->ctx = ctx; + elem->opt = opt; + INIT_LIST_HEAD(&elem->available); + INIT_LIST_HEAD(&elem->full); + + return 0; +} + +static void hws_action_ste_pool_element_destroy( + struct mlx5hws_action_ste_pool_element *elem) +{ + struct mlx5hws_action_ste_table *action_tbl, *p; + + /* This should be empty, but attempt to free its elements anyway. */ + list_for_each_entry_safe(action_tbl, p, &elem->full, list_node) + hws_action_ste_table_destroy(action_tbl); + + list_for_each_entry_safe(action_tbl, p, &elem->available, list_node) + hws_action_ste_table_destroy(action_tbl); +} + +static int hws_action_ste_pool_init(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_pool *pool) +{ + enum mlx5hws_pool_optimize opt; + int err; + + mutex_init(&pool->lock); + + /* Rules which are added for both RX and TX must use the same action STE + * indices for both. If we were to use a single table, then RX-only and + * TX-only rules would waste the unused entries. Thus, we use separate + * table sets for the three cases. + */ + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; opt < MLX5HWS_POOL_OPTIMIZE_MAX; + opt++) { + err = hws_action_ste_pool_element_init(ctx, &pool->elems[opt], + opt); + if (err) + goto destroy_elems; + pool->elems[opt].parent_pool = pool; + } + + return 0; + +destroy_elems: + while (opt-- > MLX5HWS_POOL_OPTIMIZE_NONE) + hws_action_ste_pool_element_destroy(&pool->elems[opt]); + + return err; +} + +static void hws_action_ste_pool_destroy(struct mlx5hws_action_ste_pool *pool) +{ + int opt; + + for (opt = MLX5HWS_POOL_OPTIMIZE_MAX - 1; + opt >= MLX5HWS_POOL_OPTIMIZE_NONE; opt--) + hws_action_ste_pool_element_destroy(&pool->elems[opt]); +} + +static void hws_action_ste_pool_element_collect_stale( + struct mlx5hws_action_ste_pool_element *elem, struct list_head *cleanup) +{ + struct mlx5hws_action_ste_table *action_tbl, *p; + unsigned long expire_time, now; + + expire_time = secs_to_jiffies(MLX5HWS_ACTION_STE_POOL_EXPIRE_SECONDS); + now = jiffies; + + list_for_each_entry_safe(action_tbl, p, &elem->available, list_node) { + if (mlx5hws_pool_full(action_tbl->pool) && + time_before(action_tbl->last_used + expire_time, now)) + list_move(&action_tbl->list_node, cleanup); + } +} + +static void hws_action_ste_table_cleanup_list(struct list_head *cleanup) +{ + struct mlx5hws_action_ste_table *action_tbl, *p; + + list_for_each_entry_safe(action_tbl, p, cleanup, list_node) + hws_action_ste_table_destroy(action_tbl); +} + +static void hws_action_ste_pool_cleanup(struct work_struct *work) +{ + enum mlx5hws_pool_optimize opt; + struct mlx5hws_context *ctx; + LIST_HEAD(cleanup); + int i; + + ctx = container_of(work, struct mlx5hws_context, + action_ste_cleanup.work); + + for (i = 0; i < ctx->queues; i++) { + struct mlx5hws_action_ste_pool *p = &ctx->action_ste_pool[i]; + + mutex_lock(&p->lock); + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; + opt < MLX5HWS_POOL_OPTIMIZE_MAX; opt++) + hws_action_ste_pool_element_collect_stale( + &p->elems[opt], &cleanup); + mutex_unlock(&p->lock); + } + + hws_action_ste_table_cleanup_list(&cleanup); + + schedule_delayed_work(&ctx->action_ste_cleanup, + secs_to_jiffies( + MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS)); +} + +int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx) +{ + struct mlx5hws_action_ste_pool *pool; + size_t queues = ctx->queues; + int i, err; + + pool = kcalloc(queues, sizeof(*pool), GFP_KERNEL); + if (!pool) + return -ENOMEM; + + for (i = 0; i < queues; i++) { + err = hws_action_ste_pool_init(ctx, &pool[i]); + if (err) + goto free_pool; + } + + ctx->action_ste_pool = pool; + + INIT_DELAYED_WORK(&ctx->action_ste_cleanup, + hws_action_ste_pool_cleanup); + schedule_delayed_work( + &ctx->action_ste_cleanup, + secs_to_jiffies(MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS)); + + return 0; + +free_pool: + while (i--) + hws_action_ste_pool_destroy(&pool[i]); + kfree(pool); + + return err; +} + +void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx) +{ + size_t queues = ctx->queues; + int i; + + cancel_delayed_work_sync(&ctx->action_ste_cleanup); + + for (i = 0; i < queues; i++) + hws_action_ste_pool_destroy(&ctx->action_ste_pool[i]); + + kfree(ctx->action_ste_pool); +} + +static struct mlx5hws_action_ste_pool_element * +hws_action_ste_choose_elem(struct mlx5hws_action_ste_pool *pool, + bool skip_rx, bool skip_tx) +{ + if (skip_rx) + return &pool->elems[MLX5HWS_POOL_OPTIMIZE_MIRROR]; + + if (skip_tx) + return &pool->elems[MLX5HWS_POOL_OPTIMIZE_ORIG]; + + return &pool->elems[MLX5HWS_POOL_OPTIMIZE_NONE]; +} + +static int +hws_action_ste_table_chunk_alloc(struct mlx5hws_action_ste_table *action_tbl, + struct mlx5hws_action_ste_chunk *chunk) +{ + int err; + + err = mlx5hws_pool_chunk_alloc(action_tbl->pool, &chunk->ste); + if (err) + return err; + + chunk->action_tbl = action_tbl; + action_tbl->last_used = jiffies; + + return 0; +} + +int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool, + bool skip_rx, bool skip_tx, + struct mlx5hws_action_ste_chunk *chunk) +{ + struct mlx5hws_action_ste_pool_element *elem; + struct mlx5hws_action_ste_table *action_tbl; + bool found; + int err; + + if (skip_rx && skip_tx) + return -EINVAL; + + mutex_lock(&pool->lock); + + elem = hws_action_ste_choose_elem(pool, skip_rx, skip_tx); + + mlx5hws_dbg(elem->ctx, + "Allocating action STEs skip_rx %d skip_tx %d order %d\n", + skip_rx, skip_tx, chunk->ste.order); + + found = false; + list_for_each_entry(action_tbl, &elem->available, list_node) { + if (!hws_action_ste_table_chunk_alloc(action_tbl, chunk)) { + found = true; + break; + } + } + + if (!found) { + action_tbl = hws_action_ste_table_alloc(elem); + if (IS_ERR(action_tbl)) { + err = PTR_ERR(action_tbl); + goto out; + } + + err = hws_action_ste_table_chunk_alloc(action_tbl, chunk); + if (err) + goto out; + } + + if (mlx5hws_pool_empty(action_tbl->pool)) + list_move(&action_tbl->list_node, &elem->full); + + err = 0; + +out: + mutex_unlock(&pool->lock); + + return err; +} + +void mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk) +{ + struct mutex *lock = &chunk->action_tbl->parent_elem->parent_pool->lock; + + mlx5hws_dbg(chunk->action_tbl->pool->ctx, + "Freeing action STEs offset %d order %d\n", + chunk->ste.offset, chunk->ste.order); + + mutex_lock(lock); + mlx5hws_pool_chunk_free(chunk->action_tbl->pool, &chunk->ste); + chunk->action_tbl->last_used = jiffies; + list_move(&chunk->action_tbl->list_node, + &chunk->action_tbl->parent_elem->available); + mutex_unlock(lock); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h new file mode 100644 index 0000000000000..a8ba97359e314 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */ + +#ifndef ACTION_STE_POOL_H_ +#define ACTION_STE_POOL_H_ + +#define MLX5HWS_ACTION_STE_TABLE_INIT_LOG_SZ 10 +#define MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ 1 +#define MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ 20 + +#define MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS 300 +#define MLX5HWS_ACTION_STE_POOL_EXPIRE_SECONDS 300 + +struct mlx5hws_action_ste_pool_element; + +struct mlx5hws_action_ste_table { + struct mlx5hws_action_ste_pool_element *parent_elem; + /* Wraps the RTC and STE range for this given action. */ + struct mlx5hws_pool *pool; + /* Match STEs use this STC to jump to this pool's RTC. */ + struct mlx5hws_pool_chunk stc; + u32 rtc_0_id; + u32 rtc_1_id; + struct list_head list_node; + unsigned long last_used; +}; + +struct mlx5hws_action_ste_pool_element { + struct mlx5hws_context *ctx; + struct mlx5hws_action_ste_pool *parent_pool; + size_t log_sz; /* Size of the largest table so far. */ + enum mlx5hws_pool_optimize opt; + struct list_head available; + struct list_head full; +}; + +/* Central repository of action STEs. The context contains one of these pools + * per queue. + */ +struct mlx5hws_action_ste_pool { + /* Protects the entire pool. We have one pool per queue and only one + * operation can be active per rule at a given time. Thus this lock + * protects solely against concurrent garbage collection and we expect + * very little contention. + */ + struct mutex lock; + struct mlx5hws_action_ste_pool_element elems[MLX5HWS_POOL_OPTIMIZE_MAX]; +}; + +/* A chunk of STEs and the table it was allocated from. Used by rules. */ +struct mlx5hws_action_ste_chunk { + struct mlx5hws_action_ste_table *action_tbl; + struct mlx5hws_pool_chunk ste; +}; + +int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx); + +void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx); + +/* Callers are expected to fill chunk->ste.order. On success, this function + * populates chunk->tbl and chunk->ste.offset. + */ +int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool, + bool skip_rx, bool skip_tx, + struct mlx5hws_action_ste_chunk *chunk); + +void mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk); + +#endif /* ACTION_STE_POOL_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 19dce1ba512d4..510bfbbe59918 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -90,13 +90,19 @@ int mlx5hws_bwc_matcher_create_simple(struct mlx5hws_bwc_matcher *bwc_matcher, bwc_matcher->priority = priority; bwc_matcher->size_log = MLX5HWS_BWC_MATCHER_INIT_SIZE_LOG; + bwc_matcher->size_of_at_array = MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM; + bwc_matcher->at = kcalloc(bwc_matcher->size_of_at_array, + sizeof(*bwc_matcher->at), GFP_KERNEL); + if (!bwc_matcher->at) + goto free_bwc_matcher_rules; + /* create dummy action template */ bwc_matcher->at[0] = mlx5hws_action_template_create(action_types ? action_types : init_action_types); if (!bwc_matcher->at[0]) { mlx5hws_err(table->ctx, "BWC matcher: failed creating action template\n"); - goto free_bwc_matcher_rules; + goto free_bwc_matcher_at_array; } bwc_matcher->num_of_at = 1; @@ -126,6 +132,8 @@ int mlx5hws_bwc_matcher_create_simple(struct mlx5hws_bwc_matcher *bwc_matcher, mlx5hws_match_template_destroy(bwc_matcher->mt); free_at: mlx5hws_action_template_destroy(bwc_matcher->at[0]); +free_bwc_matcher_at_array: + kfree(bwc_matcher->at); free_bwc_matcher_rules: kfree(bwc_matcher->rules); err: @@ -192,6 +200,7 @@ int mlx5hws_bwc_matcher_destroy_simple(struct mlx5hws_bwc_matcher *bwc_matcher) for (i = 0; i < bwc_matcher->num_of_at; i++) mlx5hws_action_template_destroy(bwc_matcher->at[i]); + kfree(bwc_matcher->at); mlx5hws_match_template_destroy(bwc_matcher->mt); kfree(bwc_matcher->rules); @@ -469,21 +478,9 @@ hws_bwc_matcher_size_maxed_out(struct mlx5hws_bwc_matcher *bwc_matcher) struct mlx5hws_cmd_query_caps *caps = bwc_matcher->matcher->tbl->ctx->caps; /* check the match RTC size */ - if ((bwc_matcher->size_log + - MLX5HWS_MATCHER_ASSURED_MAIN_TBL_DEPTH + - MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP) > - (caps->ste_alloc_log_max - 1)) - return true; - - /* check the action RTC size */ - if ((bwc_matcher->size_log + - MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP + - ilog2(roundup_pow_of_two(bwc_matcher->matcher->action_ste.max_stes)) + - MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT) > - (caps->ste_alloc_log_max - 1)) - return true; - - return false; + return (bwc_matcher->size_log + MLX5HWS_MATCHER_ASSURED_MAIN_TBL_DEPTH + + MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP) > + (caps->ste_alloc_log_max - 1); } static bool @@ -520,6 +517,23 @@ hws_bwc_matcher_extend_at(struct mlx5hws_bwc_matcher *bwc_matcher, struct mlx5hws_rule_action rule_actions[]) { enum mlx5hws_action_type action_types[MLX5HWS_BWC_MAX_ACTS]; + void *p; + + if (unlikely(bwc_matcher->num_of_at >= bwc_matcher->size_of_at_array)) { + if (bwc_matcher->size_of_at_array >= MLX5HWS_MATCHER_MAX_AT) + return -ENOMEM; + bwc_matcher->size_of_at_array *= 2; + p = krealloc(bwc_matcher->at, + bwc_matcher->size_of_at_array * + sizeof(*bwc_matcher->at), + __GFP_ZERO | GFP_KERNEL); + if (!p) { + bwc_matcher->size_of_at_array /= 2; + return -ENOMEM; + } + + bwc_matcher->at = p; + } hws_bwc_rule_actions_to_action_types(rule_actions, action_types); @@ -753,19 +767,6 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher) return hws_bwc_matcher_move(bwc_matcher); } -static int -hws_bwc_matcher_rehash_at(struct mlx5hws_bwc_matcher *bwc_matcher) -{ - /* Rehash by action template doesn't require any additional checking. - * The bwc_matcher already contains the new action template. - * Just do the usual rehash: - * - create new matcher - * - move all the rules to the new matcher - * - destroy the old matcher - */ - return hws_bwc_matcher_move(bwc_matcher); -} - int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, u32 *match_param, struct mlx5hws_rule_action rule_actions[], @@ -805,23 +806,8 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher, bwc_matcher->at[at_idx]); if (unlikely(ret)) { - /* Action template attach failed, possibly due to - * requiring more action STEs. - * Need to attempt creating new matcher with all - * the action templates, including the new one. - */ - ret = hws_bwc_matcher_rehash_at(bwc_matcher); - if (unlikely(ret)) { - mlx5hws_action_template_destroy(bwc_matcher->at[at_idx]); - bwc_matcher->at[at_idx] = NULL; - bwc_matcher->num_of_at--; - - hws_bwc_unlock_all_queues(ctx); - - mlx5hws_err(ctx, - "BWC rule insertion: rehash AT failed (%d)\n", ret); - return ret; - } + hws_bwc_unlock_all_queues(ctx); + return ret; } hws_bwc_unlock_all_queues(ctx); @@ -975,24 +961,8 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule, ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher, bwc_matcher->at[at_idx]); if (unlikely(ret)) { - /* Action template attach failed, possibly due to - * requiring more action STEs. - * Need to attempt creating new matcher with all - * the action templates, including the new one. - */ - ret = hws_bwc_matcher_rehash_at(bwc_matcher); - if (unlikely(ret)) { - mlx5hws_action_template_destroy(bwc_matcher->at[at_idx]); - bwc_matcher->at[at_idx] = NULL; - bwc_matcher->num_of_at--; - - hws_bwc_unlock_all_queues(ctx); - - mlx5hws_err(ctx, - "BWC rule update: rehash AT failed (%d)\n", - ret); - return ret; - } + hws_bwc_unlock_all_queues(ctx); + return ret; } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h index 47f7ed1415535..bb0cf4b922ceb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h @@ -10,9 +10,7 @@ #define MLX5HWS_BWC_MATCHER_REHASH_BURST_TH 32 /* Max number of AT attach operations for the same matcher. - * When the limit is reached, next attempt to attach new AT - * will result in creation of a new matcher and moving all - * the rules to this matcher. + * When the limit is reached, a larger buffer is allocated for the ATs. */ #define MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM 8 @@ -23,10 +21,11 @@ struct mlx5hws_bwc_matcher { struct mlx5hws_matcher *matcher; struct mlx5hws_match_template *mt; - struct mlx5hws_action_template *at[MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM]; - u32 priority; + struct mlx5hws_action_template **at; u8 num_of_at; + u8 size_of_at_array; u8 size_log; + u32 priority; atomic_t num_of_rules; struct list_head *rules; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index e8f98c109b999..9c83753e45924 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -406,7 +406,6 @@ int mlx5hws_cmd_rtc_create(struct mlx5_core_dev *mdev, MLX5_SET(rtc, attr, match_definer_1, rtc_attr->match_definer_1); MLX5_SET(rtc, attr, stc_id, rtc_attr->stc_base); MLX5_SET(rtc, attr, ste_table_base_id, rtc_attr->ste_base); - MLX5_SET(rtc, attr, ste_table_offset, rtc_attr->ste_offset); MLX5_SET(rtc, attr, miss_flow_table_id, rtc_attr->miss_ft_id); MLX5_SET(rtc, attr, reparse_mode, rtc_attr->reparse_mode); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h index 51d9e0291ac16..fa6bff210266c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h @@ -70,7 +70,6 @@ struct mlx5hws_cmd_rtc_create_attr { u32 pd; u32 stc_base; u32 ste_base; - u32 ste_offset; u32 miss_ft_id; bool fw_gen_wqe; u8 update_index_mode; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c index 9cda2774fd64a..428dae8697062 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c @@ -34,7 +34,6 @@ static int hws_context_pools_init(struct mlx5hws_context *ctx) /* Create an STC pool per FT type */ pool_attr.pool_type = MLX5HWS_POOL_TYPE_STC; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STC_POOL; max_log_sz = min(MLX5HWS_POOL_STC_LOG_SZ, ctx->caps->stc_alloc_log_max); pool_attr.alloc_log_sz = max(max_log_sz, ctx->caps->stc_alloc_log_gran); @@ -159,10 +158,16 @@ static int hws_context_init_hws(struct mlx5hws_context *ctx, if (ret) goto pools_uninit; + ret = mlx5hws_action_ste_pool_init(ctx); + if (ret) + goto close_queues; + INIT_LIST_HEAD(&ctx->tbl_list); return 0; +close_queues: + mlx5hws_send_queues_close(ctx); pools_uninit: hws_context_pools_uninit(ctx); uninit_pd: @@ -175,6 +180,7 @@ static void hws_context_uninit_hws(struct mlx5hws_context *ctx) if (!(ctx->flags & MLX5HWS_CONTEXT_FLAG_HWS_SUPPORT)) return; + mlx5hws_action_ste_pool_uninit(ctx); mlx5hws_send_queues_close(ctx); hws_context_pools_uninit(ctx); hws_context_uninit_pd(ctx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h index 38c3647444adc..3f8938c73dc01 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h @@ -39,6 +39,8 @@ struct mlx5hws_context { struct mlx5hws_cmd_query_caps *caps; u32 pd_num; struct mlx5hws_pool *stc_pool; + struct mlx5hws_action_ste_pool *action_ste_pool; /* One per queue */ + struct delayed_work action_ste_cleanup; struct mlx5hws_context_common_res common_res; struct mlx5hws_pattern_cache *pattern_cache; struct mlx5hws_definer_cache *definer_cache; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c index 696275fd0ce22..91568d6c1dac9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c @@ -118,7 +118,6 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma { enum mlx5hws_table_type tbl_type = matcher->tbl->type; struct mlx5hws_cmd_ft_query_attr ft_attr = {0}; - struct mlx5hws_pool_chunk *ste; struct mlx5hws_pool *ste_pool; u64 icm_addr_0 = 0; u64 icm_addr_1 = 0; @@ -134,12 +133,11 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma matcher->end_ft_id, matcher->col_matcher ? HWS_PTR_TO_ID(matcher->col_matcher) : 0); - ste = &matcher->match_ste.ste; ste_pool = matcher->match_ste.pool; if (ste_pool) { - ste_0_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); + ste_0_id = mlx5hws_pool_get_base_id(ste_pool); if (tbl_type == MLX5HWS_TABLE_TYPE_FDB) - ste_1_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); + ste_1_id = mlx5hws_pool_get_base_mirror_id(ste_pool); } seq_printf(f, ",%d,%d,%d,%d", @@ -148,19 +146,6 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma matcher->match_ste.rtc_1_id, (int)ste_1_id); - ste = &matcher->action_ste.ste; - ste_pool = matcher->action_ste.pool; - if (ste_pool) { - ste_0_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); - if (tbl_type == MLX5HWS_TABLE_TYPE_FDB) - ste_1_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); - else - ste_1_id = -1; - } else { - ste_0_id = -1; - ste_1_id = -1; - } - ft_attr.type = matcher->tbl->fw_ft_type; ret = mlx5hws_cmd_flow_table_query(matcher->tbl->ctx->mdev, matcher->end_ft_id, @@ -170,10 +155,7 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma if (ret) return ret; - seq_printf(f, ",%d,%d,%d,%d,%d,0x%llx,0x%llx\n", - matcher->action_ste.rtc_0_id, (int)ste_0_id, - matcher->action_ste.rtc_1_id, (int)ste_1_id, - 0, + seq_printf(f, ",-1,-1,-1,-1,0,0x%llx,0x%llx\n", mlx5hws_debug_icm_to_idx(icm_addr_0), mlx5hws_debug_icm_to_idx(icm_addr_1)); @@ -387,14 +369,17 @@ static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context if (!stc_pool) return 0; - if (stc_pool->resource[0]) { - ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->resource[0]); + if (stc_pool->resource) { + ret = hws_debug_dump_context_stc_resource(f, ctx, + stc_pool->resource); if (ret) return ret; } - if (stc_pool->mirror_resource[0]) { - ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->mirror_resource[0]); + if (stc_pool->mirror_resource) { + struct mlx5hws_pool_resource *res = stc_pool->mirror_resource; + + ret = hws_debug_dump_context_stc_resource(f, ctx, res); if (ret) return ret; } @@ -402,10 +387,41 @@ static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context return 0; } +static void +hws_debug_dump_action_ste_table(struct seq_file *f, + struct mlx5hws_action_ste_table *action_tbl) +{ + int ste_0_id = mlx5hws_pool_get_base_id(action_tbl->pool); + int ste_1_id = mlx5hws_pool_get_base_mirror_id(action_tbl->pool); + + seq_printf(f, "%d,0x%llx,%d,%d,%d,%d\n", + MLX5HWS_DEBUG_RES_TYPE_ACTION_STE_TABLE, + HWS_PTR_TO_ID(action_tbl), + action_tbl->rtc_0_id, ste_0_id, + action_tbl->rtc_1_id, ste_1_id); +} + +static void hws_debug_dump_action_ste_pool(struct seq_file *f, + struct mlx5hws_action_ste_pool *pool) +{ + struct mlx5hws_action_ste_table *action_tbl; + enum mlx5hws_pool_optimize opt; + + mutex_lock(&pool->lock); + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; opt < MLX5HWS_POOL_OPTIMIZE_MAX; + opt++) { + list_for_each_entry(action_tbl, &pool->elems[opt].available, + list_node) { + hws_debug_dump_action_ste_table(f, action_tbl); + } + } + mutex_unlock(&pool->lock); +} + static int hws_debug_dump_context(struct seq_file *f, struct mlx5hws_context *ctx) { struct mlx5hws_table *tbl; - int ret; + int ret, i; ret = hws_debug_dump_context_info(f, ctx); if (ret) @@ -425,6 +441,9 @@ static int hws_debug_dump_context(struct seq_file *f, struct mlx5hws_context *ct return ret; } + for (i = 0; i < ctx->queues; i++) + hws_debug_dump_action_ste_pool(f, &ctx->action_ste_pool[i]); + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h index e44e7ae28f93e..89c396f9f2660 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h @@ -26,6 +26,8 @@ enum mlx5hws_debug_res_type { MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_HASH_DEFINER = 4205, MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_RANGE_DEFINER = 4206, MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_COMPARE_MATCH_DEFINER = 4207, + + MLX5HWS_DEBUG_RES_TYPE_ACTION_STE_TABLE = 4300, }; static inline u64 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h index 30ccd635b5054..21279d5031174 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h @@ -17,6 +17,7 @@ #include "context.h" #include "table.h" #include "send.h" +#include "action_ste_pool.h" #include "rule.h" #include "cmd.h" #include "action.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index b61864b320536..716502732d3da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -3,25 +3,6 @@ #include "internal.h" -enum mlx5hws_matcher_rtc_type { - HWS_MATCHER_RTC_TYPE_MATCH, - HWS_MATCHER_RTC_TYPE_STE_ARRAY, - HWS_MATCHER_RTC_TYPE_MAX, -}; - -static const char * const mlx5hws_matcher_rtc_type_str[] = { - [HWS_MATCHER_RTC_TYPE_MATCH] = "MATCH", - [HWS_MATCHER_RTC_TYPE_STE_ARRAY] = "STE_ARRAY", - [HWS_MATCHER_RTC_TYPE_MAX] = "UNKNOWN", -}; - -static const char *hws_matcher_rtc_type_to_str(enum mlx5hws_matcher_rtc_type rtc_type) -{ - if (rtc_type > HWS_MATCHER_RTC_TYPE_MAX) - rtc_type = HWS_MATCHER_RTC_TYPE_MAX; - return mlx5hws_matcher_rtc_type_str[rtc_type]; -} - static bool hws_matcher_requires_col_tbl(u8 log_num_of_rules) { /* Collision table concatenation is done only for large rule tables */ @@ -197,149 +178,96 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher) static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher, struct mlx5hws_cmd_rtc_create_attr *rtc_attr, - enum mlx5hws_matcher_rtc_type rtc_type, bool is_mirror) { - struct mlx5hws_pool_chunk *ste = &matcher->action_ste.ste; enum mlx5hws_matcher_flow_src flow_src = matcher->attr.optimize_flow_src; - bool is_match_rtc = rtc_type == HWS_MATCHER_RTC_TYPE_MATCH; if ((flow_src == MLX5HWS_MATCHER_FLOW_SRC_VPORT && !is_mirror) || (flow_src == MLX5HWS_MATCHER_FLOW_SRC_WIRE && is_mirror)) { /* Optimize FDB RTC */ rtc_attr->log_size = 0; rtc_attr->log_depth = 0; - } else { - /* Keep original values */ - rtc_attr->log_size = is_match_rtc ? matcher->attr.table.sz_row_log : ste->order; - rtc_attr->log_depth = is_match_rtc ? matcher->attr.table.sz_col_log : 0; } } -static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, - enum mlx5hws_matcher_rtc_type rtc_type) +static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher) { struct mlx5hws_matcher_attr *attr = &matcher->attr; struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0}; struct mlx5hws_match_template *mt = matcher->mt; struct mlx5hws_context *ctx = matcher->tbl->ctx; - struct mlx5hws_action_default_stc *default_stc; - struct mlx5hws_matcher_action_ste *action_ste; struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_pool *ste_pool, *stc_pool; - struct mlx5hws_pool_chunk *ste; - u32 *rtc_0_id, *rtc_1_id; u32 obj_id; int ret; - switch (rtc_type) { - case HWS_MATCHER_RTC_TYPE_MATCH: - rtc_0_id = &matcher->match_ste.rtc_0_id; - rtc_1_id = &matcher->match_ste.rtc_1_id; - ste_pool = matcher->match_ste.pool; - ste = &matcher->match_ste.ste; - ste->order = attr->table.sz_col_log + attr->table.sz_row_log; - - rtc_attr.log_size = attr->table.sz_row_log; - rtc_attr.log_depth = attr->table.sz_col_log; - rtc_attr.is_frst_jumbo = mlx5hws_matcher_mt_is_jumbo(mt); - rtc_attr.is_scnd_range = 0; - rtc_attr.miss_ft_id = matcher->end_ft_id; - - if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_HASH) { - /* The usual Hash Table */ - rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_HASH; - - /* The first mt is used since all share the same definer */ - rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer); - } else if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX) { - rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; - rtc_attr.num_hash_definer = 1; - - if (attr->distribute_mode == MLX5HWS_MATCHER_DISTRIBUTE_BY_HASH) { - /* Hash Split Table */ - rtc_attr.access_index_mode = MLX5_IFC_RTC_STE_ACCESS_MODE_BY_HASH; - rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer); - } else if (attr->distribute_mode == MLX5HWS_MATCHER_DISTRIBUTE_BY_LINEAR) { - /* Linear Lookup Table */ - rtc_attr.access_index_mode = MLX5_IFC_RTC_STE_ACCESS_MODE_LINEAR; - rtc_attr.match_definer_0 = ctx->caps->linear_match_definer; - } - } - - /* Match pool requires implicit allocation */ - ret = mlx5hws_pool_chunk_alloc(ste_pool, ste); - if (ret) { - mlx5hws_err(ctx, "Failed to allocate STE for %s RTC", - hws_matcher_rtc_type_to_str(rtc_type)); - return ret; + rtc_attr.log_size = attr->table.sz_row_log; + rtc_attr.log_depth = attr->table.sz_col_log; + rtc_attr.is_frst_jumbo = mlx5hws_matcher_mt_is_jumbo(mt); + rtc_attr.is_scnd_range = 0; + rtc_attr.miss_ft_id = matcher->end_ft_id; + + if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_HASH) { + /* The usual Hash Table */ + rtc_attr.update_index_mode = + MLX5_IFC_RTC_STE_UPDATE_MODE_BY_HASH; + + /* The first mt is used since all share the same definer */ + rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer); + } else if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX) { + rtc_attr.update_index_mode = + MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; + rtc_attr.num_hash_definer = 1; + + if (attr->distribute_mode == + MLX5HWS_MATCHER_DISTRIBUTE_BY_HASH) { + /* Hash Split Table */ + rtc_attr.access_index_mode = + MLX5_IFC_RTC_STE_ACCESS_MODE_BY_HASH; + rtc_attr.match_definer_0 = + mlx5hws_definer_get_id(mt->definer); + } else if (attr->distribute_mode == + MLX5HWS_MATCHER_DISTRIBUTE_BY_LINEAR) { + /* Linear Lookup Table */ + rtc_attr.access_index_mode = + MLX5_IFC_RTC_STE_ACCESS_MODE_LINEAR; + rtc_attr.match_definer_0 = + ctx->caps->linear_match_definer; } - break; - - case HWS_MATCHER_RTC_TYPE_STE_ARRAY: - action_ste = &matcher->action_ste; - - rtc_0_id = &action_ste->rtc_0_id; - rtc_1_id = &action_ste->rtc_1_id; - ste_pool = action_ste->pool; - ste = &action_ste->ste; - /* Action RTC size calculation: - * log((max number of rules in matcher) * - * (max number of action STEs per rule) * - * (2 to support writing new STEs for update rule)) - */ - ste->order = ilog2(roundup_pow_of_two(action_ste->max_stes)) + - attr->table.sz_row_log + - MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT; - rtc_attr.log_size = ste->order; - rtc_attr.log_depth = 0; - rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; - /* The action STEs use the default always hit definer */ - rtc_attr.match_definer_0 = ctx->caps->trivial_match_definer; - rtc_attr.is_frst_jumbo = false; - rtc_attr.miss_ft_id = 0; - break; - - default: - mlx5hws_err(ctx, "HWS Invalid RTC type\n"); - return -EINVAL; } - obj_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); + obj_id = mlx5hws_pool_get_base_id(matcher->match_ste.pool); rtc_attr.pd = ctx->pd_num; rtc_attr.ste_base = obj_id; - rtc_attr.ste_offset = ste->offset; rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx); rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, false); - hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, false); + hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, false); /* STC is a single resource (obj_id), use any STC for the ID */ - stc_pool = ctx->stc_pool; - default_stc = ctx->common_res.default_stc; - obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_id(ctx->stc_pool); rtc_attr.stc_base = obj_id; - ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id); + ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, + &matcher->match_ste.rtc_0_id); if (ret) { - mlx5hws_err(ctx, "Failed to create matcher RTC of type %s", - hws_matcher_rtc_type_to_str(rtc_type)); - goto free_ste; + mlx5hws_err(ctx, "Failed to create matcher RTC\n"); + return ret; } if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) { - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); + obj_id = mlx5hws_pool_get_base_mirror_id( + matcher->match_ste.pool); rtc_attr.ste_base = obj_id; rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, true); - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_mirror_id(ctx->stc_pool); rtc_attr.stc_base = obj_id; - hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, true); + hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, true); - ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_1_id); + ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, + &matcher->match_ste.rtc_1_id); if (ret) { - mlx5hws_err(ctx, "Failed to create peer matcher RTC of type %s", - hws_matcher_rtc_type_to_str(rtc_type)); + mlx5hws_err(ctx, "Failed to create mirror matcher RTC\n"); goto destroy_rtc_0; } } @@ -347,46 +275,18 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, return 0; destroy_rtc_0: - mlx5hws_cmd_rtc_destroy(ctx->mdev, *rtc_0_id); -free_ste: - if (rtc_type == HWS_MATCHER_RTC_TYPE_MATCH) - mlx5hws_pool_chunk_free(ste_pool, ste); + mlx5hws_cmd_rtc_destroy(ctx->mdev, matcher->match_ste.rtc_0_id); return ret; } -static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher, - enum mlx5hws_matcher_rtc_type rtc_type) +static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher) { - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_pool_chunk *ste; - struct mlx5hws_pool *ste_pool; - u32 rtc_0_id, rtc_1_id; - - switch (rtc_type) { - case HWS_MATCHER_RTC_TYPE_MATCH: - rtc_0_id = matcher->match_ste.rtc_0_id; - rtc_1_id = matcher->match_ste.rtc_1_id; - ste_pool = matcher->match_ste.pool; - ste = &matcher->match_ste.ste; - break; - case HWS_MATCHER_RTC_TYPE_STE_ARRAY: - action_ste = &matcher->action_ste; - rtc_0_id = action_ste->rtc_0_id; - rtc_1_id = action_ste->rtc_1_id; - ste_pool = action_ste->pool; - ste = &action_ste->ste; - break; - default: - return; - } + struct mlx5_core_dev *mdev = matcher->tbl->ctx->mdev; - if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_1_id); + if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB) + mlx5hws_cmd_rtc_destroy(mdev, matcher->match_ste.rtc_1_id); - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_0_id); - if (rtc_type == HWS_MATCHER_RTC_TYPE_MATCH) - mlx5hws_pool_chunk_free(ste_pool, ste); + mlx5hws_cmd_rtc_destroy(mdev, matcher->match_ste.rtc_0_id); } static int @@ -454,85 +354,17 @@ static int hws_matcher_check_and_process_at(struct mlx5hws_matcher *matcher, return 0; } -static int hws_matcher_resize_init(struct mlx5hws_matcher *src_matcher) -{ - struct mlx5hws_matcher_resize_data *resize_data; - - resize_data = kzalloc(sizeof(*resize_data), GFP_KERNEL); - if (!resize_data) - return -ENOMEM; - - resize_data->max_stes = src_matcher->action_ste.max_stes; - - resize_data->stc = src_matcher->action_ste.stc; - resize_data->rtc_0_id = src_matcher->action_ste.rtc_0_id; - resize_data->rtc_1_id = src_matcher->action_ste.rtc_1_id; - resize_data->pool = src_matcher->action_ste.max_stes ? - src_matcher->action_ste.pool : NULL; - - /* Place the new resized matcher on the dst matcher's list */ - list_add(&resize_data->list_node, &src_matcher->resize_dst->resize_data); - - /* Move all the previous resized matchers to the dst matcher's list */ - while (!list_empty(&src_matcher->resize_data)) { - resize_data = list_first_entry(&src_matcher->resize_data, - struct mlx5hws_matcher_resize_data, - list_node); - list_del_init(&resize_data->list_node); - list_add(&resize_data->list_node, &src_matcher->resize_dst->resize_data); - } - - return 0; -} - -static void hws_matcher_resize_uninit(struct mlx5hws_matcher *matcher) -{ - struct mlx5hws_matcher_resize_data *resize_data; - - if (!mlx5hws_matcher_is_resizable(matcher)) - return; - - while (!list_empty(&matcher->resize_data)) { - resize_data = list_first_entry(&matcher->resize_data, - struct mlx5hws_matcher_resize_data, - list_node); - list_del_init(&resize_data->list_node); - - if (resize_data->max_stes) { - mlx5hws_action_free_single_stc(matcher->tbl->ctx, - matcher->tbl->type, - &resize_data->stc); - - if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB) - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, - resize_data->rtc_1_id); - - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, - resize_data->rtc_0_id); - - if (resize_data->pool) - mlx5hws_pool_destroy(resize_data->pool); - } - - kfree(resize_data); - } -} - static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) { bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt); - struct mlx5hws_cmd_stc_modify_attr stc_attr = {0}; - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_pool_attr pool_attr = {0}; - struct mlx5hws_context *ctx = tbl->ctx; - u32 required_stes; - u8 max_stes = 0; + struct mlx5hws_context *ctx = matcher->tbl->ctx; + u8 required_stes, max_stes; int i, ret; if (matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION) return 0; + max_stes = 0; for (i = 0; i < matcher->num_of_at; i++) { struct mlx5hws_action_template *at = &matcher->at[i]; @@ -548,75 +380,9 @@ static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) /* Future: Optimize reparse */ } - /* There are no additional STEs required for matcher */ - if (!max_stes) - return 0; - - matcher->action_ste.max_stes = max_stes; - - action_ste = &matcher->action_ste; - - /* Allocate action STE mempool */ - pool_attr.table_type = tbl->type; - pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL; - /* Pool size is similar to action RTC size */ - pool_attr.alloc_log_sz = ilog2(roundup_pow_of_two(action_ste->max_stes)) + - matcher->attr.table.sz_row_log + - MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT; - hws_matcher_set_pool_attr(&pool_attr, matcher); - action_ste->pool = mlx5hws_pool_create(ctx, &pool_attr); - if (!action_ste->pool) { - mlx5hws_err(ctx, "Failed to create action ste pool\n"); - return -EINVAL; - } - - /* Allocate action RTC */ - ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY); - if (ret) { - mlx5hws_err(ctx, "Failed to create action RTC\n"); - goto free_ste_pool; - } - - /* Allocate STC for jumps to STE */ - stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; - stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE; - stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; - stc_attr.ste_table.ste = action_ste->ste; - stc_attr.ste_table.ste_pool = action_ste->pool; - stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; - - ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, tbl->type, - &action_ste->stc); - if (ret) { - mlx5hws_err(ctx, "Failed to create action jump to table STC\n"); - goto free_rtc; - } + matcher->num_of_action_stes = max_stes; return 0; - -free_rtc: - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY); -free_ste_pool: - mlx5hws_pool_destroy(action_ste->pool); - return ret; -} - -static void hws_matcher_unbind_at(struct mlx5hws_matcher *matcher) -{ - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_table *tbl = matcher->tbl; - - action_ste = &matcher->action_ste; - - if (!action_ste->max_stes || - matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION || - mlx5hws_matcher_is_in_resize(matcher)) - return; - - mlx5hws_action_free_single_stc(tbl->ctx, tbl->type, &action_ste->stc); - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY); - mlx5hws_pool_destroy(action_ste->pool); } static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher) @@ -638,7 +404,6 @@ static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher) /* Create an STE pool per matcher*/ pool_attr.table_type = matcher->tbl->type; pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_MATCHER_STE_POOL; pool_attr.alloc_log_sz = matcher->attr.table.sz_col_log + matcher->attr.table.sz_row_log; hws_matcher_set_pool_attr(&pool_attr, matcher); @@ -761,10 +526,10 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher) /* Create matcher end flow table anchor */ ret = hws_matcher_create_end_ft(matcher); if (ret) - goto unbind_at; + goto unbind_mt; /* Allocate the RTC for the new matcher */ - ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); + ret = hws_matcher_create_rtc(matcher); if (ret) goto destroy_end_ft; @@ -776,11 +541,9 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher) return 0; destroy_rtc: - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); + hws_matcher_destroy_rtc(matcher); destroy_end_ft: hws_matcher_destroy_end_ft(matcher); -unbind_at: - hws_matcher_unbind_at(matcher); unbind_mt: hws_matcher_unbind_mt(matcher); return ret; @@ -788,11 +551,9 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher) static void hws_matcher_destroy_and_disconnect(struct mlx5hws_matcher *matcher) { - hws_matcher_resize_uninit(matcher); hws_matcher_disconnect(matcher); - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); + hws_matcher_destroy_rtc(matcher); hws_matcher_destroy_end_ft(matcher); - hws_matcher_unbind_at(matcher); hws_matcher_unbind_mt(matcher); } @@ -814,8 +575,6 @@ hws_matcher_create_col_matcher(struct mlx5hws_matcher *matcher) if (!col_matcher) return -ENOMEM; - INIT_LIST_HEAD(&col_matcher->resize_data); - col_matcher->tbl = matcher->tbl; col_matcher->mt = matcher->mt; col_matcher->at = matcher->at; @@ -869,8 +628,6 @@ static int hws_matcher_init(struct mlx5hws_matcher *matcher) struct mlx5hws_context *ctx = matcher->tbl->ctx; int ret; - INIT_LIST_HEAD(&matcher->resize_data); - mutex_lock(&ctx->ctrl_lock); /* Allocate matcher resource and connect to the packet pipe */ @@ -905,18 +662,44 @@ static int hws_matcher_uninit(struct mlx5hws_matcher *matcher) return 0; } +static int hws_matcher_grow_at_array(struct mlx5hws_matcher *matcher) +{ + void *p; + + if (matcher->size_of_at_array >= MLX5HWS_MATCHER_MAX_AT) + return -ENOMEM; + + matcher->size_of_at_array *= 2; + p = krealloc(matcher->at, + matcher->size_of_at_array * sizeof(*matcher->at), + __GFP_ZERO | GFP_KERNEL); + if (!p) { + matcher->size_of_at_array /= 2; + return -ENOMEM; + } + + matcher->at = p; + + return 0; +} + int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, struct mlx5hws_action_template *at) { bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt); - struct mlx5hws_context *ctx = matcher->tbl->ctx; u32 required_stes; int ret; - if (!matcher->attr.max_num_of_at_attach) { - mlx5hws_dbg(ctx, "Num of current at (%d) exceed allowed value\n", - matcher->num_of_at); - return -EOPNOTSUPP; + if (unlikely(matcher->num_of_at >= matcher->size_of_at_array)) { + ret = hws_matcher_grow_at_array(matcher); + if (ret) + return ret; + + if (matcher->col_matcher) { + ret = hws_matcher_grow_at_array(matcher->col_matcher); + if (ret) + return ret; + } } ret = hws_matcher_check_and_process_at(matcher, at); @@ -924,15 +707,11 @@ int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, return ret; required_stes = at->num_of_action_stes - (!is_jumbo || at->only_term); - if (matcher->action_ste.max_stes < required_stes) { - mlx5hws_dbg(ctx, "Required STEs [%d] exceeds initial action template STE [%d]\n", - required_stes, matcher->action_ste.max_stes); - return -ENOMEM; - } + if (matcher->num_of_action_stes < required_stes) + matcher->num_of_action_stes = required_stes; matcher->at[matcher->num_of_at] = *at; matcher->num_of_at += 1; - matcher->attr.max_num_of_at_attach -= 1; if (matcher->col_matcher) matcher->col_matcher->num_of_at = matcher->num_of_at; @@ -960,8 +739,9 @@ hws_matcher_set_templates(struct mlx5hws_matcher *matcher, if (!matcher->mt) return -ENOMEM; - matcher->at = kvcalloc(num_of_at + matcher->attr.max_num_of_at_attach, - sizeof(*matcher->at), + matcher->size_of_at_array = + num_of_at + matcher->attr.max_num_of_at_attach; + matcher->at = kvcalloc(matcher->size_of_at_array, sizeof(*matcher->at), GFP_KERNEL); if (!matcher->at) { mlx5hws_err(ctx, "Failed to allocate action template array\n"); @@ -1110,7 +890,7 @@ static int hws_matcher_resize_precheck(struct mlx5hws_matcher *src_matcher, return -EINVAL; } - if (src_matcher->action_ste.max_stes > dst_matcher->action_ste.max_stes) { + if (src_matcher->num_of_action_stes > dst_matcher->num_of_action_stes) { mlx5hws_err(ctx, "Src/dst matcher max STEs mismatch\n"); return -EINVAL; } @@ -1139,10 +919,6 @@ int mlx5hws_matcher_resize_set_target(struct mlx5hws_matcher *src_matcher, src_matcher->resize_dst = dst_matcher; - ret = hws_matcher_resize_init(src_matcher); - if (ret) - src_matcher->resize_dst = NULL; - out: mutex_unlock(&src_matcher->tbl->ctx->ctrl_lock); return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h index 020de70270c50..bad1fa8f77fd2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h @@ -23,6 +23,9 @@ */ #define MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT 1 +/* Maximum number of action templates that can be attached to a matcher. */ +#define MLX5HWS_MATCHER_MAX_AT 128 + enum mlx5hws_matcher_offset { MLX5HWS_MATCHER_OFFSET_TAG_DW1 = 12, MLX5HWS_MATCHER_OFFSET_TAG_DW0 = 13, @@ -42,28 +45,9 @@ struct mlx5hws_match_template { }; struct mlx5hws_matcher_match_ste { - struct mlx5hws_pool_chunk ste; - u32 rtc_0_id; - u32 rtc_1_id; - struct mlx5hws_pool *pool; -}; - -struct mlx5hws_matcher_action_ste { - struct mlx5hws_pool_chunk ste; - struct mlx5hws_pool_chunk stc; u32 rtc_0_id; u32 rtc_1_id; struct mlx5hws_pool *pool; - u8 max_stes; -}; - -struct mlx5hws_matcher_resize_data { - struct mlx5hws_pool_chunk stc; - u32 rtc_0_id; - u32 rtc_1_id; - struct mlx5hws_pool *pool; - u8 max_stes; - struct list_head list_node; }; struct mlx5hws_matcher { @@ -72,16 +56,16 @@ struct mlx5hws_matcher { struct mlx5hws_match_template *mt; struct mlx5hws_action_template *at; u8 num_of_at; + u8 size_of_at_array; u8 num_of_mt; + u8 num_of_action_stes; /* enum mlx5hws_matcher_flags */ u8 flags; u32 end_ft_id; struct mlx5hws_matcher *col_matcher; struct mlx5hws_matcher *resize_dst; struct mlx5hws_matcher_match_ste match_ste; - struct mlx5hws_matcher_action_ste action_ste; struct list_head list_node; - struct list_head resize_data; }; static inline bool diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c index 50a81d360bb2f..7e37d6e9eb836 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c @@ -20,15 +20,14 @@ static void hws_pool_free_one_resource(struct mlx5hws_pool_resource *resource) kfree(resource); } -static void hws_pool_resource_free(struct mlx5hws_pool *pool, - int resource_idx) +static void hws_pool_resource_free(struct mlx5hws_pool *pool) { - hws_pool_free_one_resource(pool->resource[resource_idx]); - pool->resource[resource_idx] = NULL; + hws_pool_free_one_resource(pool->resource); + pool->resource = NULL; if (pool->tbl_type == MLX5HWS_TABLE_TYPE_FDB) { - hws_pool_free_one_resource(pool->mirror_resource[resource_idx]); - pool->mirror_resource[resource_idx] = NULL; + hws_pool_free_one_resource(pool->mirror_resource); + pool->mirror_resource = NULL; } } @@ -61,10 +60,8 @@ hws_pool_create_one_resource(struct mlx5hws_pool *pool, u32 log_range, ret = -EINVAL; } - if (ret) { - mlx5hws_err(pool->ctx, "Failed to allocate resource objects\n"); + if (ret) goto free_resource; - } resource->pool = pool; resource->range = 1 << log_range; @@ -77,199 +74,114 @@ hws_pool_create_one_resource(struct mlx5hws_pool *pool, u32 log_range, return NULL; } -static int -hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range, int idx) +static int hws_pool_resource_alloc(struct mlx5hws_pool *pool) { struct mlx5hws_pool_resource *resource; u32 fw_ft_type, opt_log_range; fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, false); - opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ? 0 : log_range; + opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ? + 0 : pool->alloc_log_sz; resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type); if (!resource) { - mlx5hws_err(pool->ctx, "Failed allocating resource\n"); + mlx5hws_err(pool->ctx, "Failed to allocate resource\n"); return -EINVAL; } - pool->resource[idx] = resource; + pool->resource = resource; if (pool->tbl_type == MLX5HWS_TABLE_TYPE_FDB) { struct mlx5hws_pool_resource *mirror_resource; fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, true); - opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ? 0 : log_range; + opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ? + 0 : pool->alloc_log_sz; mirror_resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type); if (!mirror_resource) { - mlx5hws_err(pool->ctx, "Failed allocating mirrored resource\n"); + mlx5hws_err(pool->ctx, "Failed to allocate mirrored resource\n"); hws_pool_free_one_resource(resource); - pool->resource[idx] = NULL; + pool->resource = NULL; return -EINVAL; } - pool->mirror_resource[idx] = mirror_resource; + pool->mirror_resource = mirror_resource; } return 0; } -static unsigned long *hws_pool_create_and_init_bitmap(u32 log_range) -{ - unsigned long *cur_bmp; - - cur_bmp = bitmap_zalloc(1 << log_range, GFP_KERNEL); - if (!cur_bmp) - return NULL; - - bitmap_fill(cur_bmp, 1 << log_range); - - return cur_bmp; -} - -static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static int hws_pool_buddy_init(struct mlx5hws_pool *pool) { struct mlx5hws_buddy_mem *buddy; - buddy = pool->db.buddy_manager->buddies[chunk->resource_idx]; + buddy = mlx5hws_buddy_create(pool->alloc_log_sz); if (!buddy) { - mlx5hws_err(pool->ctx, "No such buddy (%d)\n", chunk->resource_idx); - return; - } - - mlx5hws_buddy_free_mem(buddy, chunk->offset, chunk->order); -} - -static struct mlx5hws_buddy_mem * -hws_pool_buddy_get_next_buddy(struct mlx5hws_pool *pool, int idx, - u32 order, bool *is_new_buddy) -{ - static struct mlx5hws_buddy_mem *buddy; - u32 new_buddy_size; - - buddy = pool->db.buddy_manager->buddies[idx]; - if (buddy) - return buddy; - - new_buddy_size = max(pool->alloc_log_sz, order); - *is_new_buddy = true; - buddy = mlx5hws_buddy_create(new_buddy_size); - if (!buddy) { - mlx5hws_err(pool->ctx, "Failed to create buddy order: %d index: %d\n", - new_buddy_size, idx); - return NULL; + mlx5hws_err(pool->ctx, "Failed to create buddy order: %zu\n", + pool->alloc_log_sz); + return -ENOMEM; } - if (hws_pool_resource_alloc(pool, new_buddy_size, idx) != 0) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n", - pool->type, new_buddy_size, idx); + if (hws_pool_resource_alloc(pool) != 0) { + mlx5hws_err(pool->ctx, "Failed to create resource type: %d size %zu\n", + pool->type, pool->alloc_log_sz); mlx5hws_buddy_cleanup(buddy); - return NULL; + return -ENOMEM; } - pool->db.buddy_manager->buddies[idx] = buddy; + pool->db.buddy = buddy; - return buddy; + return 0; } -static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool, - int order, - u32 *buddy_idx, - int *seg) +static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) { - struct mlx5hws_buddy_mem *buddy; - bool new_mem = false; - int ret = 0; - int i; - - *seg = -1; - - /* Find the next free place from the buddy array */ - while (*seg < 0) { - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - buddy = hws_pool_buddy_get_next_buddy(pool, i, - order, - &new_mem); - if (!buddy) { - ret = -ENOMEM; - goto out; - } - - *seg = mlx5hws_buddy_alloc_mem(buddy, order); - if (*seg >= 0) - goto found; - - if (pool->flags & MLX5HWS_POOL_FLAGS_ONE_RESOURCE) { - mlx5hws_err(pool->ctx, - "Fail to allocate seg for one resource pool\n"); - ret = -ENOMEM; - goto out; - } - - if (new_mem) { - /* We have new memory pool, should be place for us */ - mlx5hws_err(pool->ctx, - "No memory for order: %d with buddy no: %d\n", - order, i); - ret = -ENOMEM; - goto out; - } - } + struct mlx5hws_buddy_mem *buddy = pool->db.buddy; + + if (!buddy) { + mlx5hws_err(pool->ctx, "Bad buddy state\n"); + return -EINVAL; } -found: - *buddy_idx = i; -out: - return ret; + chunk->offset = mlx5hws_buddy_alloc_mem(buddy, chunk->order); + if (chunk->offset >= 0) + return 0; + + return -ENOMEM; } -static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) { - int ret = 0; + struct mlx5hws_buddy_mem *buddy; - /* Go over the buddies and find next free slot */ - ret = hws_pool_buddy_get_mem_chunk(pool, chunk->order, - &chunk->resource_idx, - &chunk->offset); - if (ret) - mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", - chunk->order); + buddy = pool->db.buddy; + if (!buddy) { + mlx5hws_err(pool->ctx, "Bad buddy state\n"); + return; + } - return ret; + mlx5hws_buddy_free_mem(buddy, chunk->offset, chunk->order); } static void hws_pool_buddy_db_uninit(struct mlx5hws_pool *pool) { struct mlx5hws_buddy_mem *buddy; - int i; - - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - buddy = pool->db.buddy_manager->buddies[i]; - if (buddy) { - mlx5hws_buddy_cleanup(buddy); - kfree(buddy); - pool->db.buddy_manager->buddies[i] = NULL; - } - } - kfree(pool->db.buddy_manager); + buddy = pool->db.buddy; + if (buddy) { + mlx5hws_buddy_cleanup(buddy); + kfree(buddy); + pool->db.buddy = NULL; + } } -static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range) +static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool) { - pool->db.buddy_manager = kzalloc(sizeof(*pool->db.buddy_manager), GFP_KERNEL); - if (!pool->db.buddy_manager) - return -ENOMEM; - - if (pool->flags & MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE) { - bool new_buddy; + int ret; - if (!hws_pool_buddy_get_next_buddy(pool, 0, log_range, &new_buddy)) { - mlx5hws_err(pool->ctx, - "Failed allocating memory on create log_sz: %d\n", log_range); - kfree(pool->db.buddy_manager); - return -ENOMEM; - } - } + ret = hws_pool_buddy_init(pool); + if (ret) + return ret; pool->p_db_uninit = &hws_pool_buddy_db_uninit; pool->p_get_chunk = &hws_pool_buddy_db_get_chunk; @@ -278,261 +190,105 @@ static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range) return 0; } -static int hws_pool_create_resource_on_index(struct mlx5hws_pool *pool, - u32 alloc_size, int idx) -{ - int ret = hws_pool_resource_alloc(pool, alloc_size, idx); - - if (ret) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n", - pool->type, alloc_size, idx); - return ret; - } - - return 0; -} - -static struct mlx5hws_pool_elements * -hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order, int idx) +static unsigned long *hws_pool_create_and_init_bitmap(u32 log_range) { - struct mlx5hws_pool_elements *elem; - u32 alloc_size; - - alloc_size = pool->alloc_log_sz; + unsigned long *bitmap; - elem = kzalloc(sizeof(*elem), GFP_KERNEL); - if (!elem) + bitmap = bitmap_zalloc(1 << log_range, GFP_KERNEL); + if (!bitmap) return NULL; - /* Sharing the same resource, also means that all the elements are with size 1 */ - if ((pool->flags & MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS) && - !(pool->flags & MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK)) { - /* Currently all chunks in size 1 */ - elem->bitmap = hws_pool_create_and_init_bitmap(alloc_size - order); - if (!elem->bitmap) { - mlx5hws_err(pool->ctx, - "Failed to create bitmap type: %d: size %d index: %d\n", - pool->type, alloc_size, idx); - goto free_elem; - } - - elem->log_size = alloc_size - order; - } - - if (hws_pool_create_resource_on_index(pool, alloc_size, idx)) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n", - pool->type, alloc_size, idx); - goto free_db; - } - - pool->db.element_manager->elements[idx] = elem; - - return elem; + bitmap_fill(bitmap, 1 << log_range); -free_db: - bitmap_free(elem->bitmap); -free_elem: - kfree(elem); - return NULL; + return bitmap; } -static int hws_pool_element_find_seg(struct mlx5hws_pool_elements *elem, int *seg) +static int hws_pool_bitmap_init(struct mlx5hws_pool *pool) { - unsigned int segment, size; - - size = 1 << elem->log_size; + unsigned long *bitmap; - segment = find_first_bit(elem->bitmap, size); - if (segment >= size) { - elem->is_full = true; + bitmap = hws_pool_create_and_init_bitmap(pool->alloc_log_sz); + if (!bitmap) { + mlx5hws_err(pool->ctx, "Failed to create bitmap order: %zu\n", + pool->alloc_log_sz); return -ENOMEM; } - bitmap_clear(elem->bitmap, segment, 1); - *seg = segment; - return 0; -} - -static int -hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order, - u32 *idx, int *seg) -{ - struct mlx5hws_pool_elements *elem; - - elem = pool->db.element_manager->elements[0]; - if (!elem) - elem = hws_pool_element_create_new_elem(pool, order, 0); - if (!elem) - goto err_no_elem; - - if (hws_pool_element_find_seg(elem, seg) != 0) { - mlx5hws_err(pool->ctx, "No more resources (last request order: %d)\n", order); + if (hws_pool_resource_alloc(pool) != 0) { + mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %zu\n", + pool->type, pool->alloc_log_sz); + bitmap_free(bitmap); return -ENOMEM; } - *idx = 0; - elem->num_of_elements++; - return 0; + pool->db.bitmap = bitmap; -err_no_elem: - mlx5hws_err(pool->ctx, "Failed to allocate element for order: %d\n", order); - return -ENOMEM; -} - -static int -hws_pool_general_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order, - u32 *idx, int *seg) -{ - int ret, i; - - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - if (!pool->resource[i]) { - ret = hws_pool_create_resource_on_index(pool, order, i); - if (ret) - goto err_no_res; - *idx = i; - *seg = 0; /* One memory slot in that element */ - return 0; - } - } - - mlx5hws_err(pool->ctx, "No more resources (last request order: %d)\n", order); - return -ENOMEM; - -err_no_res: - mlx5hws_err(pool->ctx, "Failed to allocate element for order: %d\n", order); - return -ENOMEM; + return 0; } -static int hws_pool_general_element_db_get_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static int hws_pool_bitmap_db_get_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) { - int ret; - - /* Go over all memory elements and find/allocate free slot */ - ret = hws_pool_general_element_get_mem_chunk(pool, chunk->order, - &chunk->resource_idx, - &chunk->offset); - if (ret) - mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", - chunk->order); + unsigned long *bitmap, size; - return ret; -} + if (chunk->order != 0) { + mlx5hws_err(pool->ctx, "Pool only supports order 0 allocs\n"); + return -EINVAL; + } -static void hws_pool_general_element_db_put_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) -{ - if (unlikely(!pool->resource[chunk->resource_idx])) - pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx); + bitmap = pool->db.bitmap; + if (!bitmap) { + mlx5hws_err(pool->ctx, "Bad bitmap state\n"); + return -EINVAL; + } - if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE) - hws_pool_resource_free(pool, chunk->resource_idx); -} + size = 1 << pool->alloc_log_sz; -static void hws_pool_general_element_db_uninit(struct mlx5hws_pool *pool) -{ - (void)pool; -} + chunk->offset = find_first_bit(bitmap, size); + if (chunk->offset >= size) + return -ENOMEM; -/* This memory management works as the following: - * - At start doesn't allocate no mem at all. - * - When new request for chunk arrived: - * allocate resource and give it. - * - When free that chunk: - * the resource is freed. - */ -static int hws_pool_general_element_db_init(struct mlx5hws_pool *pool) -{ - pool->p_db_uninit = &hws_pool_general_element_db_uninit; - pool->p_get_chunk = &hws_pool_general_element_db_get_chunk; - pool->p_put_chunk = &hws_pool_general_element_db_put_chunk; + bitmap_clear(bitmap, chunk->offset, 1); return 0; } -static void hws_onesize_element_db_destroy_element(struct mlx5hws_pool *pool, - struct mlx5hws_pool_elements *elem, - struct mlx5hws_pool_chunk *chunk) -{ - if (unlikely(!pool->resource[chunk->resource_idx])) - pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx); - - hws_pool_resource_free(pool, chunk->resource_idx); - kfree(elem); - pool->db.element_manager->elements[chunk->resource_idx] = NULL; -} - -static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static void hws_pool_bitmap_db_put_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) { - struct mlx5hws_pool_elements *elem; + unsigned long *bitmap; - if (unlikely(chunk->resource_idx)) - pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx); - - elem = pool->db.element_manager->elements[chunk->resource_idx]; - if (!elem) { - mlx5hws_err(pool->ctx, "No such element (%d)\n", chunk->resource_idx); + bitmap = pool->db.bitmap; + if (!bitmap) { + mlx5hws_err(pool->ctx, "Bad bitmap state\n"); return; } - bitmap_set(elem->bitmap, chunk->offset, 1); - elem->is_full = false; - elem->num_of_elements--; - - if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE && - !elem->num_of_elements) - hws_onesize_element_db_destroy_element(pool, elem, chunk); + bitmap_set(bitmap, chunk->offset, 1); } -static int hws_onesize_element_db_get_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static void hws_pool_bitmap_db_uninit(struct mlx5hws_pool *pool) { - int ret = 0; + unsigned long *bitmap; - /* Go over all memory elements and find/allocate free slot */ - ret = hws_pool_onesize_element_get_mem_chunk(pool, chunk->order, - &chunk->resource_idx, - &chunk->offset); - if (ret) - mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", - chunk->order); - - return ret; -} - -static void hws_onesize_element_db_uninit(struct mlx5hws_pool *pool) -{ - struct mlx5hws_pool_elements *elem; - int i; - - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - elem = pool->db.element_manager->elements[i]; - if (elem) { - bitmap_free(elem->bitmap); - kfree(elem); - pool->db.element_manager->elements[i] = NULL; - } + bitmap = pool->db.bitmap; + if (bitmap) { + bitmap_free(bitmap); + pool->db.bitmap = NULL; } - kfree(pool->db.element_manager); } -/* This memory management works as the following: - * - At start doesn't allocate no mem at all. - * - When new request for chunk arrived: - * aloocate the first and only slot of memory/resource - * when it ended return error. - */ -static int hws_pool_onesize_element_db_init(struct mlx5hws_pool *pool) +static int hws_pool_bitmap_db_init(struct mlx5hws_pool *pool) { - pool->db.element_manager = kzalloc(sizeof(*pool->db.element_manager), GFP_KERNEL); - if (!pool->db.element_manager) - return -ENOMEM; + int ret; - pool->p_db_uninit = &hws_onesize_element_db_uninit; - pool->p_get_chunk = &hws_onesize_element_db_get_chunk; - pool->p_put_chunk = &hws_onesize_element_db_put_chunk; + ret = hws_pool_bitmap_init(pool); + if (ret) + return ret; + + pool->p_db_uninit = &hws_pool_bitmap_db_uninit; + pool->p_get_chunk = &hws_pool_bitmap_db_get_chunk; + pool->p_put_chunk = &hws_pool_bitmap_db_put_chunk; return 0; } @@ -542,15 +298,14 @@ static int hws_pool_db_init(struct mlx5hws_pool *pool, { int ret; - if (db_type == MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE) - ret = hws_pool_general_element_db_init(pool); - else if (db_type == MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE) - ret = hws_pool_onesize_element_db_init(pool); + if (db_type == MLX5HWS_POOL_DB_TYPE_BITMAP) + ret = hws_pool_bitmap_db_init(pool); else - ret = hws_pool_buddy_db_init(pool, pool->alloc_log_sz); + ret = hws_pool_buddy_db_init(pool); if (ret) { - mlx5hws_err(pool->ctx, "Failed to init general db : %d (ret: %d)\n", db_type, ret); + mlx5hws_err(pool->ctx, "Failed to init pool type: %d (ret: %d)\n", + db_type, ret); return ret; } @@ -569,6 +324,8 @@ int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool, mutex_lock(&pool->lock); ret = pool->p_get_chunk(pool, chunk); + if (ret == 0) + pool->available_elems -= 1 << chunk->order; mutex_unlock(&pool->lock); return ret; @@ -579,6 +336,7 @@ void mlx5hws_pool_chunk_free(struct mlx5hws_pool *pool, { mutex_lock(&pool->lock); pool->p_put_chunk(pool, chunk); + pool->available_elems += 1 << chunk->order; mutex_unlock(&pool->lock); } @@ -599,17 +357,13 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_ pool->tbl_type = pool_attr->table_type; pool->opt_type = pool_attr->opt_type; - /* Support general db */ - if (pool->flags == (MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE | - MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK)) - res_db_type = MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE; - else if (pool->flags == (MLX5HWS_POOL_FLAGS_ONE_RESOURCE | - MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS)) - res_db_type = MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE; - else + if (pool->flags & MLX5HWS_POOL_FLAG_BUDDY) res_db_type = MLX5HWS_POOL_DB_TYPE_BUDDY; + else + res_db_type = MLX5HWS_POOL_DB_TYPE_BITMAP; pool->alloc_log_sz = pool_attr->alloc_log_sz; + pool->available_elems = 1 << pool_attr->alloc_log_sz; if (hws_pool_db_init(pool, res_db_type)) goto free_pool; @@ -623,18 +377,17 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_ return NULL; } -int mlx5hws_pool_destroy(struct mlx5hws_pool *pool) +void mlx5hws_pool_destroy(struct mlx5hws_pool *pool) { - int i; - mutex_destroy(&pool->lock); - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) - if (pool->resource[i]) - hws_pool_resource_free(pool, i); + if (pool->available_elems != 1 << pool->alloc_log_sz) + mlx5hws_err(pool->ctx, "Attempting to destroy non-empty pool\n"); + + if (pool->resource) + hws_pool_resource_free(pool); hws_pool_db_unint(pool); kfree(pool); - return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h index 621298b352b24..33e33d5f1fb3d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h @@ -6,16 +6,12 @@ #define MLX5HWS_POOL_STC_LOG_SZ 15 -#define MLX5HWS_POOL_RESOURCE_ARR_SZ 100 - enum mlx5hws_pool_type { MLX5HWS_POOL_TYPE_STE, MLX5HWS_POOL_TYPE_STC, }; struct mlx5hws_pool_chunk { - u32 resource_idx; - /* Internal offset, relative to base index */ int offset; int order; }; @@ -27,35 +23,17 @@ struct mlx5hws_pool_resource { }; enum mlx5hws_pool_flags { - /* Only a one resource in that pool */ - MLX5HWS_POOL_FLAGS_ONE_RESOURCE = 1 << 0, - MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE = 1 << 1, - /* No sharing resources between chunks */ - MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK = 1 << 2, - /* All objects are in the same size */ - MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS = 1 << 3, - /* Managed by buddy allocator */ - MLX5HWS_POOL_FLAGS_BUDDY_MANAGED = 1 << 4, - /* Allocate pool_type memory on pool creation */ - MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE = 1 << 5, - - /* These values should be used by the caller */ - MLX5HWS_POOL_FLAGS_FOR_STC_POOL = - MLX5HWS_POOL_FLAGS_ONE_RESOURCE | - MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS, - MLX5HWS_POOL_FLAGS_FOR_MATCHER_STE_POOL = - MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE | - MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK, - MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL = - MLX5HWS_POOL_FLAGS_ONE_RESOURCE | - MLX5HWS_POOL_FLAGS_BUDDY_MANAGED | - MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE, + /* Managed by a buddy allocator. If this is not set only allocations of + * order 0 are supported. + */ + MLX5HWS_POOL_FLAG_BUDDY = BIT(0), }; enum mlx5hws_pool_optimize { MLX5HWS_POOL_OPTIMIZE_NONE = 0x0, MLX5HWS_POOL_OPTIMIZE_ORIG = 0x1, MLX5HWS_POOL_OPTIMIZE_MIRROR = 0x2, + MLX5HWS_POOL_OPTIMIZE_MAX = 0x3, }; struct mlx5hws_pool_attr { @@ -68,34 +46,17 @@ struct mlx5hws_pool_attr { }; enum mlx5hws_db_type { - /* Uses for allocating chunk of big memory, each element has its own resource in the FW*/ - MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE, - /* One resource only, all the elements are with same one size */ - MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE, - /* Many resources, the memory allocated with buddy mechanism */ + /* Uses a bitmap, supports only allocations of order 0. */ + MLX5HWS_POOL_DB_TYPE_BITMAP, + /* Entries are managed using a buddy mechanism. */ MLX5HWS_POOL_DB_TYPE_BUDDY, }; -struct mlx5hws_buddy_manager { - struct mlx5hws_buddy_mem *buddies[MLX5HWS_POOL_RESOURCE_ARR_SZ]; -}; - -struct mlx5hws_pool_elements { - u32 num_of_elements; - unsigned long *bitmap; - u32 log_size; - bool is_full; -}; - -struct mlx5hws_element_manager { - struct mlx5hws_pool_elements *elements[MLX5HWS_POOL_RESOURCE_ARR_SZ]; -}; - struct mlx5hws_pool_db { enum mlx5hws_db_type type; union { - struct mlx5hws_element_manager *element_manager; - struct mlx5hws_buddy_manager *buddy_manager; + unsigned long *bitmap; + struct mlx5hws_buddy_mem *buddy; }; }; @@ -111,11 +72,11 @@ struct mlx5hws_pool { enum mlx5hws_pool_flags flags; struct mutex lock; /* protect the pool */ size_t alloc_log_sz; + size_t available_elems; enum mlx5hws_table_type tbl_type; enum mlx5hws_pool_optimize opt_type; - struct mlx5hws_pool_resource *resource[MLX5HWS_POOL_RESOURCE_ARR_SZ]; - struct mlx5hws_pool_resource *mirror_resource[MLX5HWS_POOL_RESOURCE_ARR_SZ]; - /* DB */ + struct mlx5hws_pool_resource *resource; + struct mlx5hws_pool_resource *mirror_resource; struct mlx5hws_pool_db db; /* Functions */ mlx5hws_pool_unint_db p_db_uninit; @@ -127,7 +88,7 @@ struct mlx5hws_pool * mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_attr); -int mlx5hws_pool_destroy(struct mlx5hws_pool *pool); +void mlx5hws_pool_destroy(struct mlx5hws_pool *pool); int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool, struct mlx5hws_pool_chunk *chunk); @@ -135,17 +96,37 @@ int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool, void mlx5hws_pool_chunk_free(struct mlx5hws_pool *pool, struct mlx5hws_pool_chunk *chunk); -static inline u32 -mlx5hws_pool_chunk_get_base_id(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static inline u32 mlx5hws_pool_get_base_id(struct mlx5hws_pool *pool) { - return pool->resource[chunk->resource_idx]->base_id; + return pool->resource->base_id; } -static inline u32 -mlx5hws_pool_chunk_get_base_mirror_id(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static inline u32 mlx5hws_pool_get_base_mirror_id(struct mlx5hws_pool *pool) { - return pool->mirror_resource[chunk->resource_idx]->base_id; + return pool->mirror_resource->base_id; +} + +static inline bool +mlx5hws_pool_empty(struct mlx5hws_pool *pool) +{ + bool ret; + + mutex_lock(&pool->lock); + ret = pool->available_elems == 0; + mutex_unlock(&pool->lock); + + return ret; +} + +static inline bool +mlx5hws_pool_full(struct mlx5hws_pool *pool) +{ + bool ret; + + mutex_lock(&pool->lock); + ret = pool->available_elems == (1 << pool->alloc_log_sz); + mutex_unlock(&pool->lock); + + return ret; } #endif /* MLX5HWS_POOL_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c index a27a2d5ffc7b1..9e6f35d684456 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c @@ -195,44 +195,30 @@ hws_rule_load_delete_info(struct mlx5hws_rule *rule, } } -static int hws_rule_alloc_action_ste(struct mlx5hws_rule *rule) +static int mlx5hws_rule_alloc_action_ste(struct mlx5hws_rule *rule, + u16 queue_id, bool skip_rx, + bool skip_tx) { struct mlx5hws_matcher *matcher = rule->matcher; - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_pool_chunk ste = {0}; - int ret; - - action_ste = &matcher->action_ste; - ste.order = ilog2(roundup_pow_of_two(action_ste->max_stes)); - ret = mlx5hws_pool_chunk_alloc(action_ste->pool, &ste); - if (unlikely(ret)) { - mlx5hws_err(matcher->tbl->ctx, - "Failed to allocate STE for rule actions"); - return ret; - } - - rule->action_ste.pool = matcher->action_ste.pool; - rule->action_ste.num_stes = matcher->action_ste.max_stes; - rule->action_ste.index = ste.offset; + struct mlx5hws_context *ctx = matcher->tbl->ctx; - return 0; + rule->action_ste.ste.order = + ilog2(roundup_pow_of_two(matcher->num_of_action_stes)); + return mlx5hws_action_ste_chunk_alloc(&ctx->action_ste_pool[queue_id], + skip_rx, skip_tx, + &rule->action_ste); } -void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste) +void mlx5hws_rule_free_action_ste(struct mlx5hws_action_ste_chunk *action_ste) { - struct mlx5hws_pool_chunk ste = {0}; - - if (!action_ste->num_stes) + if (!action_ste->action_tbl) return; - ste.order = ilog2(roundup_pow_of_two(action_ste->num_stes)); - ste.offset = action_ste->index; - /* This release is safe only when the rule match STE was deleted * (when the rule is being deleted) or replaced with the new STE that * isn't pointing to old action STEs (when the rule is being updated). */ - mlx5hws_pool_chunk_free(action_ste->pool, &ste); + mlx5hws_action_ste_chunk_free(action_ste); } static void hws_rule_create_init(struct mlx5hws_rule *rule, @@ -250,22 +236,15 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule, rule->rtc_0 = 0; rule->rtc_1 = 0; - rule->action_ste.pool = NULL; - rule->action_ste.num_stes = 0; - rule->action_ste.index = -1; - rule->status = MLX5HWS_RULE_STATUS_CREATING; } else { rule->status = MLX5HWS_RULE_STATUS_UPDATING; + /* Save the old action STE info so we can free it after writing + * new action STEs and a corresponding match STE. + */ + rule->old_action_ste = rule->action_ste; } - /* Initialize the old action STE info - shallow-copy action_ste. - * In create flow this will set old_action_ste fields to initial values. - * In update flow this will save the existing action STE info, - * so that we will later use it to free old STEs. - */ - rule->old_action_ste = rule->action_ste; - rule->pending_wqes = 0; /* Init default send STE attributes */ @@ -277,7 +256,6 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule, /* Init default action apply */ apply->tbl_type = tbl->type; apply->common_res = &ctx->common_res; - apply->jump_to_action_stc = matcher->action_ste.stc.offset; apply->require_dep = 0; } @@ -353,17 +331,24 @@ static int hws_rule_create_hws(struct mlx5hws_rule *rule, if (action_stes) { /* Allocate action STEs for rules that need more than match STE */ - ret = hws_rule_alloc_action_ste(rule); + ret = mlx5hws_rule_alloc_action_ste(rule, attr->queue_id, + !!ste_attr.rtc_0, + !!ste_attr.rtc_1); if (ret) { mlx5hws_err(ctx, "Failed to allocate action memory %d", ret); mlx5hws_send_abort_new_dep_wqe(queue); return ret; } + apply.jump_to_action_stc = + rule->action_ste.action_tbl->stc.offset; /* Skip RX/TX based on the dep_wqe init */ - ste_attr.rtc_0 = dep_wqe->rtc_0 ? matcher->action_ste.rtc_0_id : 0; - ste_attr.rtc_1 = dep_wqe->rtc_1 ? matcher->action_ste.rtc_1_id : 0; + ste_attr.rtc_0 = dep_wqe->rtc_0 ? + rule->action_ste.action_tbl->rtc_0_id : 0; + ste_attr.rtc_1 = dep_wqe->rtc_1 ? + rule->action_ste.action_tbl->rtc_1_id : 0; /* Action STEs are written to a specific index last to first */ - ste_attr.direct_index = rule->action_ste.index + action_stes; + ste_attr.direct_index = + rule->action_ste.ste.offset + action_stes; apply.next_direct_idx = ste_attr.direct_index; } else { apply.next_direct_idx = 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h index b5ee94ac449b1..1c47a9c11572a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h @@ -43,12 +43,6 @@ struct mlx5hws_rule_match_tag { }; }; -struct mlx5hws_rule_action_ste_info { - struct mlx5hws_pool *pool; - int index; /* STE array index */ - u8 num_stes; -}; - struct mlx5hws_rule_resize_info { u32 rtc_0; u32 rtc_1; @@ -64,8 +58,8 @@ struct mlx5hws_rule { struct mlx5hws_rule_match_tag tag; struct mlx5hws_rule_resize_info *resize_info; }; - struct mlx5hws_rule_action_ste_info action_ste; - struct mlx5hws_rule_action_ste_info old_action_ste; + struct mlx5hws_action_ste_chunk action_ste; + struct mlx5hws_action_ste_chunk old_action_ste; u32 rtc_0; /* The RTC into which the STE was inserted */ u32 rtc_1; /* The RTC into which the STE was inserted */ u8 status; /* enum mlx5hws_rule_status */ @@ -75,7 +69,7 @@ struct mlx5hws_rule { */ }; -void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste); +void mlx5hws_rule_free_action_ste(struct mlx5hws_action_ste_chunk *action_ste); int mlx5hws_rule_move_hws_remove(struct mlx5hws_rule *rule, void *queue, void *user_data); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c index 3f64cdbabfa3c..0a8fb9c842d3b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c @@ -262,7 +262,7 @@ static int mlxsw_sp_port_set_pauseparam(struct net_device *dev, } struct mlxsw_sp_port_hw_stats { - char str[ETH_GSTRING_LEN]; + char str[ETH_GSTRING_LEN] __nonstring; u64 (*getter)(const char *payload); bool cells_bytes; }; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index 4ca7b99ef1315..80d54edaac557 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -81,6 +81,9 @@ struct fbnic_dev { /* Local copy of hardware statistics */ struct fbnic_hw_stats hw_stats; + + /* Lock protecting access to hw_stats */ + spinlock_t hw_stats_lock; }; /* Reserve entry 0 in the MSI-X "others" array until we have filled all diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h index 3b12a0ab59065..0c217c195c6a3 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h @@ -397,6 +397,15 @@ enum { #define FBNIC_TCE_DROP_CTRL_TTI_FRM_DROP_EN CSR_BIT(1) #define FBNIC_TCE_DROP_CTRL_TTI_TBI_DROP_EN CSR_BIT(2) +#define FBNIC_TCE_TTI_CM_DROP_PKTS 0x0403e /* 0x100f8 */ +#define FBNIC_TCE_TTI_CM_DROP_BYTE_L 0x0403f /* 0x100fc */ +#define FBNIC_TCE_TTI_CM_DROP_BYTE_H 0x04040 /* 0x10100 */ +#define FBNIC_TCE_TTI_FRAME_DROP_PKTS 0x04041 /* 0x10104 */ +#define FBNIC_TCE_TTI_FRAME_DROP_BYTE_L 0x04042 /* 0x10108 */ +#define FBNIC_TCE_TTI_FRAME_DROP_BYTE_H 0x04043 /* 0x1010c */ +#define FBNIC_TCE_TBI_DROP_PKTS 0x04044 /* 0x10110 */ +#define FBNIC_TCE_TBI_DROP_BYTE_L 0x04045 /* 0x10114 */ + #define FBNIC_TCE_TCAM_IDX2DEST_MAP 0x0404A /* 0x10128 */ #define FBNIC_TCE_TCAM_IDX2DEST_MAP_DEST_ID_0 CSR_GENMASK(3, 0) enum { @@ -432,6 +441,11 @@ enum { #define FBNIC_TMI_SOP_PROT_CTRL 0x04400 /* 0x11000 */ #define FBNIC_TMI_DROP_CTRL 0x04401 /* 0x11004 */ #define FBNIC_TMI_DROP_CTRL_EN CSR_BIT(0) +#define FBNIC_TMI_DROP_PKTS 0x04402 /* 0x11008 */ +#define FBNIC_TMI_DROP_BYTE_L 0x04403 /* 0x1100c */ +#define FBNIC_TMI_ILLEGAL_PTP_REQS 0x04409 /* 0x11024 */ +#define FBNIC_TMI_GOOD_PTP_TS 0x0440a /* 0x11028 */ +#define FBNIC_TMI_BAD_PTP_TS 0x0440b /* 0x1102c */ #define FBNIC_CSR_END_TMI 0x0443f /* CSR section delimiter */ /* Precision Time Protocol Registers */ @@ -485,6 +499,14 @@ enum { FBNIC_RXB_FIFO_INDICES = 8 }; +enum { + FBNIC_RXB_INTF_NET = 0, + FBNIC_RXB_INTF_RBT = 1, + /* Unused */ + /* Unused */ + FBNIC_RXB_INTF_INDICES = 4 +}; + #define FBNIC_RXB_CT_SIZE(n) (0x08000 + (n)) /* 0x20000 + 4*n */ #define FBNIC_RXB_CT_SIZE_CNT 8 #define FBNIC_RXB_CT_SIZE_HEADER CSR_GENMASK(5, 0) @@ -864,6 +886,12 @@ enum { #define FBNIC_QUEUE_TWQ1_BAL 0x022 /* 0x088 */ #define FBNIC_QUEUE_TWQ1_BAH 0x023 /* 0x08c */ +/* Tx Work Queue Statistics Registers */ +#define FBNIC_QUEUE_TWQ0_PKT_CNT 0x062 /* 0x188 */ +#define FBNIC_QUEUE_TWQ0_ERR_CNT 0x063 /* 0x18c */ +#define FBNIC_QUEUE_TWQ1_PKT_CNT 0x072 /* 0x1c8 */ +#define FBNIC_QUEUE_TWQ1_ERR_CNT 0x073 /* 0x1cc */ + /* Tx Completion Queue Registers */ #define FBNIC_QUEUE_TCQ_CTL 0x080 /* 0x200 */ #define FBNIC_QUEUE_TCQ_CTL_RESET CSR_BIT(0) @@ -953,6 +981,12 @@ enum { FBNIC_QUEUE_RDE_CTL1_PAYLD_PACK_RSS = 2, }; +/* Rx Per CQ Statistics Counters */ +#define FBNIC_QUEUE_RDE_PKT_CNT 0x2a2 /* 0xa88 */ +#define FBNIC_QUEUE_RDE_PKT_ERR_CNT 0x2a3 /* 0xa8c */ +#define FBNIC_QUEUE_RDE_CQ_DROP_CNT 0x2a4 /* 0xa90 */ +#define FBNIC_QUEUE_RDE_BDQ_DROP_CNT 0x2a5 /* 0xa94 */ + /* Rx Interrupt Manager Registers */ #define FBNIC_QUEUE_RIM_CTL 0x2c0 /* 0xb00 */ #define FBNIC_QUEUE_RIM_CTL_MSIX_MASK CSR_GENMASK(7, 0) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index 0a751a2aaf733..5c7556c8c4c5f 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -27,6 +27,19 @@ struct fbnic_stat { FBNIC_STAT_FIELDS(fbnic_hw_stats, name, stat) static const struct fbnic_stat fbnic_gstrings_hw_stats[] = { + /* TTI */ + FBNIC_HW_STAT("tti_cm_drop_frames", tti.cm_drop.frames), + FBNIC_HW_STAT("tti_cm_drop_bytes", tti.cm_drop.bytes), + FBNIC_HW_STAT("tti_frame_drop_frames", tti.frame_drop.frames), + FBNIC_HW_STAT("tti_frame_drop_bytes", tti.frame_drop.bytes), + FBNIC_HW_STAT("tti_tbi_drop_frames", tti.tbi_drop.frames), + FBNIC_HW_STAT("tti_tbi_drop_bytes", tti.tbi_drop.bytes), + + /* TMI */ + FBNIC_HW_STAT("ptp_illegal_req", tmi.ptp_illegal_req), + FBNIC_HW_STAT("ptp_good_ts", tmi.ptp_good_ts), + FBNIC_HW_STAT("ptp_bad_ts", tmi.ptp_bad_ts), + /* RPC */ FBNIC_HW_STAT("rpc_unkn_etype", rpc.unkn_etype), FBNIC_HW_STAT("rpc_unkn_ext_hdr", rpc.unkn_ext_hdr), @@ -39,7 +52,64 @@ static const struct fbnic_stat fbnic_gstrings_hw_stats[] = { }; #define FBNIC_HW_FIXED_STATS_LEN ARRAY_SIZE(fbnic_gstrings_hw_stats) -#define FBNIC_HW_STATS_LEN FBNIC_HW_FIXED_STATS_LEN + +#define FBNIC_RXB_ENQUEUE_STAT(name, stat) \ + FBNIC_STAT_FIELDS(fbnic_rxb_enqueue_stats, name, stat) + +static const struct fbnic_stat fbnic_gstrings_rxb_enqueue_stats[] = { + FBNIC_RXB_ENQUEUE_STAT("rxb_integrity_err%u", integrity_err), + FBNIC_RXB_ENQUEUE_STAT("rxb_mac_err%u", mac_err), + FBNIC_RXB_ENQUEUE_STAT("rxb_parser_err%u", parser_err), + FBNIC_RXB_ENQUEUE_STAT("rxb_frm_err%u", frm_err), + + FBNIC_RXB_ENQUEUE_STAT("rxb_drbo%u_frames", drbo.frames), + FBNIC_RXB_ENQUEUE_STAT("rxb_drbo%u_bytes", drbo.bytes), +}; + +#define FBNIC_HW_RXB_ENQUEUE_STATS_LEN \ + ARRAY_SIZE(fbnic_gstrings_rxb_enqueue_stats) + +#define FBNIC_RXB_FIFO_STAT(name, stat) \ + FBNIC_STAT_FIELDS(fbnic_rxb_fifo_stats, name, stat) + +static const struct fbnic_stat fbnic_gstrings_rxb_fifo_stats[] = { + FBNIC_RXB_FIFO_STAT("rxb_fifo%u_drop", trans_drop), + FBNIC_RXB_FIFO_STAT("rxb_fifo%u_dropped_frames", drop.frames), + FBNIC_RXB_FIFO_STAT("rxb_fifo%u_ecn", trans_ecn), + FBNIC_RXB_FIFO_STAT("rxb_fifo%u_level", level), +}; + +#define FBNIC_HW_RXB_FIFO_STATS_LEN ARRAY_SIZE(fbnic_gstrings_rxb_fifo_stats) + +#define FBNIC_RXB_DEQUEUE_STAT(name, stat) \ + FBNIC_STAT_FIELDS(fbnic_rxb_dequeue_stats, name, stat) + +static const struct fbnic_stat fbnic_gstrings_rxb_dequeue_stats[] = { + FBNIC_RXB_DEQUEUE_STAT("rxb_intf%u_frames", intf.frames), + FBNIC_RXB_DEQUEUE_STAT("rxb_intf%u_bytes", intf.bytes), + FBNIC_RXB_DEQUEUE_STAT("rxb_pbuf%u_frames", pbuf.frames), + FBNIC_RXB_DEQUEUE_STAT("rxb_pbuf%u_bytes", pbuf.bytes), +}; + +#define FBNIC_HW_RXB_DEQUEUE_STATS_LEN \ + ARRAY_SIZE(fbnic_gstrings_rxb_dequeue_stats) + +#define FBNIC_HW_Q_STAT(name, stat) \ + FBNIC_STAT_FIELDS(fbnic_hw_q_stats, name, stat.value) + +static const struct fbnic_stat fbnic_gstrings_hw_q_stats[] = { + FBNIC_HW_Q_STAT("rde_%u_pkt_err", rde_pkt_err), + FBNIC_HW_Q_STAT("rde_%u_pkt_cq_drop", rde_pkt_cq_drop), + FBNIC_HW_Q_STAT("rde_%u_pkt_bdq_drop", rde_pkt_bdq_drop), +}; + +#define FBNIC_HW_Q_STATS_LEN ARRAY_SIZE(fbnic_gstrings_hw_q_stats) +#define FBNIC_HW_STATS_LEN \ + (FBNIC_HW_FIXED_STATS_LEN + \ + FBNIC_HW_RXB_ENQUEUE_STATS_LEN * FBNIC_RXB_ENQUEUE_INDICES + \ + FBNIC_HW_RXB_FIFO_STATS_LEN * FBNIC_RXB_FIFO_INDICES + \ + FBNIC_HW_RXB_DEQUEUE_STATS_LEN * FBNIC_RXB_DEQUEUE_INDICES + \ + FBNIC_HW_Q_STATS_LEN * FBNIC_MAX_QUEUES) static void fbnic_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) @@ -298,31 +368,125 @@ fbnic_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, return err; } -static void fbnic_get_strings(struct net_device *dev, u32 sset, u8 *data) +static void fbnic_get_rxb_enqueue_strings(u8 **data, unsigned int idx) +{ + const struct fbnic_stat *stat; + int i; + + stat = fbnic_gstrings_rxb_enqueue_stats; + for (i = 0; i < FBNIC_HW_RXB_ENQUEUE_STATS_LEN; i++, stat++) + ethtool_sprintf(data, stat->string, idx); +} + +static void fbnic_get_rxb_fifo_strings(u8 **data, unsigned int idx) +{ + const struct fbnic_stat *stat; + int i; + + stat = fbnic_gstrings_rxb_fifo_stats; + for (i = 0; i < FBNIC_HW_RXB_FIFO_STATS_LEN; i++, stat++) + ethtool_sprintf(data, stat->string, idx); +} + +static void fbnic_get_rxb_dequeue_strings(u8 **data, unsigned int idx) { + const struct fbnic_stat *stat; int i; + stat = fbnic_gstrings_rxb_dequeue_stats; + for (i = 0; i < FBNIC_HW_RXB_DEQUEUE_STATS_LEN; i++, stat++) + ethtool_sprintf(data, stat->string, idx); +} + +static void fbnic_get_strings(struct net_device *dev, u32 sset, u8 *data) +{ + const struct fbnic_stat *stat; + int i, idx; + switch (sset) { case ETH_SS_STATS: - for (i = 0; i < FBNIC_HW_STATS_LEN; i++) + for (i = 0; i < FBNIC_HW_FIXED_STATS_LEN; i++) ethtool_puts(&data, fbnic_gstrings_hw_stats[i].string); + + for (i = 0; i < FBNIC_RXB_ENQUEUE_INDICES; i++) + fbnic_get_rxb_enqueue_strings(&data, i); + + for (i = 0; i < FBNIC_RXB_FIFO_INDICES; i++) + fbnic_get_rxb_fifo_strings(&data, i); + + for (i = 0; i < FBNIC_RXB_DEQUEUE_INDICES; i++) + fbnic_get_rxb_dequeue_strings(&data, i); + + for (idx = 0; idx < FBNIC_MAX_QUEUES; idx++) { + stat = fbnic_gstrings_hw_q_stats; + + for (i = 0; i < FBNIC_HW_Q_STATS_LEN; i++, stat++) + ethtool_sprintf(&data, stat->string, idx); + } break; } } +static void fbnic_report_hw_stats(const struct fbnic_stat *stat, + const void *base, int len, u64 **data) +{ + while (len--) { + u8 *curr = (u8 *)base + stat->offset; + + **data = *(u64 *)curr; + + stat++; + (*data)++; + } +} + static void fbnic_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct fbnic_net *fbn = netdev_priv(dev); - const struct fbnic_stat *stat; + struct fbnic_dev *fbd = fbn->fbd; int i; fbnic_get_hw_stats(fbn->fbd); - for (i = 0; i < FBNIC_HW_STATS_LEN; i++) { - stat = &fbnic_gstrings_hw_stats[i]; - data[i] = *(u64 *)((u8 *)&fbn->fbd->hw_stats + stat->offset); + spin_lock(&fbd->hw_stats_lock); + fbnic_report_hw_stats(fbnic_gstrings_hw_stats, &fbd->hw_stats, + FBNIC_HW_FIXED_STATS_LEN, &data); + + for (i = 0; i < FBNIC_RXB_ENQUEUE_INDICES; i++) { + const struct fbnic_rxb_enqueue_stats *enq; + + enq = &fbd->hw_stats.rxb.enq[i]; + fbnic_report_hw_stats(fbnic_gstrings_rxb_enqueue_stats, + enq, FBNIC_HW_RXB_ENQUEUE_STATS_LEN, + &data); + } + + for (i = 0; i < FBNIC_RXB_FIFO_INDICES; i++) { + const struct fbnic_rxb_fifo_stats *fifo; + + fifo = &fbd->hw_stats.rxb.fifo[i]; + fbnic_report_hw_stats(fbnic_gstrings_rxb_fifo_stats, + fifo, FBNIC_HW_RXB_FIFO_STATS_LEN, + &data); + } + + for (i = 0; i < FBNIC_RXB_DEQUEUE_INDICES; i++) { + const struct fbnic_rxb_dequeue_stats *deq; + + deq = &fbd->hw_stats.rxb.deq[i]; + fbnic_report_hw_stats(fbnic_gstrings_rxb_dequeue_stats, + deq, FBNIC_HW_RXB_DEQUEUE_STATS_LEN, + &data); + } + + for (i = 0; i < FBNIC_MAX_QUEUES; i++) { + const struct fbnic_hw_q_stats *hw_q = &fbd->hw_stats.hw_q[i]; + + fbnic_report_hw_stats(fbnic_gstrings_hw_q_stats, hw_q, + FBNIC_HW_Q_STATS_LEN, &data); } + spin_unlock(&fbd->hw_stats_lock); } static int fbnic_get_sset_count(struct net_device *dev, int sset) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c index 89ac6bc8c7fc3..4223d8100e64e 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c @@ -70,6 +70,100 @@ static void fbnic_hw_stat_rd64(struct fbnic_dev *fbd, u32 reg, s32 offset, stat->u.old_reg_value_64 = new_reg_value; } +static void fbnic_reset_tmi_stats(struct fbnic_dev *fbd, + struct fbnic_tmi_stats *tmi) +{ + fbnic_hw_stat_rst32(fbd, FBNIC_TMI_DROP_PKTS, &tmi->drop.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_TMI_DROP_BYTE_L, 1, &tmi->drop.bytes); + + fbnic_hw_stat_rst32(fbd, + FBNIC_TMI_ILLEGAL_PTP_REQS, + &tmi->ptp_illegal_req); + fbnic_hw_stat_rst32(fbd, FBNIC_TMI_GOOD_PTP_TS, &tmi->ptp_good_ts); + fbnic_hw_stat_rst32(fbd, FBNIC_TMI_BAD_PTP_TS, &tmi->ptp_bad_ts); +} + +static void fbnic_get_tmi_stats32(struct fbnic_dev *fbd, + struct fbnic_tmi_stats *tmi) +{ + fbnic_hw_stat_rd32(fbd, FBNIC_TMI_DROP_PKTS, &tmi->drop.frames); + + fbnic_hw_stat_rd32(fbd, + FBNIC_TMI_ILLEGAL_PTP_REQS, + &tmi->ptp_illegal_req); + fbnic_hw_stat_rd32(fbd, FBNIC_TMI_GOOD_PTP_TS, &tmi->ptp_good_ts); + fbnic_hw_stat_rd32(fbd, FBNIC_TMI_BAD_PTP_TS, &tmi->ptp_bad_ts); +} + +static void fbnic_get_tmi_stats(struct fbnic_dev *fbd, + struct fbnic_tmi_stats *tmi) +{ + fbnic_hw_stat_rd64(fbd, FBNIC_TMI_DROP_BYTE_L, 1, &tmi->drop.bytes); +} + +static void fbnic_reset_tti_stats(struct fbnic_dev *fbd, + struct fbnic_tti_stats *tti) +{ + fbnic_hw_stat_rst32(fbd, + FBNIC_TCE_TTI_CM_DROP_PKTS, + &tti->cm_drop.frames); + fbnic_hw_stat_rst64(fbd, + FBNIC_TCE_TTI_CM_DROP_BYTE_L, + 1, + &tti->cm_drop.bytes); + + fbnic_hw_stat_rst32(fbd, + FBNIC_TCE_TTI_FRAME_DROP_PKTS, + &tti->frame_drop.frames); + fbnic_hw_stat_rst64(fbd, + FBNIC_TCE_TTI_FRAME_DROP_BYTE_L, + 1, + &tti->frame_drop.bytes); + + fbnic_hw_stat_rst32(fbd, + FBNIC_TCE_TBI_DROP_PKTS, + &tti->tbi_drop.frames); + fbnic_hw_stat_rst64(fbd, + FBNIC_TCE_TBI_DROP_BYTE_L, + 1, + &tti->tbi_drop.bytes); +} + +static void fbnic_get_tti_stats32(struct fbnic_dev *fbd, + struct fbnic_tti_stats *tti) +{ + fbnic_hw_stat_rd32(fbd, + FBNIC_TCE_TTI_CM_DROP_PKTS, + &tti->cm_drop.frames); + + fbnic_hw_stat_rd32(fbd, + FBNIC_TCE_TTI_FRAME_DROP_PKTS, + &tti->frame_drop.frames); + + fbnic_hw_stat_rd32(fbd, + FBNIC_TCE_TBI_DROP_PKTS, + &tti->tbi_drop.frames); +} + +static void fbnic_get_tti_stats(struct fbnic_dev *fbd, + struct fbnic_tti_stats *tti) +{ + fbnic_hw_stat_rd64(fbd, + FBNIC_TCE_TTI_CM_DROP_BYTE_L, + 1, + &tti->cm_drop.bytes); + + fbnic_hw_stat_rd64(fbd, + FBNIC_TCE_TTI_FRAME_DROP_BYTE_L, + 1, + &tti->frame_drop.bytes); + + fbnic_hw_stat_rd64(fbd, + FBNIC_TCE_TBI_DROP_BYTE_L, + 1, + &tti->tbi_drop.bytes); +} + static void fbnic_reset_rpc_stats(struct fbnic_dev *fbd, struct fbnic_rpc_stats *rpc) { @@ -117,6 +211,221 @@ static void fbnic_get_rpc_stats32(struct fbnic_dev *fbd, &rpc->ovr_size_err); } +static void fbnic_reset_rxb_fifo_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_fifo_stats *fifo) +{ + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_DROP_FRMS_STS(i), + &fifo->drop.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_RXB_DROP_BYTES_STS_L(i), 1, + &fifo->drop.bytes); + + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_TRUN_FRMS_STS(i), + &fifo->trunc.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_RXB_TRUN_BYTES_STS_L(i), 1, + &fifo->trunc.bytes); + + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_TRANS_DROP_STS(i), + &fifo->trans_drop); + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_TRANS_ECN_STS(i), + &fifo->trans_ecn); + + fifo->level.u.old_reg_value_32 = 0; +} + +static void fbnic_reset_rxb_enq_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_enqueue_stats *enq) +{ + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_DRBO_FRM_CNT_SRC(i), + &enq->drbo.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_RXB_DRBO_BYTE_CNT_SRC_L(i), 4, + &enq->drbo.bytes); + + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_INTEGRITY_ERR(i), + &enq->integrity_err); + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_MAC_ERR(i), + &enq->mac_err); + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_PARSER_ERR(i), + &enq->parser_err); + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_FRM_ERR(i), + &enq->frm_err); +} + +static void fbnic_reset_rxb_deq_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_dequeue_stats *deq) +{ + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_INTF_FRM_CNT_DST(i), + &deq->intf.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_RXB_INTF_BYTE_CNT_DST_L(i), 4, + &deq->intf.bytes); + + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_PBUF_FRM_CNT_DST(i), + &deq->pbuf.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_RXB_PBUF_BYTE_CNT_DST_L(i), 4, + &deq->pbuf.bytes); +} + +static void fbnic_reset_rxb_stats(struct fbnic_dev *fbd, + struct fbnic_rxb_stats *rxb) +{ + int i; + + for (i = 0; i < FBNIC_RXB_FIFO_INDICES; i++) + fbnic_reset_rxb_fifo_stats(fbd, i, &rxb->fifo[i]); + + for (i = 0; i < FBNIC_RXB_INTF_INDICES; i++) { + fbnic_reset_rxb_enq_stats(fbd, i, &rxb->enq[i]); + fbnic_reset_rxb_deq_stats(fbd, i, &rxb->deq[i]); + } +} + +static void fbnic_get_rxb_fifo_stats32(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_fifo_stats *fifo) +{ + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_DROP_FRMS_STS(i), + &fifo->drop.frames); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_TRUN_FRMS_STS(i), + &fifo->trunc.frames); + + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_TRANS_DROP_STS(i), + &fifo->trans_drop); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_TRANS_ECN_STS(i), + &fifo->trans_ecn); + + fifo->level.value = rd32(fbd, FBNIC_RXB_PBUF_FIFO_LEVEL(i)); +} + +static void fbnic_get_rxb_fifo_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_fifo_stats *fifo) +{ + fbnic_hw_stat_rd64(fbd, FBNIC_RXB_DROP_BYTES_STS_L(i), 1, + &fifo->drop.bytes); + fbnic_hw_stat_rd64(fbd, FBNIC_RXB_TRUN_BYTES_STS_L(i), 1, + &fifo->trunc.bytes); + + fbnic_get_rxb_fifo_stats32(fbd, i, fifo); +} + +static void fbnic_get_rxb_enq_stats32(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_enqueue_stats *enq) +{ + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_DRBO_FRM_CNT_SRC(i), + &enq->drbo.frames); + + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_INTEGRITY_ERR(i), + &enq->integrity_err); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_MAC_ERR(i), + &enq->mac_err); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_PARSER_ERR(i), + &enq->parser_err); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_FRM_ERR(i), + &enq->frm_err); +} + +static void fbnic_get_rxb_enq_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_enqueue_stats *enq) +{ + fbnic_hw_stat_rd64(fbd, FBNIC_RXB_DRBO_BYTE_CNT_SRC_L(i), 4, + &enq->drbo.bytes); + + fbnic_get_rxb_enq_stats32(fbd, i, enq); +} + +static void fbnic_get_rxb_deq_stats32(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_dequeue_stats *deq) +{ + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_INTF_FRM_CNT_DST(i), + &deq->intf.frames); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_PBUF_FRM_CNT_DST(i), + &deq->pbuf.frames); +} + +static void fbnic_get_rxb_deq_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_dequeue_stats *deq) +{ + fbnic_hw_stat_rd64(fbd, FBNIC_RXB_INTF_BYTE_CNT_DST_L(i), 4, + &deq->intf.bytes); + fbnic_hw_stat_rd64(fbd, FBNIC_RXB_PBUF_BYTE_CNT_DST_L(i), 4, + &deq->pbuf.bytes); + + fbnic_get_rxb_deq_stats32(fbd, i, deq); +} + +static void fbnic_get_rxb_stats32(struct fbnic_dev *fbd, + struct fbnic_rxb_stats *rxb) +{ + int i; + + for (i = 0; i < FBNIC_RXB_FIFO_INDICES; i++) + fbnic_get_rxb_fifo_stats32(fbd, i, &rxb->fifo[i]); + + for (i = 0; i < FBNIC_RXB_INTF_INDICES; i++) { + fbnic_get_rxb_enq_stats32(fbd, i, &rxb->enq[i]); + fbnic_get_rxb_deq_stats32(fbd, i, &rxb->deq[i]); + } +} + +static void fbnic_get_rxb_stats(struct fbnic_dev *fbd, + struct fbnic_rxb_stats *rxb) +{ + int i; + + for (i = 0; i < FBNIC_RXB_FIFO_INDICES; i++) + fbnic_get_rxb_fifo_stats(fbd, i, &rxb->fifo[i]); + + for (i = 0; i < FBNIC_RXB_INTF_INDICES; i++) { + fbnic_get_rxb_enq_stats(fbd, i, &rxb->enq[i]); + fbnic_get_rxb_deq_stats(fbd, i, &rxb->deq[i]); + } +} + +static void fbnic_reset_hw_rxq_stats(struct fbnic_dev *fbd, + struct fbnic_hw_q_stats *hw_q) +{ + int i; + + for (i = 0; i < fbd->max_num_queues; i++, hw_q++) { + u32 base = FBNIC_QUEUE(i); + + fbnic_hw_stat_rst32(fbd, + base + FBNIC_QUEUE_RDE_PKT_ERR_CNT, + &hw_q->rde_pkt_err); + fbnic_hw_stat_rst32(fbd, + base + FBNIC_QUEUE_RDE_CQ_DROP_CNT, + &hw_q->rde_pkt_cq_drop); + fbnic_hw_stat_rst32(fbd, + base + FBNIC_QUEUE_RDE_BDQ_DROP_CNT, + &hw_q->rde_pkt_bdq_drop); + } +} + +static void fbnic_get_hw_rxq_stats32(struct fbnic_dev *fbd, + struct fbnic_hw_q_stats *hw_q) +{ + int i; + + for (i = 0; i < fbd->max_num_queues; i++, hw_q++) { + u32 base = FBNIC_QUEUE(i); + + fbnic_hw_stat_rd32(fbd, + base + FBNIC_QUEUE_RDE_PKT_ERR_CNT, + &hw_q->rde_pkt_err); + fbnic_hw_stat_rd32(fbd, + base + FBNIC_QUEUE_RDE_CQ_DROP_CNT, + &hw_q->rde_pkt_cq_drop); + fbnic_hw_stat_rd32(fbd, + base + FBNIC_QUEUE_RDE_BDQ_DROP_CNT, + &hw_q->rde_pkt_bdq_drop); + } +} + +void fbnic_get_hw_q_stats(struct fbnic_dev *fbd, + struct fbnic_hw_q_stats *hw_q) +{ + spin_lock(&fbd->hw_stats_lock); + fbnic_get_hw_rxq_stats32(fbd, hw_q); + spin_unlock(&fbd->hw_stats_lock); +} + static void fbnic_reset_pcie_stats_asic(struct fbnic_dev *fbd, struct fbnic_pcie_stats *pcie) { @@ -203,18 +512,40 @@ static void fbnic_get_pcie_stats_asic64(struct fbnic_dev *fbd, void fbnic_reset_hw_stats(struct fbnic_dev *fbd) { + spin_lock(&fbd->hw_stats_lock); + fbnic_reset_tmi_stats(fbd, &fbd->hw_stats.tmi); + fbnic_reset_tti_stats(fbd, &fbd->hw_stats.tti); fbnic_reset_rpc_stats(fbd, &fbd->hw_stats.rpc); + fbnic_reset_rxb_stats(fbd, &fbd->hw_stats.rxb); + fbnic_reset_hw_rxq_stats(fbd, fbd->hw_stats.hw_q); fbnic_reset_pcie_stats_asic(fbd, &fbd->hw_stats.pcie); + spin_unlock(&fbd->hw_stats_lock); } -void fbnic_get_hw_stats32(struct fbnic_dev *fbd) +static void __fbnic_get_hw_stats32(struct fbnic_dev *fbd) { + fbnic_get_tmi_stats32(fbd, &fbd->hw_stats.tmi); + fbnic_get_tti_stats32(fbd, &fbd->hw_stats.tti); fbnic_get_rpc_stats32(fbd, &fbd->hw_stats.rpc); + fbnic_get_rxb_stats32(fbd, &fbd->hw_stats.rxb); + fbnic_get_hw_rxq_stats32(fbd, fbd->hw_stats.hw_q); +} + +void fbnic_get_hw_stats32(struct fbnic_dev *fbd) +{ + spin_lock(&fbd->hw_stats_lock); + __fbnic_get_hw_stats32(fbd); + spin_unlock(&fbd->hw_stats_lock); } void fbnic_get_hw_stats(struct fbnic_dev *fbd) { - fbnic_get_hw_stats32(fbd); + spin_lock(&fbd->hw_stats_lock); + __fbnic_get_hw_stats32(fbd); + fbnic_get_tmi_stats(fbd, &fbd->hw_stats.tmi); + fbnic_get_tti_stats(fbd, &fbd->hw_stats.tti); + fbnic_get_rxb_stats(fbd, &fbd->hw_stats.rxb); fbnic_get_pcie_stats_asic64(fbd, &fbd->hw_stats.pcie); + spin_unlock(&fbd->hw_stats_lock); } diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h index 78df56b877456..07e54bb75bf3b 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h @@ -17,6 +17,11 @@ struct fbnic_stat_counter { bool reported; }; +struct fbnic_hw_stat { + struct fbnic_stat_counter frames; + struct fbnic_stat_counter bytes; +}; + struct fbnic_eth_mac_stats { struct fbnic_stat_counter FramesTransmittedOK; struct fbnic_stat_counter FramesReceivedOK; @@ -37,12 +42,49 @@ struct fbnic_mac_stats { struct fbnic_eth_mac_stats eth_mac; }; +struct fbnic_tmi_stats { + struct fbnic_hw_stat drop; + struct fbnic_stat_counter ptp_illegal_req, ptp_good_ts, ptp_bad_ts; +}; + +struct fbnic_tti_stats { + struct fbnic_hw_stat cm_drop, frame_drop, tbi_drop; +}; + struct fbnic_rpc_stats { struct fbnic_stat_counter unkn_etype, unkn_ext_hdr; struct fbnic_stat_counter ipv4_frag, ipv6_frag, ipv4_esp, ipv6_esp; struct fbnic_stat_counter tcp_opt_err, out_of_hdr_err, ovr_size_err; }; +struct fbnic_rxb_enqueue_stats { + struct fbnic_hw_stat drbo; + struct fbnic_stat_counter integrity_err, mac_err; + struct fbnic_stat_counter parser_err, frm_err; +}; + +struct fbnic_rxb_fifo_stats { + struct fbnic_hw_stat drop, trunc; + struct fbnic_stat_counter trans_drop, trans_ecn; + struct fbnic_stat_counter level; +}; + +struct fbnic_rxb_dequeue_stats { + struct fbnic_hw_stat intf, pbuf; +}; + +struct fbnic_rxb_stats { + struct fbnic_rxb_enqueue_stats enq[FBNIC_RXB_ENQUEUE_INDICES]; + struct fbnic_rxb_fifo_stats fifo[FBNIC_RXB_FIFO_INDICES]; + struct fbnic_rxb_dequeue_stats deq[FBNIC_RXB_DEQUEUE_INDICES]; +}; + +struct fbnic_hw_q_stats { + struct fbnic_stat_counter rde_pkt_err; + struct fbnic_stat_counter rde_pkt_cq_drop; + struct fbnic_stat_counter rde_pkt_bdq_drop; +}; + struct fbnic_pcie_stats { struct fbnic_stat_counter ob_rd_tlp, ob_rd_dword; struct fbnic_stat_counter ob_wr_tlp, ob_wr_dword; @@ -55,13 +97,19 @@ struct fbnic_pcie_stats { struct fbnic_hw_stats { struct fbnic_mac_stats mac; + struct fbnic_tmi_stats tmi; + struct fbnic_tti_stats tti; struct fbnic_rpc_stats rpc; + struct fbnic_rxb_stats rxb; + struct fbnic_hw_q_stats hw_q[FBNIC_MAX_QUEUES]; struct fbnic_pcie_stats pcie; }; u64 fbnic_stat_rd64(struct fbnic_dev *fbd, u32 reg, u32 offset); void fbnic_reset_hw_stats(struct fbnic_dev *fbd); +void fbnic_get_hw_q_stats(struct fbnic_dev *fbd, + struct fbnic_hw_q_stats *hw_q); void fbnic_get_hw_stats32(struct fbnic_dev *fbd); void fbnic_get_hw_stats(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index 79a01fdd1dd16..d699f58dda211 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -403,12 +403,16 @@ static int fbnic_hwtstamp_set(struct net_device *netdev, static void fbnic_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64) { + u64 rx_bytes, rx_packets, rx_dropped = 0, rx_errors = 0; u64 tx_bytes, tx_packets, tx_dropped = 0; - u64 rx_bytes, rx_packets, rx_dropped = 0; struct fbnic_net *fbn = netdev_priv(dev); + struct fbnic_dev *fbd = fbn->fbd; struct fbnic_queue_stats *stats; + u64 rx_over = 0, rx_missed = 0; unsigned int start, i; + fbnic_get_hw_stats(fbd); + stats = &fbn->tx_stats; tx_bytes = stats->bytes; @@ -419,6 +423,12 @@ static void fbnic_get_stats64(struct net_device *dev, stats64->tx_packets = tx_packets; stats64->tx_dropped = tx_dropped; + /* Record drops from Tx HW Datapath */ + tx_dropped += fbd->hw_stats.tmi.drop.frames.value + + fbd->hw_stats.tti.frame_drop.frames.value + + fbd->hw_stats.tti.tbi_drop.frames.value + + fbd->hw_stats.tmi.drop.frames.value; + for (i = 0; i < fbn->num_tx_queues; i++) { struct fbnic_ring *txr = fbn->tx[i]; @@ -444,9 +454,34 @@ static void fbnic_get_stats64(struct net_device *dev, rx_packets = stats->packets; rx_dropped = stats->dropped; + spin_lock(&fbd->hw_stats_lock); + /* Record drops for the host FIFOs. + * 4: network to Host, 6: BMC to Host + * Exclude the BMC and MC FIFOs as those stats may contain drops + * due to unrelated items such as TCAM misses. They are still + * accessible through the ethtool stats. + */ + i = FBNIC_RXB_FIFO_HOST; + rx_missed += fbd->hw_stats.rxb.fifo[i].drop.frames.value; + i = FBNIC_RXB_FIFO_BMC_TO_HOST; + rx_missed += fbd->hw_stats.rxb.fifo[i].drop.frames.value; + + for (i = 0; i < fbd->max_num_queues; i++) { + /* Report packets dropped due to CQ/BDQ being full/empty */ + rx_over += fbd->hw_stats.hw_q[i].rde_pkt_cq_drop.value; + rx_over += fbd->hw_stats.hw_q[i].rde_pkt_bdq_drop.value; + + /* Report packets with errors */ + rx_errors += fbd->hw_stats.hw_q[i].rde_pkt_err.value; + } + spin_unlock(&fbd->hw_stats_lock); + stats64->rx_bytes = rx_bytes; stats64->rx_packets = rx_packets; stats64->rx_dropped = rx_dropped; + stats64->rx_over_errors = rx_over; + stats64->rx_errors = rx_errors; + stats64->rx_missed_errors = rx_missed; for (i = 0; i < fbn->num_rx_queues; i++) { struct fbnic_ring *rxr = fbn->rx[i]; @@ -486,6 +521,7 @@ static void fbnic_get_queue_stats_rx(struct net_device *dev, int idx, { struct fbnic_net *fbn = netdev_priv(dev); struct fbnic_ring *rxr = fbn->rx[idx]; + struct fbnic_dev *fbd = fbn->fbd; struct fbnic_queue_stats *stats; u64 bytes, packets, alloc_fail; u64 csum_complete, csum_none; @@ -509,6 +545,15 @@ static void fbnic_get_queue_stats_rx(struct net_device *dev, int idx, rx->alloc_fail = alloc_fail; rx->csum_complete = csum_complete; rx->csum_none = csum_none; + + fbnic_get_hw_q_stats(fbd, fbd->hw_stats.hw_q); + + spin_lock(&fbd->hw_stats_lock); + rx->hw_drop_overruns = fbd->hw_stats.hw_q[idx].rde_pkt_cq_drop.value + + fbd->hw_stats.hw_q[idx].rde_pkt_bdq_drop.value; + rx->hw_drops = fbd->hw_stats.hw_q[idx].rde_pkt_err.value + + rx->hw_drop_overruns; + spin_unlock(&fbd->hw_stats_lock); } static void fbnic_get_queue_stats_tx(struct net_device *dev, int idx, diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 6cbbc2ee3e1f9..1f76ebdd6ad18 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -292,6 +292,7 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) fbnic_devlink_register(fbd); fbnic_dbg_fbd_init(fbd); + spin_lock_init(&fbd->hw_stats_lock); /* Capture snapshot of hardware stats so netdev can calculate delta */ fbnic_reset_hw_stats(fbd); diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 23760b613d3ec..8b6b9b6efe18e 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -2495,8 +2495,7 @@ static int lan743x_rx_process_buffer(struct lan743x_rx *rx) /* save existing skb, allocate new skb and map to dma */ skb = buffer_info->skb; - if (lan743x_rx_init_ring_element(rx, rx->last_head, - GFP_ATOMIC | GFP_DMA)) { + if (lan743x_rx_init_ring_element(rx, rx->last_head, GFP_ATOMIC)) { /* failed to allocate next skb. * Memory is very low. * Drop this packet and reuse buffer. diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c index 0be44dcb33938..b07f5b099a2b4 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.c +++ b/drivers/net/ethernet/microchip/lan743x_ptp.c @@ -463,10 +463,6 @@ static int lan743x_ptp_perout(struct lan743x_adapter *adapter, int on, struct lan743x_ptp_perout *perout = &ptp->perout[index]; int ret = 0; - /* Reject requests with unsupported flags */ - if (perout_request->flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - if (on) { perout_pin = ptp_find_pin(ptp->ptp_clock, PTP_PF_PEROUT, perout_request->index); @@ -942,12 +938,6 @@ static int lan743x_ptp_io_extts(struct lan743x_adapter *adapter, int on, extts = &ptp->extts[index]; - if (extts_request->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - if (on) { extts_pin = ptp_find_pin(ptp->ptp_clock, PTP_PF_EXTTS, index); if (extts_pin < 0) @@ -1543,6 +1533,10 @@ int lan743x_ptp_open(struct lan743x_adapter *adapter) ptp->ptp_clock_info.n_per_out = LAN743X_PTP_N_EVENT_CHAN; ptp->ptp_clock_info.n_pins = n_pins; ptp->ptp_clock_info.pps = LAN743X_PTP_N_PPS; + ptp->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + ptp->ptp_clock_info.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; ptp->ptp_clock_info.pin_config = ptp->pin_config; ptp->ptp_clock_info.adjfine = lan743x_ptpci_adjfine; ptp->ptp_clock_info.adjtime = lan743x_ptpci_adjtime; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c index 63905bb5a63a8..098406e2e5bb2 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c @@ -815,10 +815,6 @@ static int lan966x_ptp_perout(struct ptp_clock_info *ptp, bool pps = false; int pin; - if (rq->perout.flags & ~(PTP_PEROUT_DUTY_CYCLE | - PTP_PEROUT_PHASE)) - return -EOPNOTSUPP; - pin = ptp_find_pin(phc->clock, PTP_PF_PEROUT, rq->perout.index); if (pin == -1 || pin >= LAN966X_PHC_PINS_NUM) return -EINVAL; @@ -917,12 +913,6 @@ static int lan966x_ptp_extts(struct ptp_clock_info *ptp, if (lan966x->ptp_ext_irq <= 0) return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - pin = ptp_find_pin(phc->clock, PTP_PF_EXTTS, rq->extts.index); if (pin == -1 || pin >= LAN966X_PHC_PINS_NUM) return -EINVAL; @@ -978,6 +968,10 @@ static struct ptp_clock_info lan966x_ptp_clock_info = { .n_per_out = LAN966X_PHC_PINS_NUM, .n_ext_ts = LAN966X_PHC_PINS_NUM, .n_pins = LAN966X_PHC_PINS_NUM, + .supported_extts_flags = PTP_RISING_EDGE | + PTP_STRICT_FLAGS, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE | + PTP_PEROUT_PHASE, }; static int lan966x_ptp_phc_init(struct lan966x *lan966x, diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c index cc1088988da09..d2a0a32f75ea9 100644 --- a/drivers/net/ethernet/mscc/ocelot_ptp.c +++ b/drivers/net/ethernet/mscc/ocelot_ptp.c @@ -211,11 +211,6 @@ int ocelot_ptp_enable(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_PEROUT: - /* Reject requests with unsupported flags */ - if (rq->perout.flags & ~(PTP_PEROUT_DUTY_CYCLE | - PTP_PEROUT_PHASE)) - return -EOPNOTSUPP; - pin = ptp_find_pin(ocelot->ptp_clock, PTP_PF_PEROUT, rq->perout.index); if (pin == 0) diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 055b55651a49f..498eec8ae61d8 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -108,6 +108,8 @@ static struct ptp_clock_info ocelot_ptp_clock_info = { .n_ext_ts = 0, .n_per_out = OCELOT_PTP_PINS_NUM, .n_pins = OCELOT_PTP_PINS_NUM, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE | + PTP_PEROUT_PHASE, .pps = 0, .gettime64 = ocelot_ptp_gettime64, .settime64 = ocelot_ptp_settime64, diff --git a/drivers/net/ethernet/netronome/nfp/nfd3/dp.c b/drivers/net/ethernet/netronome/nfp/nfd3/dp.c index f1c6c47564b17..08086eb76996f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfd3/dp.c +++ b/drivers/net/ethernet/netronome/nfp/nfd3/dp.c @@ -779,7 +779,7 @@ nfp_nfd3_parse_meta(struct net_device *netdev, struct nfp_meta_parsed *meta, case NFP_NET_META_CSUM: meta->csum_type = CHECKSUM_COMPLETE; meta->csum = - (__force __wsum)__get_unaligned_cpu32(data); + (__force __wsum)get_unaligned((u32 *)data); data += 4; break; case NFP_NET_META_RESYNC_INFO: diff --git a/drivers/net/ethernet/netronome/nfp/nfdk/dp.c b/drivers/net/ethernet/netronome/nfp/nfdk/dp.c index ebeb6ab4465c6..ab3cd06ed63ed 100644 --- a/drivers/net/ethernet/netronome/nfp/nfdk/dp.c +++ b/drivers/net/ethernet/netronome/nfp/nfdk/dp.c @@ -779,7 +779,7 @@ nfp_nfdk_parse_meta(struct net_device *netdev, struct nfp_meta_parsed *meta, case NFP_NET_META_CSUM: meta->csum_type = CHECKSUM_COMPLETE; meta->csum = - (__force __wsum)__get_unaligned_cpu32(data); + (__force __wsum)get_unaligned((u32 *)data); data += 4; break; case NFP_NET_META_RESYNC_INFO: diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index a2d4336d27662..92f30ff2d6316 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -948,64 +948,20 @@ static int ionic_get_tunable(struct net_device *netdev, return 0; } -static int ionic_get_module_info(struct net_device *netdev, - struct ethtool_modinfo *modinfo) - -{ - struct ionic_lif *lif = netdev_priv(netdev); - struct ionic_dev *idev = &lif->ionic->idev; - struct ionic_xcvr_status *xcvr; - struct sfp_eeprom_base *sfp; - - xcvr = &idev->port_info->status.xcvr; - sfp = (struct sfp_eeprom_base *) xcvr->sprom; - - /* report the module data type and length */ - switch (sfp->phys_id) { - case SFF8024_ID_SFP: - modinfo->type = ETH_MODULE_SFF_8079; - modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; - break; - case SFF8024_ID_QSFP_8436_8636: - case SFF8024_ID_QSFP28_8636: - case SFF8024_ID_QSFP_PLUS_CMIS: - modinfo->type = ETH_MODULE_SFF_8436; - modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; - break; - default: - netdev_info(netdev, "unknown xcvr type 0x%02x\n", - xcvr->sprom[0]); - modinfo->type = 0; - modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; - break; - } - - return 0; -} - -static int ionic_get_module_eeprom(struct net_device *netdev, - struct ethtool_eeprom *ee, - u8 *data) +static int ionic_do_module_copy(u8 *dst, u8 *src, u32 len) { - struct ionic_lif *lif = netdev_priv(netdev); - struct ionic_dev *idev = &lif->ionic->idev; - struct ionic_xcvr_status *xcvr; - char tbuf[sizeof(xcvr->sprom)]; + char tbuf[sizeof_field(struct ionic_xcvr_status, sprom)]; int count = 10; - u32 len; /* The NIC keeps the module prom up-to-date in the DMA space * so we can simply copy the module bytes into the data buffer. */ - xcvr = &idev->port_info->status.xcvr; - len = min_t(u32, sizeof(xcvr->sprom), ee->len); - do { - memcpy(data, &xcvr->sprom[ee->offset], len); - memcpy(tbuf, &xcvr->sprom[ee->offset], len); + memcpy(dst, src, len); + memcpy(tbuf, src, len); /* Let's make sure we got a consistent copy */ - if (!memcmp(data, tbuf, len)) + if (!memcmp(dst, tbuf, len)) break; } while (--count); @@ -1016,6 +972,48 @@ static int ionic_get_module_eeprom(struct net_device *netdev, return 0; } +static int ionic_get_module_eeprom_by_page(struct net_device *netdev, + const struct ethtool_module_eeprom *page_data, + struct netlink_ext_ack *extack) +{ + struct ionic_lif *lif = netdev_priv(netdev); + struct ionic_dev *idev = &lif->ionic->idev; + u32 err = -EINVAL; + u8 *src; + + if (!page_data->length) + return -EINVAL; + + if (page_data->bank != 0) { + NL_SET_ERR_MSG_MOD(extack, "Only bank 0 is supported"); + return -EINVAL; + } + + switch (page_data->page) { + case 0: + src = &idev->port_info->status.xcvr.sprom[page_data->offset]; + break; + case 1: + src = &idev->port_info->sprom_page1[page_data->offset - 128]; + break; + case 2: + src = &idev->port_info->sprom_page2[page_data->offset - 128]; + break; + case 17: + src = &idev->port_info->sprom_page17[page_data->offset - 128]; + break; + default: + return -EOPNOTSUPP; + } + + memset(page_data->data, 0, page_data->length); + err = ionic_do_module_copy(page_data->data, src, page_data->length); + if (err) + return err; + + return page_data->length; +} + static int ionic_get_ts_info(struct net_device *netdev, struct kernel_ethtool_ts_info *info) { @@ -1161,8 +1159,7 @@ static const struct ethtool_ops ionic_ethtool_ops = { .set_rxfh = ionic_set_rxfh, .get_tunable = ionic_get_tunable, .set_tunable = ionic_set_tunable, - .get_module_info = ionic_get_module_info, - .get_module_eeprom = ionic_get_module_eeprom, + .get_module_eeprom_by_page = ionic_get_module_eeprom_by_page, .get_pauseparam = ionic_get_pauseparam, .set_pauseparam = ionic_set_pauseparam, .get_fecparam = ionic_get_fecparam, diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index 830c8adbfbee0..f1ddbe9994a3b 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -2839,6 +2839,10 @@ union ionic_port_identity { * @status: Port status data * @stats: Port statistics data * @mgmt_stats: Port management statistics data + * @sprom_epage: Extended Transceiver sprom + * @sprom_page1: Extended Transceiver sprom, page 1 + * @sprom_page2: Extended Transceiver sprom, page 2 + * @sprom_page17: Extended Transceiver sprom, page 17 * @rsvd: reserved byte(s) * @pb_stats: uplink pb drop stats */ @@ -2849,8 +2853,17 @@ struct ionic_port_info { struct ionic_port_stats stats; struct ionic_mgmt_port_stats mgmt_stats; }; - /* room for pb_stats to start at 2k offset */ - u8 rsvd[760]; + union { + u8 sprom_epage[384]; + struct { + u8 sprom_page1[128]; + u8 sprom_page2[128]; + u8 sprom_page17[128]; + }; + }; + u8 rsvd[376]; + + /* pb_stats must start at 2k offset */ struct ionic_port_pb_stats pb_stats; }; diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index b7def3b549376..016b575861b9e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -939,7 +939,6 @@ u16 qed_get_cm_pq_idx_ofld_mtc(struct qed_hwfn *p_hwfn, u8 tc); u16 qed_get_cm_pq_idx_llt_mtc(struct qed_hwfn *p_hwfn, u8 tc); /* doorbell recovery mechanism */ -void qed_db_recovery_dp(struct qed_hwfn *p_hwfn); void qed_db_recovery_execute(struct qed_hwfn *p_hwfn); bool qed_edpm_enabled(struct qed_hwfn *p_hwfn); diff --git a/drivers/net/ethernet/qlogic/qed/qed_dbg_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_dbg_hsi.h index f6cd1b3efdfdf..27e91d0d39f8c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dbg_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_dbg_hsi.h @@ -1304,37 +1304,6 @@ enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn, u32 num_dumped_dwords, char *results_buf); -/** - * qed_print_mcp_trace_results_cont(): Prints MCP Trace results, and - * keeps the MCP trace meta data allocated, to support continuous MCP Trace - * parsing. After the continuous parsing ends, mcp_trace_free_meta_data should - * be called to free the meta data. - * - * @p_hwfn: HW device data. - * @dump_buf: MVP trace dump buffer, starting from the header. - * @results_buf: Buffer for printing the mcp trace results. - * - * Return: Error if the parsing fails, ok otherwise. - */ -enum dbg_status qed_print_mcp_trace_results_cont(struct qed_hwfn *p_hwfn, - u32 *dump_buf, - char *results_buf); - -/** - * qed_print_mcp_trace_line(): Prints MCP Trace results for a single line - * - * @p_hwfn: HW device data. - * @dump_buf: MCP trace dump buffer, starting from the header. - * @num_dumped_bytes: Number of bytes that were dumped. - * @results_buf: Buffer for printing the mcp trace results. - * - * Return: Error if the parsing fails, ok otherwise. - */ -enum dbg_status qed_print_mcp_trace_line(struct qed_hwfn *p_hwfn, - u8 *dump_buf, - u32 num_dumped_bytes, - char *results_buf); - /** * qed_mcp_trace_free_meta_data(): Frees the MCP Trace meta data. * Should be called after continuous MCP Trace parsing. diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 464a72afb7589..9c3d3dd2f8475 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -7614,31 +7614,6 @@ enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn, results_buf, &parsed_buf_size, true); } -enum dbg_status qed_print_mcp_trace_results_cont(struct qed_hwfn *p_hwfn, - u32 *dump_buf, - char *results_buf) -{ - u32 parsed_buf_size; - - return qed_parse_mcp_trace_dump(p_hwfn, dump_buf, results_buf, - &parsed_buf_size, false); -} - -enum dbg_status qed_print_mcp_trace_line(struct qed_hwfn *p_hwfn, - u8 *dump_buf, - u32 num_dumped_bytes, - char *results_buf) -{ - u32 parsed_results_bytes; - - return qed_parse_mcp_trace_buf(p_hwfn, - dump_buf, - num_dumped_bytes, - 0, - num_dumped_bytes, - results_buf, &parsed_results_bytes); -} - /* Frees the specified MCP Trace meta data */ void qed_mcp_trace_free_meta_data(struct qed_hwfn *p_hwfn) { diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 86a93cac26470..9659ce5b07125 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -255,25 +255,6 @@ static void qed_db_recovery_teardown(struct qed_hwfn *p_hwfn) p_hwfn->db_recovery_info.db_recovery_counter = 0; } -/* Print the content of the doorbell recovery mechanism */ -void qed_db_recovery_dp(struct qed_hwfn *p_hwfn) -{ - struct qed_db_recovery_entry *db_entry = NULL; - - DP_NOTICE(p_hwfn, - "Displaying doorbell recovery database. Counter was %d\n", - p_hwfn->db_recovery_info.db_recovery_counter); - - /* Protect the list */ - spin_lock_bh(&p_hwfn->db_recovery_info.lock); - list_for_each_entry(db_entry, - &p_hwfn->db_recovery_info.list, list_entry) { - qed_db_recovery_dp_entry(p_hwfn, db_entry, "Printing"); - } - - spin_unlock_bh(&p_hwfn->db_recovery_info.lock); -} - /* Ring the doorbell of a single doorbell recovery entry */ static void qed_db_recovery_ring(struct qed_hwfn *p_hwfn, struct qed_db_recovery_entry *db_entry) diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index ed1a84542ad2d..10e355397ceeb 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -2666,58 +2666,6 @@ void qed_gft_config(struct qed_hwfn *p_hwfn, void qed_enable_context_validation(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); -/** - * qed_calc_session_ctx_validation(): Calcualte validation byte for - * session context. - * - * @p_ctx_mem: Pointer to context memory. - * @ctx_size: Context size. - * @ctx_type: Context type. - * @cid: Context cid. - * - * Return: Void. - */ -void qed_calc_session_ctx_validation(void *p_ctx_mem, - u16 ctx_size, u8 ctx_type, u32 cid); - -/** - * qed_calc_task_ctx_validation(): Calcualte validation byte for task - * context. - * - * @p_ctx_mem: Pointer to context memory. - * @ctx_size: Context size. - * @ctx_type: Context type. - * @tid: Context tid. - * - * Return: Void. - */ -void qed_calc_task_ctx_validation(void *p_ctx_mem, - u16 ctx_size, u8 ctx_type, u32 tid); - -/** - * qed_memset_session_ctx(): Memset session context to 0 while - * preserving validation bytes. - * - * @p_ctx_mem: Pointer to context memory. - * @ctx_size: Size to initialzie. - * @ctx_type: Context type. - * - * Return: Void. - */ -void qed_memset_session_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type); - -/** - * qed_memset_task_ctx(): Memset task context to 0 while preserving - * validation bytes. - * - * @p_ctx_mem: Pointer to context memory. - * @ctx_size: size to initialzie. - * @ctx_type: context type. - * - * Return: Void. - */ -void qed_memset_task_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type); - #define NUM_STORMS 6 /** diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c index 9e5f0dbc8a076..9907973399dc4 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hw.c +++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c @@ -69,17 +69,6 @@ int qed_ptt_pool_alloc(struct qed_hwfn *p_hwfn) return 0; } -void qed_ptt_invalidate(struct qed_hwfn *p_hwfn) -{ - struct qed_ptt *p_ptt; - int i; - - for (i = 0; i < PXP_EXTERNAL_BAR_PF_WINDOW_NUM; i++) { - p_ptt = &p_hwfn->p_ptt_pool->ptts[i]; - p_ptt->pxp.offset = QED_BAR_INVALID_OFFSET; - } -} - void qed_ptt_pool_free(struct qed_hwfn *p_hwfn) { kfree(p_hwfn->p_ptt_pool); diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.h b/drivers/net/ethernet/qlogic/qed/qed_hw.h index e535983ce21bb..3c98f58a184fa 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hw.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hw.h @@ -61,15 +61,6 @@ enum _dmae_cmd_crc_mask { */ void qed_gtt_init(struct qed_hwfn *p_hwfn); -/** - * qed_ptt_invalidate(): Forces all ptt entries to be re-configured - * - * @p_hwfn: HW device data. - * - * Return: Void. - */ -void qed_ptt_invalidate(struct qed_hwfn *p_hwfn); - /** * qed_ptt_pool_alloc(): Allocate and initialize PTT pool. * diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c index 407029a36fa14..aa20bb8caa9a9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c +++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c @@ -18,16 +18,6 @@ #define CDU_VALIDATION_DEFAULT_CFG CDU_CONTEXT_VALIDATION_DEFAULT_CFG -static u16 con_region_offsets[3][NUM_OF_CONNECTION_TYPES] = { - {400, 336, 352, 368, 304, 384, 416, 352}, /* region 3 offsets */ - {528, 496, 416, 512, 448, 512, 544, 480}, /* region 4 offsets */ - {608, 544, 496, 576, 576, 592, 624, 560} /* region 5 offsets */ -}; - -static u16 task_region_offsets[1][NUM_OF_CONNECTION_TYPES] = { - {240, 240, 112, 0, 0, 0, 0, 96} /* region 1 offsets */ -}; - /* General constants */ #define QM_PQ_MEM_4KB(pq_size) (pq_size ? DIV_ROUND_UP((pq_size + 1) * \ QM_PQ_ELEMENT_SIZE, \ @@ -1576,134 +1566,6 @@ void qed_gft_config(struct qed_hwfn *p_hwfn, qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_GFT, 1); } -DECLARE_CRC8_TABLE(cdu_crc8_table); - -/* Calculate and return CDU validation byte per connection type/region/cid */ -static u8 qed_calc_cdu_validation_byte(u8 conn_type, u8 region, u32 cid) -{ - const u8 validation_cfg = CDU_VALIDATION_DEFAULT_CFG; - u8 crc, validation_byte = 0; - static u8 crc8_table_valid; /* automatically initialized to 0 */ - u32 validation_string = 0; - __be32 data_to_crc; - - if (!crc8_table_valid) { - crc8_populate_msb(cdu_crc8_table, 0x07); - crc8_table_valid = 1; - } - - /* The CRC is calculated on the String-to-compress: - * [31:8] = {CID[31:20],CID[11:0]} - * [7:4] = Region - * [3:0] = Type - */ - if ((validation_cfg >> CDU_CONTEXT_VALIDATION_CFG_USE_CID) & 1) - validation_string |= (cid & 0xFFF00000) | ((cid & 0xFFF) << 8); - - if ((validation_cfg >> CDU_CONTEXT_VALIDATION_CFG_USE_REGION) & 1) - validation_string |= ((region & 0xF) << 4); - - if ((validation_cfg >> CDU_CONTEXT_VALIDATION_CFG_USE_TYPE) & 1) - validation_string |= (conn_type & 0xF); - - /* Convert to big-endian and calculate CRC8 */ - data_to_crc = cpu_to_be32(validation_string); - crc = crc8(cdu_crc8_table, (u8 *)&data_to_crc, sizeof(data_to_crc), - CRC8_INIT_VALUE); - - /* The validation byte [7:0] is composed: - * for type A validation - * [7] = active configuration bit - * [6:0] = crc[6:0] - * - * for type B validation - * [7] = active configuration bit - * [6:3] = connection_type[3:0] - * [2:0] = crc[2:0] - */ - validation_byte |= - ((validation_cfg >> - CDU_CONTEXT_VALIDATION_CFG_USE_ACTIVE) & 1) << 7; - - if ((validation_cfg >> - CDU_CONTEXT_VALIDATION_CFG_VALIDATION_TYPE_SHIFT) & 1) - validation_byte |= ((conn_type & 0xF) << 3) | (crc & 0x7); - else - validation_byte |= crc & 0x7F; - - return validation_byte; -} - -/* Calcualte and set validation bytes for session context */ -void qed_calc_session_ctx_validation(void *p_ctx_mem, - u16 ctx_size, u8 ctx_type, u32 cid) -{ - u8 *x_val_ptr, *t_val_ptr, *u_val_ptr, *p_ctx; - - p_ctx = (u8 * const)p_ctx_mem; - x_val_ptr = &p_ctx[con_region_offsets[0][ctx_type]]; - t_val_ptr = &p_ctx[con_region_offsets[1][ctx_type]]; - u_val_ptr = &p_ctx[con_region_offsets[2][ctx_type]]; - - memset(p_ctx, 0, ctx_size); - - *x_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 3, cid); - *t_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 4, cid); - *u_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 5, cid); -} - -/* Calcualte and set validation bytes for task context */ -void qed_calc_task_ctx_validation(void *p_ctx_mem, - u16 ctx_size, u8 ctx_type, u32 tid) -{ - u8 *p_ctx, *region1_val_ptr; - - p_ctx = (u8 * const)p_ctx_mem; - region1_val_ptr = &p_ctx[task_region_offsets[0][ctx_type]]; - - memset(p_ctx, 0, ctx_size); - - *region1_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 1, tid); -} - -/* Memset session context to 0 while preserving validation bytes */ -void qed_memset_session_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type) -{ - u8 *x_val_ptr, *t_val_ptr, *u_val_ptr, *p_ctx; - u8 x_val, t_val, u_val; - - p_ctx = (u8 * const)p_ctx_mem; - x_val_ptr = &p_ctx[con_region_offsets[0][ctx_type]]; - t_val_ptr = &p_ctx[con_region_offsets[1][ctx_type]]; - u_val_ptr = &p_ctx[con_region_offsets[2][ctx_type]]; - - x_val = *x_val_ptr; - t_val = *t_val_ptr; - u_val = *u_val_ptr; - - memset(p_ctx, 0, ctx_size); - - *x_val_ptr = x_val; - *t_val_ptr = t_val; - *u_val_ptr = u_val; -} - -/* Memset task context to 0 while preserving validation bytes */ -void qed_memset_task_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type) -{ - u8 *p_ctx, *region1_val_ptr; - u8 region1_val; - - p_ctx = (u8 * const)p_ctx_mem; - region1_val_ptr = &p_ctx[task_region_offsets[0][ctx_type]]; - - region1_val = *region1_val_ptr; - - memset(p_ctx, 0, ctx_size); - - *region1_val_ptr = region1_val; -} - /* Enable and configure context validation */ void qed_enable_context_validation(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index 7a194a8ab989f..9f784840ea0d2 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -73,7 +73,8 @@ enum mac_version { RTL_GIGA_MAC_VER_66, RTL_GIGA_MAC_VER_70, RTL_GIGA_MAC_VER_71, - RTL_GIGA_MAC_NONE + RTL_GIGA_MAC_NONE, + RTL_GIGA_MAC_VER_LAST = RTL_GIGA_MAC_NONE - 1 }; struct rtl8169_private; diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4eebd9cb40a37..3ef7126f8ec50 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -91,61 +91,114 @@ #define JUMBO_9K (9 * SZ_1K - VLAN_ETH_HLEN - ETH_FCS_LEN) #define JUMBO_16K (SZ_16K - VLAN_ETH_HLEN - ETH_FCS_LEN) -static const struct { +static const struct rtl_chip_info { + u16 mask; + u16 val; + enum mac_version mac_version; const char *name; const char *fw_name; } rtl_chip_infos[] = { - /* PCI devices. */ - [RTL_GIGA_MAC_VER_02] = {"RTL8169s" }, - [RTL_GIGA_MAC_VER_03] = {"RTL8110s" }, - [RTL_GIGA_MAC_VER_04] = {"RTL8169sb/8110sb" }, - [RTL_GIGA_MAC_VER_05] = {"RTL8169sc/8110sc" }, - [RTL_GIGA_MAC_VER_06] = {"RTL8169sc/8110sc" }, - /* PCI-E devices. */ - [RTL_GIGA_MAC_VER_07] = {"RTL8102e" }, - [RTL_GIGA_MAC_VER_08] = {"RTL8102e" }, - [RTL_GIGA_MAC_VER_09] = {"RTL8102e/RTL8103e" }, - [RTL_GIGA_MAC_VER_10] = {"RTL8101e/RTL8100e" }, - [RTL_GIGA_MAC_VER_14] = {"RTL8401" }, - [RTL_GIGA_MAC_VER_17] = {"RTL8168b/8111b" }, - [RTL_GIGA_MAC_VER_18] = {"RTL8168cp/8111cp" }, - [RTL_GIGA_MAC_VER_19] = {"RTL8168c/8111c" }, - [RTL_GIGA_MAC_VER_20] = {"RTL8168c/8111c" }, - [RTL_GIGA_MAC_VER_21] = {"RTL8168c/8111c" }, - [RTL_GIGA_MAC_VER_22] = {"RTL8168c/8111c" }, - [RTL_GIGA_MAC_VER_23] = {"RTL8168cp/8111cp" }, - [RTL_GIGA_MAC_VER_24] = {"RTL8168cp/8111cp" }, - [RTL_GIGA_MAC_VER_25] = {"RTL8168d/8111d", FIRMWARE_8168D_1}, - [RTL_GIGA_MAC_VER_26] = {"RTL8168d/8111d", FIRMWARE_8168D_2}, - [RTL_GIGA_MAC_VER_28] = {"RTL8168dp/8111dp" }, - [RTL_GIGA_MAC_VER_29] = {"RTL8105e", FIRMWARE_8105E_1}, - [RTL_GIGA_MAC_VER_30] = {"RTL8105e", FIRMWARE_8105E_1}, - [RTL_GIGA_MAC_VER_31] = {"RTL8168dp/8111dp" }, - [RTL_GIGA_MAC_VER_32] = {"RTL8168e/8111e", FIRMWARE_8168E_1}, - [RTL_GIGA_MAC_VER_33] = {"RTL8168e/8111e", FIRMWARE_8168E_2}, - [RTL_GIGA_MAC_VER_34] = {"RTL8168evl/8111evl", FIRMWARE_8168E_3}, - [RTL_GIGA_MAC_VER_35] = {"RTL8168f/8111f", FIRMWARE_8168F_1}, - [RTL_GIGA_MAC_VER_36] = {"RTL8168f/8111f", FIRMWARE_8168F_2}, - [RTL_GIGA_MAC_VER_37] = {"RTL8402", FIRMWARE_8402_1 }, - [RTL_GIGA_MAC_VER_38] = {"RTL8411", FIRMWARE_8411_1 }, - [RTL_GIGA_MAC_VER_39] = {"RTL8106e", FIRMWARE_8106E_1}, - [RTL_GIGA_MAC_VER_40] = {"RTL8168g/8111g", FIRMWARE_8168G_2}, - [RTL_GIGA_MAC_VER_42] = {"RTL8168gu/8111gu", FIRMWARE_8168G_3}, - [RTL_GIGA_MAC_VER_43] = {"RTL8106eus", FIRMWARE_8106E_2}, - [RTL_GIGA_MAC_VER_44] = {"RTL8411b", FIRMWARE_8411_2 }, - [RTL_GIGA_MAC_VER_46] = {"RTL8168h/8111h", FIRMWARE_8168H_2}, - [RTL_GIGA_MAC_VER_48] = {"RTL8107e", FIRMWARE_8107E_2}, - [RTL_GIGA_MAC_VER_51] = {"RTL8168ep/8111ep" }, - [RTL_GIGA_MAC_VER_52] = {"RTL8168fp/RTL8117", FIRMWARE_8168FP_3}, - [RTL_GIGA_MAC_VER_53] = {"RTL8168fp/RTL8117", }, - [RTL_GIGA_MAC_VER_61] = {"RTL8125A", FIRMWARE_8125A_3}, - /* reserve 62 for CFG_METHOD_4 in the vendor driver */ - [RTL_GIGA_MAC_VER_63] = {"RTL8125B", FIRMWARE_8125B_2}, - [RTL_GIGA_MAC_VER_64] = {"RTL8125D", FIRMWARE_8125D_1}, - [RTL_GIGA_MAC_VER_65] = {"RTL8125D", FIRMWARE_8125D_2}, - [RTL_GIGA_MAC_VER_66] = {"RTL8125BP", FIRMWARE_8125BP_2}, - [RTL_GIGA_MAC_VER_70] = {"RTL8126A", FIRMWARE_8126A_2}, - [RTL_GIGA_MAC_VER_71] = {"RTL8126A", FIRMWARE_8126A_3}, + /* 8126A family. */ + { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_71, "RTL8126A", FIRMWARE_8126A_3 }, + { 0x7cf, 0x649, RTL_GIGA_MAC_VER_70, "RTL8126A", FIRMWARE_8126A_2 }, + + /* 8125BP family. */ + { 0x7cf, 0x681, RTL_GIGA_MAC_VER_66, "RTL8125BP", FIRMWARE_8125BP_2 }, + + /* 8125D family. */ + { 0x7cf, 0x689, RTL_GIGA_MAC_VER_65, "RTL8125D", FIRMWARE_8125D_2 }, + { 0x7cf, 0x688, RTL_GIGA_MAC_VER_64, "RTL8125D", FIRMWARE_8125D_1 }, + + /* 8125B family. */ + { 0x7cf, 0x641, RTL_GIGA_MAC_VER_63, "RTL8125B", FIRMWARE_8125B_2 }, + + /* 8125A family. */ + { 0x7cf, 0x609, RTL_GIGA_MAC_VER_61, "RTL8125A", FIRMWARE_8125A_3 }, + + /* RTL8117 */ + { 0x7cf, 0x54b, RTL_GIGA_MAC_VER_53, "RTL8168fp/RTL8117" }, + { 0x7cf, 0x54a, RTL_GIGA_MAC_VER_52, "RTL8168fp/RTL8117", + FIRMWARE_8168FP_3 }, + + /* 8168EP family. */ + { 0x7cf, 0x502, RTL_GIGA_MAC_VER_51, "RTL8168ep/8111ep" }, + + /* 8168H family. */ + { 0x7cf, 0x541, RTL_GIGA_MAC_VER_46, "RTL8168h/8111h", + FIRMWARE_8168H_2 }, + /* Realtek calls it RTL8168M, but it's handled like RTL8168H */ + { 0x7cf, 0x6c0, RTL_GIGA_MAC_VER_46, "RTL8168M", FIRMWARE_8168H_2 }, + + /* 8168G family. */ + { 0x7cf, 0x5c8, RTL_GIGA_MAC_VER_44, "RTL8411b", FIRMWARE_8411_2 }, + { 0x7cf, 0x509, RTL_GIGA_MAC_VER_42, "RTL8168gu/8111gu", + FIRMWARE_8168G_3 }, + { 0x7cf, 0x4c0, RTL_GIGA_MAC_VER_40, "RTL8168g/8111g", + FIRMWARE_8168G_2 }, + + /* 8168F family. */ + { 0x7c8, 0x488, RTL_GIGA_MAC_VER_38, "RTL8411", FIRMWARE_8411_1 }, + { 0x7cf, 0x481, RTL_GIGA_MAC_VER_36, "RTL8168f/8111f", + FIRMWARE_8168F_2 }, + { 0x7cf, 0x480, RTL_GIGA_MAC_VER_35, "RTL8168f/8111f", + FIRMWARE_8168F_1 }, + + /* 8168E family. */ + { 0x7c8, 0x2c8, RTL_GIGA_MAC_VER_34, "RTL8168evl/8111evl", + FIRMWARE_8168E_3 }, + { 0x7cf, 0x2c1, RTL_GIGA_MAC_VER_32, "RTL8168e/8111e", + FIRMWARE_8168E_1 }, + { 0x7c8, 0x2c0, RTL_GIGA_MAC_VER_33, "RTL8168e/8111e", + FIRMWARE_8168E_2 }, + + /* 8168D family. */ + { 0x7cf, 0x281, RTL_GIGA_MAC_VER_25, "RTL8168d/8111d", + FIRMWARE_8168D_1 }, + { 0x7c8, 0x280, RTL_GIGA_MAC_VER_26, "RTL8168d/8111d", + FIRMWARE_8168D_2 }, + + /* 8168DP family. */ + { 0x7cf, 0x28a, RTL_GIGA_MAC_VER_28, "RTL8168dp/8111dp" }, + { 0x7cf, 0x28b, RTL_GIGA_MAC_VER_31, "RTL8168dp/8111dp" }, + + /* 8168C family. */ + { 0x7cf, 0x3c9, RTL_GIGA_MAC_VER_23, "RTL8168cp/8111cp" }, + { 0x7cf, 0x3c8, RTL_GIGA_MAC_VER_18, "RTL8168cp/8111cp" }, + { 0x7c8, 0x3c8, RTL_GIGA_MAC_VER_24, "RTL8168cp/8111cp" }, + { 0x7cf, 0x3c0, RTL_GIGA_MAC_VER_19, "RTL8168c/8111c" }, + { 0x7cf, 0x3c2, RTL_GIGA_MAC_VER_20, "RTL8168c/8111c" }, + { 0x7cf, 0x3c3, RTL_GIGA_MAC_VER_21, "RTL8168c/8111c" }, + { 0x7c8, 0x3c0, RTL_GIGA_MAC_VER_22, "RTL8168c/8111c" }, + + /* 8168B family. */ + { 0x7c8, 0x380, RTL_GIGA_MAC_VER_17, "RTL8168b/8111b" }, + /* This one is very old and rare, support has been removed. + * { 0x7c8, 0x300, RTL_GIGA_MAC_VER_11, "RTL8168b/8111b" }, + */ + + /* 8101 family. */ + { 0x7c8, 0x448, RTL_GIGA_MAC_VER_39, "RTL8106e", FIRMWARE_8106E_1 }, + { 0x7c8, 0x440, RTL_GIGA_MAC_VER_37, "RTL8402", FIRMWARE_8402_1 }, + { 0x7cf, 0x409, RTL_GIGA_MAC_VER_29, "RTL8105e", FIRMWARE_8105E_1 }, + { 0x7c8, 0x408, RTL_GIGA_MAC_VER_30, "RTL8105e", FIRMWARE_8105E_1 }, + { 0x7cf, 0x349, RTL_GIGA_MAC_VER_08, "RTL8102e" }, + { 0x7cf, 0x249, RTL_GIGA_MAC_VER_08, "RTL8102e" }, + { 0x7cf, 0x348, RTL_GIGA_MAC_VER_07, "RTL8102e" }, + { 0x7cf, 0x248, RTL_GIGA_MAC_VER_07, "RTL8102e" }, + { 0x7cf, 0x240, RTL_GIGA_MAC_VER_14, "RTL8401" }, + { 0x7c8, 0x348, RTL_GIGA_MAC_VER_09, "RTL8102e/RTL8103e" }, + { 0x7c8, 0x248, RTL_GIGA_MAC_VER_09, "RTL8102e/RTL8103e" }, + { 0x7c8, 0x340, RTL_GIGA_MAC_VER_10, "RTL8101e/RTL8100e" }, + + /* 8110 family. */ + { 0xfc8, 0x980, RTL_GIGA_MAC_VER_06, "RTL8169sc/8110sc" }, + { 0xfc8, 0x180, RTL_GIGA_MAC_VER_05, "RTL8169sc/8110sc" }, + { 0xfc8, 0x100, RTL_GIGA_MAC_VER_04, "RTL8169sb/8110sb" }, + { 0xfc8, 0x040, RTL_GIGA_MAC_VER_03, "RTL8110s" }, + { 0xfc8, 0x008, RTL_GIGA_MAC_VER_02, "RTL8169s" }, + + /* Catch-all */ + { 0x000, 0x000, RTL_GIGA_MAC_NONE } }; static const struct pci_device_id rtl8169_pci_tbl[] = { @@ -1236,7 +1289,7 @@ static void rtl_writephy(struct rtl8169_private *tp, int location, int val) case RTL_GIGA_MAC_VER_31: r8168dp_2_mdio_write(tp, location, val); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_LAST: r8168g_mdio_write(tp, location, val); break; default: @@ -1251,7 +1304,7 @@ static int rtl_readphy(struct rtl8169_private *tp, int location) case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: return r8168dp_2_mdio_read(tp, location); - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_LAST: return r8168g_mdio_read(tp, location); default: return r8169_mdio_read(tp, location); @@ -1603,7 +1656,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) break; case RTL_GIGA_MAC_VER_34: case RTL_GIGA_MAC_VER_37: - case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_LAST: r8169_mod_reg8_cond(tp, Config2, PME_SIGNAL, wolopts); break; default: @@ -2076,7 +2129,7 @@ static void rtl_set_eee_txidle_timer(struct rtl8169_private *tp) tp->tx_lpi_timer = timer_val; r8168_mac_ocp_write(tp, 0xe048, timer_val); break; - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: tp->tx_lpi_timer = timer_val; RTL_W16(tp, EEE_TXIDLE_TIMER_8125, timer_val); break; @@ -2265,151 +2318,30 @@ static const struct ethtool_ops rtl8169_ethtool_ops = { .get_eth_ctrl_stats = rtl8169_get_eth_ctrl_stats, }; -static enum mac_version rtl8169_get_mac_version(u16 xid, bool gmii) +static const struct rtl_chip_info *rtl8169_get_chip_version(u16 xid, bool gmii) { - /* - * The driver currently handles the 8168Bf and the 8168Be identically - * but they can be identified more specifically through the test below - * if needed: - * - * (RTL_R32(tp, TxConfig) & 0x700000) == 0x500000 ? 8168Bf : 8168Be - * - * Same thing for the 8101Eb and the 8101Ec: - * - * (RTL_R32(tp, TxConfig) & 0x700000) == 0x200000 ? 8101Eb : 8101Ec - */ - static const struct rtl_mac_info { - u16 mask; - u16 val; - enum mac_version ver; - } mac_info[] = { - /* 8126A family. */ - { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_71 }, - { 0x7cf, 0x649, RTL_GIGA_MAC_VER_70 }, - - /* 8125BP family. */ - { 0x7cf, 0x681, RTL_GIGA_MAC_VER_66 }, - - /* 8125D family. */ - { 0x7cf, 0x689, RTL_GIGA_MAC_VER_65 }, - { 0x7cf, 0x688, RTL_GIGA_MAC_VER_64 }, - - /* 8125B family. */ - { 0x7cf, 0x641, RTL_GIGA_MAC_VER_63 }, - - /* 8125A family. */ - { 0x7cf, 0x609, RTL_GIGA_MAC_VER_61 }, - /* It seems only XID 609 made it to the mass market. - * { 0x7cf, 0x608, RTL_GIGA_MAC_VER_60 }, - * { 0x7c8, 0x608, RTL_GIGA_MAC_VER_61 }, - */ - - /* RTL8117 */ - { 0x7cf, 0x54b, RTL_GIGA_MAC_VER_53 }, - { 0x7cf, 0x54a, RTL_GIGA_MAC_VER_52 }, - - /* 8168EP family. */ - { 0x7cf, 0x502, RTL_GIGA_MAC_VER_51 }, - /* It seems this chip version never made it to - * the wild. Let's disable detection. - * { 0x7cf, 0x501, RTL_GIGA_MAC_VER_50 }, - * { 0x7cf, 0x500, RTL_GIGA_MAC_VER_49 }, - */ - - /* 8168H family. */ - { 0x7cf, 0x541, RTL_GIGA_MAC_VER_46 }, - /* It seems this chip version never made it to - * the wild. Let's disable detection. - * { 0x7cf, 0x540, RTL_GIGA_MAC_VER_45 }, - */ - /* Realtek calls it RTL8168M, but it's handled like RTL8168H */ - { 0x7cf, 0x6c0, RTL_GIGA_MAC_VER_46 }, - - /* 8168G family. */ - { 0x7cf, 0x5c8, RTL_GIGA_MAC_VER_44 }, - { 0x7cf, 0x509, RTL_GIGA_MAC_VER_42 }, - /* It seems this chip version never made it to - * the wild. Let's disable detection. - * { 0x7cf, 0x4c1, RTL_GIGA_MAC_VER_41 }, - */ - { 0x7cf, 0x4c0, RTL_GIGA_MAC_VER_40 }, - - /* 8168F family. */ - { 0x7c8, 0x488, RTL_GIGA_MAC_VER_38 }, - { 0x7cf, 0x481, RTL_GIGA_MAC_VER_36 }, - { 0x7cf, 0x480, RTL_GIGA_MAC_VER_35 }, - - /* 8168E family. */ - { 0x7c8, 0x2c8, RTL_GIGA_MAC_VER_34 }, - { 0x7cf, 0x2c1, RTL_GIGA_MAC_VER_32 }, - { 0x7c8, 0x2c0, RTL_GIGA_MAC_VER_33 }, - - /* 8168D family. */ - { 0x7cf, 0x281, RTL_GIGA_MAC_VER_25 }, - { 0x7c8, 0x280, RTL_GIGA_MAC_VER_26 }, - - /* 8168DP family. */ - /* It seems this early RTL8168dp version never made it to - * the wild. Support has been removed. - * { 0x7cf, 0x288, RTL_GIGA_MAC_VER_27 }, - */ - { 0x7cf, 0x28a, RTL_GIGA_MAC_VER_28 }, - { 0x7cf, 0x28b, RTL_GIGA_MAC_VER_31 }, - - /* 8168C family. */ - { 0x7cf, 0x3c9, RTL_GIGA_MAC_VER_23 }, - { 0x7cf, 0x3c8, RTL_GIGA_MAC_VER_18 }, - { 0x7c8, 0x3c8, RTL_GIGA_MAC_VER_24 }, - { 0x7cf, 0x3c0, RTL_GIGA_MAC_VER_19 }, - { 0x7cf, 0x3c2, RTL_GIGA_MAC_VER_20 }, - { 0x7cf, 0x3c3, RTL_GIGA_MAC_VER_21 }, - { 0x7c8, 0x3c0, RTL_GIGA_MAC_VER_22 }, - - /* 8168B family. */ - { 0x7c8, 0x380, RTL_GIGA_MAC_VER_17 }, - /* This one is very old and rare, support has been removed. - * { 0x7c8, 0x300, RTL_GIGA_MAC_VER_11 }, - */ - - /* 8101 family. */ - { 0x7c8, 0x448, RTL_GIGA_MAC_VER_39 }, - { 0x7c8, 0x440, RTL_GIGA_MAC_VER_37 }, - { 0x7cf, 0x409, RTL_GIGA_MAC_VER_29 }, - { 0x7c8, 0x408, RTL_GIGA_MAC_VER_30 }, - { 0x7cf, 0x349, RTL_GIGA_MAC_VER_08 }, - { 0x7cf, 0x249, RTL_GIGA_MAC_VER_08 }, - { 0x7cf, 0x348, RTL_GIGA_MAC_VER_07 }, - { 0x7cf, 0x248, RTL_GIGA_MAC_VER_07 }, - { 0x7cf, 0x240, RTL_GIGA_MAC_VER_14 }, - { 0x7c8, 0x348, RTL_GIGA_MAC_VER_09 }, - { 0x7c8, 0x248, RTL_GIGA_MAC_VER_09 }, - { 0x7c8, 0x340, RTL_GIGA_MAC_VER_10 }, - - /* 8110 family. */ - { 0xfc8, 0x980, RTL_GIGA_MAC_VER_06 }, - { 0xfc8, 0x180, RTL_GIGA_MAC_VER_05 }, - { 0xfc8, 0x100, RTL_GIGA_MAC_VER_04 }, - { 0xfc8, 0x040, RTL_GIGA_MAC_VER_03 }, - { 0xfc8, 0x008, RTL_GIGA_MAC_VER_02 }, - - /* Catch-all */ - { 0x000, 0x000, RTL_GIGA_MAC_NONE } + /* Chips combining a 1Gbps MAC with a 100Mbps PHY */ + static const struct rtl_chip_info rtl8106eus_info = { + .mac_version = RTL_GIGA_MAC_VER_43, + .name = "RTL8106eus", + .fw_name = FIRMWARE_8106E_2, + }; + static const struct rtl_chip_info rtl8107e_info = { + .mac_version = RTL_GIGA_MAC_VER_48, + .name = "RTL8107e", + .fw_name = FIRMWARE_8107E_2, }; - const struct rtl_mac_info *p = mac_info; - enum mac_version ver; + const struct rtl_chip_info *p = rtl_chip_infos; while ((xid & p->mask) != p->val) p++; - ver = p->ver; - if (ver != RTL_GIGA_MAC_NONE && !gmii) { - if (ver == RTL_GIGA_MAC_VER_42) - ver = RTL_GIGA_MAC_VER_43; - else if (ver == RTL_GIGA_MAC_VER_46) - ver = RTL_GIGA_MAC_VER_48; - } + if (p->mac_version == RTL_GIGA_MAC_VER_42 && !gmii) + return &rtl8106eus_info; + if (p->mac_version == RTL_GIGA_MAC_VER_46 && !gmii) + return &rtl8107e_info; - return ver; + return p; } static void rtl_release_firmware(struct rtl8169_private *tp) @@ -2559,7 +2491,7 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_61: RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST); break; - case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_LAST: RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST | RX_PAUSE_SLOT_ON); break; @@ -2691,7 +2623,7 @@ static void rtl_wait_txrx_fifo_empty(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_61: rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42); break; - case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_LAST: RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42); rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond_2, 100, 42); @@ -2852,10 +2784,23 @@ static u32 rtl_csi_read(struct rtl8169_private *tp, int addr) RTL_R32(tp, CSIDR) : ~0; } +static void rtl_csi_mod(struct rtl8169_private *tp, int addr, + u32 mask, u32 set) +{ + u32 val; + + WARN(addr % 4, "Invalid CSI address %#x\n", addr); + + netdev_notice_once(tp->dev, + "No native access to PCI extended config space, falling back to CSI\n"); + + val = rtl_csi_read(tp, addr); + rtl_csi_write(tp, addr, (val & ~mask) | set); +} + static void rtl_disable_zrxdc_timeout(struct rtl8169_private *tp) { struct pci_dev *pdev = tp->pci_dev; - u32 csi; int rc; u8 val; @@ -2872,16 +2817,12 @@ static void rtl_disable_zrxdc_timeout(struct rtl8169_private *tp) } } - netdev_notice_once(tp->dev, - "No native access to PCI extended config space, falling back to CSI\n"); - csi = rtl_csi_read(tp, RTL_GEN3_RELATED_OFF); - rtl_csi_write(tp, RTL_GEN3_RELATED_OFF, csi & ~RTL_GEN3_ZRXDC_NONCOMPL); + rtl_csi_mod(tp, RTL_GEN3_RELATED_OFF, RTL_GEN3_ZRXDC_NONCOMPL, 0); } static void rtl_set_aspm_entry_latency(struct rtl8169_private *tp, u8 val) { struct pci_dev *pdev = tp->pci_dev; - u32 csi; /* According to Realtek the value at config space address 0x070f * controls the L0s/L1 entrance latency. We try standard ECAM access @@ -2893,10 +2834,7 @@ static void rtl_set_aspm_entry_latency(struct rtl8169_private *tp, u8 val) pci_write_config_byte(pdev, 0x070f, val) == PCIBIOS_SUCCESSFUL) return; - netdev_notice_once(tp->dev, - "No native access to PCI extended config space, falling back to CSI\n"); - csi = rtl_csi_read(tp, 0x070c) & 0x00ffffff; - rtl_csi_write(tp, 0x070c, csi | val << 24); + rtl_csi_mod(tp, 0x070c, 0xff000000, val << 24); } static void rtl_set_def_aspm_entry_latency(struct rtl8169_private *tp) @@ -2960,7 +2898,7 @@ static void rtl_enable_exit_l1(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_37 ... RTL_GIGA_MAC_VER_38: rtl_eri_set_bits(tp, 0xd4, 0x0c00); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_LAST: r8168_mac_ocp_modify(tp, 0xc0ac, 0, 0x1f80); break; default: @@ -2974,7 +2912,7 @@ static void rtl_disable_exit_l1(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_38: rtl_eri_clear_bits(tp, 0xd4, 0x1f00); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_LAST: r8168_mac_ocp_modify(tp, 0xc0ac, 0x1f80, 0); break; default: @@ -3012,7 +2950,7 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) switch (tp->mac_version) { case RTL_GIGA_MAC_VER_46 ... RTL_GIGA_MAC_VER_48: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: /* reset ephy tx/rx disable timer */ r8168_mac_ocp_modify(tp, 0xe094, 0xff00, 0); /* chip can trigger L1.2 */ @@ -3024,7 +2962,7 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) } else { switch (tp->mac_version) { case RTL_GIGA_MAC_VER_46 ... RTL_GIGA_MAC_VER_48: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: r8168_mac_ocp_modify(tp, 0xe092, 0x00ff, 0); break; default: @@ -4156,7 +4094,7 @@ static void rtl8169_cleanup(struct rtl8169_private *tp) RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); rtl_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 666); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_LAST: rtl_enable_rxdvgate(tp); fsleep(2000); break; @@ -4313,7 +4251,7 @@ static unsigned int rtl_quirk_packet_padto(struct rtl8169_private *tp, switch (tp->mac_version) { case RTL_GIGA_MAC_VER_34: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: padto = max_t(unsigned int, padto, ETH_ZLEN); break; default: @@ -5101,10 +5039,8 @@ static void rtl_shutdown(struct pci_dev *pdev) /* Restore original MAC address */ rtl_rar_set(tp, tp->dev->perm_addr); - if (system_state == SYSTEM_POWER_OFF && !tp->dash_enabled) { - pci_wake_from_d3(pdev, tp->saved_wolopts); - pci_set_power_state(pdev, PCI_D3hot); - } + if (system_state == SYSTEM_POWER_OFF && !tp->dash_enabled) + pci_prepare_to_sleep(pdev); } static void rtl_remove_one(struct pci_dev *pdev) @@ -5364,7 +5300,7 @@ static void rtl_hw_initialize(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_48: rtl_hw_init_8168g(tp); break; - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: rtl_hw_init_8125(tp); break; default: @@ -5389,7 +5325,7 @@ static int rtl_jumbo_max(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_18 ... RTL_GIGA_MAC_VER_24: return JUMBO_6K; /* RTL8125/8126 */ - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: return JUMBO_16K; default: return JUMBO_9K; @@ -5434,9 +5370,9 @@ static bool rtl_aspm_is_safe(struct rtl8169_private *tp) static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { + const struct rtl_chip_info *chip; struct rtl8169_private *tp; int jumbo_max, region, rc; - enum mac_version chipset; struct net_device *dev; u32 txconfig; u16 xid; @@ -5486,12 +5422,13 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) xid = (txconfig >> 20) & 0xfcf; /* Identify chip attached to board */ - chipset = rtl8169_get_mac_version(xid, tp->supports_gmii); - if (chipset == RTL_GIGA_MAC_NONE) + chip = rtl8169_get_chip_version(xid, tp->supports_gmii); + if (chip->mac_version == RTL_GIGA_MAC_NONE) return dev_err_probe(&pdev->dev, -ENODEV, "unknown chip XID %03x, contact r8169 maintainers (see MAINTAINERS file)\n", xid); - tp->mac_version = chipset; + tp->mac_version = chip->mac_version; + tp->fw_name = chip->fw_name; /* Disable ASPM L1 as that cause random device stop working * problems as well as full system hangs for some PCIe devices users. @@ -5596,8 +5533,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) rtl_set_irq_mask(tp); - tp->fw_name = rtl_chip_infos[chipset].fw_name; - tp->counters = dmam_alloc_coherent (&pdev->dev, sizeof(*tp->counters), &tp->counters_phys_addr, GFP_KERNEL); @@ -5622,7 +5557,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) } netdev_info(dev, "%s, %pM, XID %03x, IRQ %d\n", - rtl_chip_infos[chipset].name, dev->dev_addr, xid, tp->irq); + chip->name, dev->dev_addr, xid, tp->irq); if (jumbo_max) netdev_info(dev, "jumbo features [frames: %d bytes, tx checksumming: %s]\n", diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c index cf95e579c65d7..748ca8b212ba1 100644 --- a/drivers/net/ethernet/realtek/r8169_phy_config.c +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -50,6 +50,15 @@ static void r8168g_phy_param(struct phy_device *phydev, u16 parm, phy_restore_page(phydev, oldpage, 0); } +static void rtl8125_phy_param(struct phy_device *phydev, u16 parm, + u16 mask, u16 val) +{ + phy_lock_mdio_bus(phydev); + __phy_write_mmd(phydev, MDIO_MMD_VEND2, 0xb87c, parm); + __phy_modify_mmd(phydev, MDIO_MMD_VEND2, 0xb87e, mask, val); + phy_unlock_mdio_bus(phydev); +} + struct phy_reg { u16 reg; u16 val; @@ -1004,12 +1013,8 @@ static void rtl8125a_2_hw_phy_config(struct rtl8169_private *tp, phy_write_paged(phydev, 0xac5, 0x16, 0x01ff); phy_modify_paged(phydev, 0xac8, 0x15, 0x00f0, 0x0030); - phy_write(phydev, 0x1f, 0x0b87); - phy_write(phydev, 0x16, 0x80a2); - phy_write(phydev, 0x17, 0x0153); - phy_write(phydev, 0x16, 0x809c); - phy_write(phydev, 0x17, 0x0153); - phy_write(phydev, 0x1f, 0x0000); + rtl8125_phy_param(phydev, 0x80a2, 0xffff, 0x0153); + rtl8125_phy_param(phydev, 0x809c, 0xffff, 0x0153); phy_write(phydev, 0x1f, 0x0a43); phy_write(phydev, 0x13, 0x81B3); @@ -1061,14 +1066,9 @@ static void rtl8125b_hw_phy_config(struct rtl8169_private *tp, phy_modify_paged(phydev, 0xac4, 0x13, 0x00f0, 0x0090); phy_modify_paged(phydev, 0xad3, 0x10, 0x0003, 0x0001); - phy_write(phydev, 0x1f, 0x0b87); - phy_write(phydev, 0x16, 0x80f5); - phy_write(phydev, 0x17, 0x760e); - phy_write(phydev, 0x16, 0x8107); - phy_write(phydev, 0x17, 0x360e); - phy_write(phydev, 0x16, 0x8551); - phy_modify(phydev, 0x17, 0xff00, 0x0800); - phy_write(phydev, 0x1f, 0x0000); + rtl8125_phy_param(phydev, 0x80f5, 0xffff, 0x760e); + rtl8125_phy_param(phydev, 0x8107, 0xffff, 0x360e); + rtl8125_phy_param(phydev, 0x8551, 0xff00, 0x0800); phy_modify_paged(phydev, 0xbf0, 0x10, 0xe000, 0xa000); phy_modify_paged(phydev, 0xbf4, 0x13, 0x0f00, 0x0300); @@ -1110,12 +1110,8 @@ static void rtl8125bp_hw_phy_config(struct rtl8169_private *tp, r8168g_phy_param(phydev, 0x8010, 0x0800, 0x0000); - phy_write(phydev, 0x1f, 0x0b87); - phy_write(phydev, 0x16, 0x8088); - phy_modify(phydev, 0x17, 0xff00, 0x9000); - phy_write(phydev, 0x16, 0x808f); - phy_modify(phydev, 0x17, 0xff00, 0x9000); - phy_write(phydev, 0x1f, 0x0000); + rtl8125_phy_param(phydev, 0x8088, 0xff00, 0x9000); + rtl8125_phy_param(phydev, 0x808f, 0xff00, 0x9000); r8168g_phy_param(phydev, 0x8174, 0x2000, 0x1800); diff --git a/drivers/net/ethernet/realtek/rtase/rtase.h b/drivers/net/ethernet/realtek/rtase/rtase.h index 2bbfcad613abb..498cfe4d0cac3 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase.h +++ b/drivers/net/ethernet/realtek/rtase/rtase.h @@ -170,6 +170,7 @@ enum rtase_registers { #define RTASE_TC_MODE_MASK GENMASK(11, 10) RTASE_TOKSEL = 0x2046, + RTASE_TXQCRDT_0 = 0x2500, RTASE_RFIFONFULL = 0x4406, RTASE_INT_MITI_TX = 0x0A00, RTASE_INT_MITI_RX = 0x0A80, @@ -259,6 +260,12 @@ union rtase_rx_desc { #define RTASE_VLAN_TAG_MASK GENMASK(15, 0) #define RTASE_RX_PKT_SIZE_MASK GENMASK(13, 0) +/* txqos hardware definitions */ +#define RTASE_1T_CLOCK 64 +#define RTASE_1T_POWER 10000000 +#define RTASE_IDLESLOPE_INT_SHIFT 25 +#define RTASE_IDLESLOPE_INT_MASK GENMASK(31, 25) + #define RTASE_IVEC_NAME_SIZE (IFNAMSIZ + 10) struct rtase_int_vector { @@ -294,6 +301,13 @@ struct rtase_ring { u64 alloc_fail; }; +struct rtase_txqos { + int hicredit; + int locredit; + int idleslope; + int sendslope; +}; + struct rtase_stats { u64 tx_dropped; u64 rx_dropped; @@ -313,6 +327,7 @@ struct rtase_private { struct page_pool *page_pool; struct rtase_ring tx_ring[RTASE_NUM_TX_QUEUE]; + struct rtase_txqos tx_qos[RTASE_NUM_TX_QUEUE]; struct rtase_ring rx_ring[RTASE_NUM_RX_QUEUE]; struct rtase_counters *tally_vaddr; dma_addr_t tally_paddr; diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index 2aacc1996796d..6251548d50ffc 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -1661,6 +1661,65 @@ static void rtase_get_stats64(struct net_device *dev, stats->rx_length_errors = tp->stats.rx_length_errors; } +static void rtase_set_hw_cbs(const struct rtase_private *tp, u32 queue) +{ + u32 idle = tp->tx_qos[queue].idleslope * RTASE_1T_CLOCK; + u32 val, i; + + val = u32_encode_bits(idle / RTASE_1T_POWER, RTASE_IDLESLOPE_INT_MASK); + idle %= RTASE_1T_POWER; + + for (i = 1; i <= RTASE_IDLESLOPE_INT_SHIFT; i++) { + idle *= 2; + if ((idle / RTASE_1T_POWER) == 1) + val |= BIT(RTASE_IDLESLOPE_INT_SHIFT - i); + + idle %= RTASE_1T_POWER; + } + + rtase_w32(tp, RTASE_TXQCRDT_0 + queue * 4, val); +} + +static int rtase_setup_tc_cbs(struct rtase_private *tp, + const struct tc_cbs_qopt_offload *qopt) +{ + int queue = qopt->queue; + + if (queue < 0 || queue >= tp->func_tx_queue_num) + return -EINVAL; + + if (!qopt->enable) { + tp->tx_qos[queue].hicredit = 0; + tp->tx_qos[queue].locredit = 0; + tp->tx_qos[queue].idleslope = 0; + tp->tx_qos[queue].sendslope = 0; + + rtase_w32(tp, RTASE_TXQCRDT_0 + queue * 4, 0); + } else { + tp->tx_qos[queue].hicredit = qopt->hicredit; + tp->tx_qos[queue].locredit = qopt->locredit; + tp->tx_qos[queue].idleslope = qopt->idleslope; + tp->tx_qos[queue].sendslope = qopt->sendslope; + + rtase_set_hw_cbs(tp, queue); + } + + return 0; +} + +static int rtase_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + struct rtase_private *tp = netdev_priv(dev); + + switch (type) { + case TC_SETUP_QDISC_CBS: + return rtase_setup_tc_cbs(tp, type_data); + default: + return -EOPNOTSUPP; + } +} + static netdev_features_t rtase_fix_features(struct net_device *dev, netdev_features_t features) { @@ -1696,6 +1755,7 @@ static const struct net_device_ops rtase_netdev_ops = { .ndo_change_mtu = rtase_change_mtu, .ndo_tx_timeout = rtase_tx_timeout, .ndo_get_stats64 = rtase_get_stats64, + .ndo_setup_tc = rtase_setup_tc, .ndo_fix_features = rtase_fix_features, .ndo_set_features = rtase_set_features, }; diff --git a/drivers/net/ethernet/renesas/ravb_ptp.c b/drivers/net/ethernet/renesas/ravb_ptp.c index b4365906669f3..226c6c0ab945b 100644 --- a/drivers/net/ethernet/renesas/ravb_ptp.c +++ b/drivers/net/ethernet/renesas/ravb_ptp.c @@ -176,12 +176,6 @@ static int ravb_ptp_extts(struct ptp_clock_info *ptp, struct net_device *ndev = priv->ndev; unsigned long flags; - /* Reject requests with unsupported flags */ - if (req->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE)) - return -EOPNOTSUPP; - if (req->index) return -EINVAL; @@ -212,10 +206,6 @@ static int ravb_ptp_perout(struct ptp_clock_info *ptp, unsigned long flags; int error = 0; - /* Reject requests with unsupported flags */ - if (req->flags) - return -EOPNOTSUPP; - if (req->index) return -EINVAL; @@ -287,6 +277,7 @@ static const struct ptp_clock_info ravb_ptp_info = { .max_adj = 50000000, .n_ext_ts = N_EXT_TS, .n_per_out = N_PER_OUT, + .supported_extts_flags = PTP_RISING_EDGE | PTP_FALLING_EDGE, .adjfine = ravb_ptp_adjfine, .adjtime = ravb_ptp_adjtime, .gettime64 = ravb_ptp_gettime64, diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index d5db26103d82e..19985d13e2434 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -1933,7 +1933,7 @@ static int ofdpa_port_fdb(struct ofdpa_port *ofdpa_port, spin_unlock_irqrestore(&ofdpa->fdb_tbl_lock, lock_flags); /* Check if adding and already exists, or removing and can't find */ - if (!found != !removing) { + if (!found == removing) { kfree(fdb); if (!found && removing) return 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index 3c820ef567759..2c99b23f0faae 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -131,6 +131,17 @@ config DWMAC_QCOM_ETHQOS This selects the Qualcomm ETHQOS glue layer support for the stmmac device driver. +config DWMAC_RENESAS_GBETH + tristate "Renesas RZ/V2H(P) GBETH support" + default ARCH_RENESAS + depends on OF && (ARCH_RENESAS || COMPILE_TEST) + help + Support for Gigabit Ethernet Interface (GBETH) on Renesas + RZ/V2H(P) SoCs. + + This selects the Renesas RZ/V2H(P) Soc specific glue layer support + for the stmmac device driver. + config DWMAC_ROCKCHIP tristate "Rockchip dwmac support" default ARCH_ROCKCHIP diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile index 594883fb41644..91050215511b2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Makefile +++ b/drivers/net/ethernet/stmicro/stmmac/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_DWMAC_LPC18XX) += dwmac-lpc18xx.o obj-$(CONFIG_DWMAC_MEDIATEK) += dwmac-mediatek.o obj-$(CONFIG_DWMAC_MESON) += dwmac-meson.o dwmac-meson8b.o obj-$(CONFIG_DWMAC_QCOM_ETHQOS) += dwmac-qcom-ethqos.o +obj-$(CONFIG_DWMAC_RENESAS_GBETH) += dwmac-renesas-gbeth.o obj-$(CONFIG_DWMAC_ROCKCHIP) += dwmac-rk.o obj-$(CONFIG_DWMAC_RZN1) += dwmac-rzn1.o obj-$(CONFIG_DWMAC_S32) += dwmac-s32.o diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c index 37fe7c2888784..84072c8ed741c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c @@ -65,13 +65,12 @@ anarion_config_dt(struct platform_device *pdev, { struct anarion_gmac *gmac; void __iomem *ctl_block; - int err; ctl_block = devm_platform_ioremap_resource(pdev, 1); if (IS_ERR(ctl_block)) { - err = PTR_ERR(ctl_block); - dev_err(&pdev->dev, "Cannot get reset region (%d)!\n", err); - return ERR_PTR(err); + dev_err(&pdev->dev, "Cannot get reset region (%pe)!\n", + ctl_block); + return ERR_CAST(ctl_block); } gmac = devm_kzalloc(&pdev->dev, sizeof(*gmac), GFP_KERNEL); @@ -80,17 +79,11 @@ anarion_config_dt(struct platform_device *pdev, gmac->ctl_block = ctl_block; - switch (plat_dat->phy_interface) { - case PHY_INTERFACE_MODE_RGMII: - fallthrough; - case PHY_INTERFACE_MODE_RGMII_ID: - case PHY_INTERFACE_MODE_RGMII_RXID: - case PHY_INTERFACE_MODE_RGMII_TXID: + if (phy_interface_mode_is_rgmii(plat_dat->phy_interface)) { gmac->phy_intf_sel = GMAC_CONFIG_INTF_RGMII; - break; - default: - dev_err(&pdev->dev, "Unsupported phy-mode (%d)\n", - plat_dat->phy_interface); + } else { + dev_err(&pdev->dev, "Unsupported phy-mode (%s)\n", + phy_modes(plat_dat->phy_interface)); return ERR_PTR(-ENOTSUPP); } @@ -118,10 +111,9 @@ static int anarion_dwmac_probe(struct platform_device *pdev) plat_dat->init = anarion_gmac_init; plat_dat->exit = anarion_gmac_exit; - anarion_gmac_init(pdev, gmac); plat_dat->bsp_priv = gmac; - return stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); + return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); } static const struct of_device_id anarion_dwmac_match[] = { @@ -132,7 +124,6 @@ MODULE_DEVICE_TABLE(of, anarion_dwmac_match); static struct platform_driver anarion_dwmac_driver = { .probe = anarion_dwmac_probe, - .remove = stmmac_pltfr_remove, .driver = { .name = "anarion-dwmac", .pm = &stmmac_pltfr_pm_ops, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index cd431f84f34f6..fa900b4991d06 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -29,21 +29,10 @@ struct tegra_eqos { void __iomem *regs; struct reset_control *rst; - struct clk *clk_slave; struct gpio_desc *reset; }; -static struct clk *dwc_eth_find_clk(struct plat_stmmacenet_data *plat_dat, - const char *name) -{ - for (int i = 0; i < plat_dat->num_clks; i++) - if (strcmp(plat_dat->clks[i].id, name) == 0) - return plat_dat->clks[i].clk; - - return NULL; -} - static int dwc_eth_dwmac_config_dt(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat) { @@ -132,7 +121,7 @@ static int dwc_qos_probe(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat, struct stmmac_resources *stmmac_res) { - plat_dat->pclk = dwc_eth_find_clk(plat_dat, "phy_ref_clk"); + plat_dat->pclk = stmmac_pltfr_find_clk(plat_dat, "phy_ref_clk"); return 0; } @@ -209,20 +198,6 @@ static void tegra_eqos_fix_speed(void *priv, int speed, unsigned int mode) } } -static int tegra_eqos_init(struct platform_device *pdev, void *priv) -{ - struct tegra_eqos *eqos = priv; - unsigned long rate; - u32 value; - - rate = clk_get_rate(eqos->clk_slave); - - value = (rate / 1000000) - 1; - writel(value, eqos->regs + GMAC_1US_TIC_COUNTER); - - return 0; -} - static int tegra_eqos_probe(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat, struct stmmac_resources *res) @@ -237,12 +212,11 @@ static int tegra_eqos_probe(struct platform_device *pdev, eqos->dev = &pdev->dev; eqos->regs = res->addr; - eqos->clk_slave = plat_dat->stmmac_clk; if (!is_of_node(dev->fwnode)) goto bypass_clk_reset_gpio; - plat_dat->clk_tx_i = dwc_eth_find_clk(plat_dat, "tx"); + plat_dat->clk_tx_i = stmmac_pltfr_find_clk(plat_dat, "tx"); eqos->reset = devm_gpiod_get(&pdev->dev, "phy-reset", GPIOD_OUT_HIGH); if (IS_ERR(eqos->reset)) { @@ -277,17 +251,12 @@ static int tegra_eqos_probe(struct platform_device *pdev, bypass_clk_reset_gpio: plat_dat->fix_mac_speed = tegra_eqos_fix_speed; plat_dat->set_clk_tx_rate = stmmac_set_clk_tx_rate; - plat_dat->init = tegra_eqos_init; plat_dat->bsp_priv = eqos; - plat_dat->flags |= STMMAC_FLAG_SPH_DISABLE; - - err = tegra_eqos_init(pdev, eqos); - if (err < 0) - goto reset; + plat_dat->flags |= STMMAC_FLAG_SPH_DISABLE | + STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP; return 0; -reset: - reset_control_assert(eqos->rst); + reset_phy: gpiod_set_value(eqos->reset, 1); @@ -362,8 +331,8 @@ static int dwc_eth_dwmac_probe(struct platform_device *pdev) if (ret) return dev_err_probe(&pdev->dev, ret, "Failed to enable clocks\n"); - plat_dat->stmmac_clk = dwc_eth_find_clk(plat_dat, - data->stmmac_clk_name); + plat_dat->stmmac_clk = stmmac_pltfr_find_clk(plat_dat, + data->stmmac_clk_name); if (data->probe) ret = data->probe(pdev, plat_dat, &stmmac_res); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index 5d279fa54b3e8..889e2bb6f7f5d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -379,10 +379,6 @@ static int imx_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - ret = imx_dwmac_init(pdev, dwmac); - if (ret) - goto err_dwmac_init; - if (dwmac->ops->fix_mac_speed) { plat_dat->fix_mac_speed = dwmac->ops->fix_mac_speed; } else if (!dwmac->ops->mac_rgmii_txclk_auto_adj) { @@ -392,16 +388,10 @@ static int imx_dwmac_probe(struct platform_device *pdev) dwmac->plat_dat->fix_soc_reset = dwmac->ops->fix_soc_reset; - ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); + ret = stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); if (ret) - goto err_drv_probe; - - return 0; + imx_dwmac_clks_config(dwmac, false); -err_drv_probe: - imx_dwmac_exit(pdev, plat_dat->bsp_priv); -err_dwmac_init: - imx_dwmac_clks_config(dwmac, false); return ret; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c index 066783d664228..15abe214131f8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c @@ -56,6 +56,7 @@ enum ingenic_mac_version { struct ingenic_mac { const struct ingenic_soc_info *soc_info; + struct plat_stmmacenet_data *plat_dat; struct device *dev; struct regmap *regmap; @@ -70,13 +71,13 @@ struct ingenic_soc_info { int (*set_mode)(struct plat_stmmacenet_data *plat_dat); }; -static int ingenic_mac_init(struct plat_stmmacenet_data *plat_dat) +static int ingenic_mac_init(struct platform_device *pdev, void *bsp_priv) { - struct ingenic_mac *mac = plat_dat->bsp_priv; + struct ingenic_mac *mac = bsp_priv; int ret; if (mac->soc_info->set_mode) { - ret = mac->soc_info->set_mode(plat_dat); + ret = mac->soc_info->set_mode(mac->plat_dat); if (ret) return ret; } @@ -284,44 +285,14 @@ static int ingenic_mac_probe(struct platform_device *pdev) mac->soc_info = data; mac->dev = &pdev->dev; + mac->plat_dat = plat_dat; plat_dat->bsp_priv = mac; + plat_dat->init = ingenic_mac_init; - ret = ingenic_mac_init(plat_dat); - if (ret) - return ret; - - return stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); -} - -#ifdef CONFIG_PM_SLEEP -static int ingenic_mac_suspend(struct device *dev) -{ - int ret; - - ret = stmmac_suspend(dev); - - return ret; + return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); } -static int ingenic_mac_resume(struct device *dev) -{ - struct net_device *ndev = dev_get_drvdata(dev); - struct stmmac_priv *priv = netdev_priv(ndev); - int ret; - - ret = ingenic_mac_init(priv->plat); - if (ret) - return ret; - - ret = stmmac_resume(dev); - - return ret; -} -#endif /* CONFIG_PM_SLEEP */ - -static SIMPLE_DEV_PM_OPS(ingenic_mac_pm_ops, ingenic_mac_suspend, ingenic_mac_resume); - static struct ingenic_soc_info jz4775_soc_info = { .version = ID_JZ4775, .mask = MACPHYC_TXCLK_SEL_MASK | MACPHYC_SOFT_RST_MASK | MACPHYC_PHY_INFT_MASK, @@ -370,10 +341,9 @@ MODULE_DEVICE_TABLE(of, ingenic_mac_of_matches); static struct platform_driver ingenic_mac_driver = { .probe = ingenic_mac_probe, - .remove = stmmac_pltfr_remove, .driver = { .name = "ingenic-mac", - .pm = pm_ptr(&ingenic_mac_pm_ops), + .pm = &stmmac_pltfr_pm_ops, .of_match_table = ingenic_mac_of_matches, }, }; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c index 599def7b3a649..4ea7b0a803d73 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c @@ -113,16 +113,7 @@ static int intel_eth_plat_probe(struct platform_device *pdev) plat_dat->clk_tx_i = dwmac->tx_clk; plat_dat->set_clk_tx_rate = stmmac_set_clk_tx_rate; - plat_dat->bsp_priv = dwmac; - plat_dat->eee_usecs_rate = plat_dat->clk_ptp_rate; - - if (plat_dat->eee_usecs_rate > 0) { - u32 tx_lpi_usec; - - tx_lpi_usec = (plat_dat->eee_usecs_rate / 1000000) - 1; - writel(tx_lpi_usec, stmmac_res.addr + GMAC_1US_TIC_COUNTER); - } ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); if (ret) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index c8bb9265bbb48..96c893dd262f8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -300,11 +300,8 @@ static void intel_speed_mode_2500(struct net_device *ndev, void *intel_data) if (((data & SERDES_LINK_MODE_MASK) >> SERDES_LINK_MODE_SHIFT) == SERDES_LINK_MODE_2G5) { dev_info(priv->device, "Link Speed Mode: 2.5Gbps\n"); - priv->plat->max_speed = 2500; priv->plat->phy_interface = PHY_INTERFACE_MODE_2500BASEX; priv->plat->mdio_bus_data->default_an_inband = false; - } else { - priv->plat->max_speed = 1000; } } @@ -682,7 +679,6 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->axi->axi_blen[2] = 16; plat->ptp_max_adj = plat->clk_ptp_rate; - plat->eee_usecs_rate = plat->clk_ptp_rate; /* Set system clock */ sprintf(clk_name, "%s-%s", "stmmac", pci_name(pdev)); @@ -1313,13 +1309,6 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, memset(&res, 0, sizeof(res)); res.addr = pcim_iomap_table(pdev)[0]; - if (plat->eee_usecs_rate > 0) { - u32 tx_lpi_usec; - - tx_lpi_usec = (plat->eee_usecs_rate / 1000000) - 1; - writel(tx_lpi_usec, res.addr + GMAC_1US_TIC_COUNTER); - } - ret = stmmac_config_multi_msi(pdev, plat, &res); if (ret) { ret = stmmac_config_single_msi(pdev, plat, &res); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index d178d5ddc7c7a..39421d6a34e41 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -581,7 +581,6 @@ static int mediatek_dwmac_common_data(struct platform_device *pdev, int i; priv_plat->phy_mode = plat->phy_interface; - plat->mac_interface = priv_plat->phy_mode; if (priv_plat->mac_wol) plat->flags &= ~STMMAC_FLAG_USE_PHY_WOL; else diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 0e4da216f942c..e30bdf72331ac 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -106,12 +106,11 @@ struct qcom_ethqos { struct platform_device *pdev; void __iomem *rgmii_base; void __iomem *mac_base; - int (*configure_func)(struct qcom_ethqos *ethqos); + int (*configure_func)(struct qcom_ethqos *ethqos, int speed); unsigned int link_clk_rate; struct clk *link_clk; struct phy *serdes_phy; - int speed; int serdes_speed; phy_interface_t phy_mode; @@ -385,7 +384,7 @@ static int ethqos_dll_configure(struct qcom_ethqos *ethqos) return 0; } -static int ethqos_rgmii_macro_init(struct qcom_ethqos *ethqos) +static int ethqos_rgmii_macro_init(struct qcom_ethqos *ethqos, int speed) { struct device *dev = ðqos->pdev->dev; int phase_shift; @@ -412,7 +411,7 @@ static int ethqos_rgmii_macro_init(struct qcom_ethqos *ethqos) rgmii_updatel(ethqos, RGMII_CONFIG_INTF_SEL, 0, RGMII_IO_MACRO_CONFIG); - switch (ethqos->speed) { + switch (speed) { case SPEED_1000: rgmii_updatel(ethqos, RGMII_CONFIG_DDR_MODE, RGMII_CONFIG_DDR_MODE, RGMII_IO_MACRO_CONFIG); @@ -532,14 +531,14 @@ static int ethqos_rgmii_macro_init(struct qcom_ethqos *ethqos) loopback, RGMII_IO_MACRO_CONFIG); break; default: - dev_err(dev, "Invalid speed %d\n", ethqos->speed); + dev_err(dev, "Invalid speed %d\n", speed); return -EINVAL; } return 0; } -static int ethqos_configure_rgmii(struct qcom_ethqos *ethqos) +static int ethqos_configure_rgmii(struct qcom_ethqos *ethqos, int speed) { struct device *dev = ðqos->pdev->dev; volatile unsigned int dll_lock; @@ -562,7 +561,7 @@ static int ethqos_configure_rgmii(struct qcom_ethqos *ethqos) SDCC_DLL_CONFIG_PDN, SDCC_HC_REG_DLL_CONFIG); if (ethqos->has_emac_ge_3) { - if (ethqos->speed == SPEED_1000) { + if (speed == SPEED_1000) { rgmii_writel(ethqos, 0x1800000, SDCC_TEST_CTL); rgmii_writel(ethqos, 0x2C010800, SDCC_USR_CTL); rgmii_writel(ethqos, 0xA001, SDCC_HC_REG_DLL_CONFIG2); @@ -580,7 +579,7 @@ static int ethqos_configure_rgmii(struct qcom_ethqos *ethqos) rgmii_updatel(ethqos, SDCC_DLL_CONFIG_PDN, 0, SDCC_HC_REG_DLL_CONFIG); - if (ethqos->speed != SPEED_100 && ethqos->speed != SPEED_10) { + if (speed != SPEED_100 && speed != SPEED_10) { /* Set DLL_EN */ rgmii_updatel(ethqos, SDCC_DLL_CONFIG_DLL_EN, SDCC_DLL_CONFIG_DLL_EN, SDCC_HC_REG_DLL_CONFIG); @@ -607,10 +606,10 @@ static int ethqos_configure_rgmii(struct qcom_ethqos *ethqos) dev_err(dev, "Timeout while waiting for DLL lock\n"); } - if (ethqos->speed == SPEED_1000) + if (speed == SPEED_1000) ethqos_dll_configure(ethqos); - ethqos_rgmii_macro_init(ethqos); + ethqos_rgmii_macro_init(ethqos, speed); return 0; } @@ -626,7 +625,7 @@ static void ethqos_set_serdes_speed(struct qcom_ethqos *ethqos, int speed) /* On interface toggle MAC registers gets reset. * Configure MAC block for SGMII on ethernet phy link up */ -static int ethqos_configure_sgmii(struct qcom_ethqos *ethqos) +static int ethqos_configure_sgmii(struct qcom_ethqos *ethqos, int speed) { struct net_device *dev = platform_get_drvdata(ethqos->pdev); struct stmmac_priv *priv = netdev_priv(dev); @@ -634,7 +633,7 @@ static int ethqos_configure_sgmii(struct qcom_ethqos *ethqos) val = readl(ethqos->mac_base + MAC_CTRL_REG); - switch (ethqos->speed) { + switch (speed) { case SPEED_2500: val &= ~ETHQOS_MAC_CTRL_PORT_SEL; rgmii_updatel(ethqos, RGMII_CONFIG2_RGMII_CLK_SEL_CFG, @@ -673,17 +672,9 @@ static int ethqos_configure_sgmii(struct qcom_ethqos *ethqos) return val; } -static void qcom_ethqos_speed_mode_2500(struct net_device *ndev, void *data) +static int ethqos_configure(struct qcom_ethqos *ethqos, int speed) { - struct stmmac_priv *priv = netdev_priv(ndev); - - priv->plat->max_speed = 2500; - priv->plat->phy_interface = PHY_INTERFACE_MODE_2500BASEX; -} - -static int ethqos_configure(struct qcom_ethqos *ethqos) -{ - return ethqos->configure_func(ethqos); + return ethqos->configure_func(ethqos, speed); } static void ethqos_fix_mac_speed(void *priv, int speed, unsigned int mode) @@ -691,9 +682,8 @@ static void ethqos_fix_mac_speed(void *priv, int speed, unsigned int mode) struct qcom_ethqos *ethqos = priv; qcom_ethqos_set_sgmii_loopback(ethqos, false); - ethqos->speed = speed; ethqos_update_link_clk(ethqos, speed); - ethqos_configure(ethqos); + ethqos_configure(ethqos, speed); } static int qcom_ethqos_serdes_powerup(struct net_device *ndev, void *priv) @@ -709,7 +699,7 @@ static int qcom_ethqos_serdes_powerup(struct net_device *ndev, void *priv) if (ret) return ret; - return phy_set_speed(ethqos->serdes_phy, ethqos->speed); + return phy_set_speed(ethqos->serdes_phy, ethqos->serdes_speed); } static void qcom_ethqos_serdes_powerdown(struct net_device *ndev, void *priv) @@ -803,8 +793,6 @@ static int qcom_ethqos_probe(struct platform_device *pdev) ethqos->configure_func = ethqos_configure_rgmii; break; case PHY_INTERFACE_MODE_2500BASEX: - plat_dat->speed_mode_2500 = qcom_ethqos_speed_mode_2500; - fallthrough; case PHY_INTERFACE_MODE_SGMII: ethqos->configure_func = ethqos_configure_sgmii; break; @@ -847,7 +835,6 @@ static int qcom_ethqos_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(ethqos->serdes_phy), "Failed to get serdes phy\n"); - ethqos->speed = SPEED_1000; ethqos->serdes_speed = SPEED_1000; ethqos_update_link_clk(ethqos, SPEED_1000); ethqos_set_func_clk_en(ethqos); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c new file mode 100644 index 0000000000000..9a774046455b1 --- /dev/null +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * dwmac-renesas-gbeth.c - DWMAC Specific Glue layer for Renesas GBETH + * + * The Rx and Tx clocks are supplied as follows for the GBETH IP. + * + * Rx / Tx + * -------+------------- on / off ------- + * | + * | Rx-180 / Tx-180 + * +---- not ---- on / off ------- + * + * Copyright (C) 2025 Renesas Electronics Corporation + */ + +#include +#include +#include +#include +#include + +#include "stmmac_platform.h" + +struct renesas_gbeth { + struct plat_stmmacenet_data *plat_dat; + struct reset_control *rstc; + struct device *dev; +}; + +static const char *const renesas_gbeth_clks[] = { + "tx", "tx-180", "rx", "rx-180", +}; + +static int renesas_gbeth_init(struct platform_device *pdev, void *priv) +{ + struct plat_stmmacenet_data *plat_dat; + struct renesas_gbeth *gbeth = priv; + int ret; + + plat_dat = gbeth->plat_dat; + + ret = reset_control_deassert(gbeth->rstc); + if (ret) { + dev_err(gbeth->dev, "Reset deassert failed\n"); + return ret; + } + + ret = clk_bulk_prepare_enable(plat_dat->num_clks, + plat_dat->clks); + if (ret) + reset_control_assert(gbeth->rstc); + + return ret; +} + +static void renesas_gbeth_exit(struct platform_device *pdev, void *priv) +{ + struct plat_stmmacenet_data *plat_dat; + struct renesas_gbeth *gbeth = priv; + int ret; + + plat_dat = gbeth->plat_dat; + + clk_bulk_disable_unprepare(plat_dat->num_clks, plat_dat->clks); + + ret = reset_control_assert(gbeth->rstc); + if (ret) + dev_err(gbeth->dev, "Reset assert failed\n"); +} + +static int renesas_gbeth_probe(struct platform_device *pdev) +{ + struct plat_stmmacenet_data *plat_dat; + struct stmmac_resources stmmac_res; + struct device *dev = &pdev->dev; + struct renesas_gbeth *gbeth; + unsigned int i; + int err; + + err = stmmac_get_platform_resources(pdev, &stmmac_res); + if (err) + return dev_err_probe(dev, err, + "failed to get resources\n"); + + plat_dat = devm_stmmac_probe_config_dt(pdev, stmmac_res.mac); + if (IS_ERR(plat_dat)) + return dev_err_probe(dev, PTR_ERR(plat_dat), + "dt configuration failed\n"); + + gbeth = devm_kzalloc(dev, sizeof(*gbeth), GFP_KERNEL); + if (!gbeth) + return -ENOMEM; + + plat_dat->num_clks = ARRAY_SIZE(renesas_gbeth_clks); + plat_dat->clks = devm_kcalloc(dev, plat_dat->num_clks, + sizeof(*plat_dat->clks), GFP_KERNEL); + if (!plat_dat->clks) + return -ENOMEM; + + for (i = 0; i < plat_dat->num_clks; i++) + plat_dat->clks[i].id = renesas_gbeth_clks[i]; + + err = devm_clk_bulk_get(dev, plat_dat->num_clks, plat_dat->clks); + if (err < 0) + return err; + + plat_dat->clk_tx_i = stmmac_pltfr_find_clk(plat_dat, "tx"); + if (!plat_dat->clk_tx_i) + return dev_err_probe(dev, -EINVAL, + "error finding tx clock\n"); + + gbeth->rstc = devm_reset_control_get_exclusive(dev, NULL); + if (IS_ERR(gbeth->rstc)) + return PTR_ERR(gbeth->rstc); + + gbeth->dev = dev; + gbeth->plat_dat = plat_dat; + plat_dat->bsp_priv = gbeth; + plat_dat->set_clk_tx_rate = stmmac_set_clk_tx_rate; + plat_dat->init = renesas_gbeth_init; + plat_dat->exit = renesas_gbeth_exit; + plat_dat->flags |= STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY | + STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP | + STMMAC_FLAG_SPH_DISABLE; + + return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); +} + +static const struct of_device_id renesas_gbeth_match[] = { + { .compatible = "renesas,rzv2h-gbeth", }, + { /* Sentinel */ } +}; +MODULE_DEVICE_TABLE(of, renesas_gbeth_match); + +static struct platform_driver renesas_gbeth_driver = { + .probe = renesas_gbeth_probe, + .driver = { + .name = "renesas-gbeth", + .of_match_table = renesas_gbeth_match, + }, +}; +module_platform_driver(renesas_gbeth_driver); + +MODULE_AUTHOR("Lad Prabhakar "); +MODULE_DESCRIPTION("Renesas GBETH DWMAC Specific Glue layer"); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 1168556585597..59f90b123c5b8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -50,6 +50,7 @@ struct socfpga_dwmac { u32 reg_offset; u32 reg_shift; struct device *dev; + struct plat_stmmacenet_data *plat_dat; struct regmap *sys_mgr_base_addr; struct reset_control *stmmac_rst; struct reset_control *stmmac_ocp_rst; @@ -233,10 +234,7 @@ static int socfpga_dwmac_parse_data(struct socfpga_dwmac *dwmac, struct device * static int socfpga_get_plat_phymode(struct socfpga_dwmac *dwmac) { - struct net_device *ndev = dev_get_drvdata(dwmac->dev); - struct stmmac_priv *priv = netdev_priv(ndev); - - return priv->plat->mac_interface; + return dwmac->plat_dat->mac_interface; } static void socfpga_sgmii_config(struct socfpga_dwmac *dwmac, bool enable) @@ -435,6 +433,13 @@ static struct phylink_pcs *socfpga_dwmac_select_pcs(struct stmmac_priv *priv, return priv->hw->phylink_pcs; } +static int socfpga_dwmac_init(struct platform_device *pdev, void *bsp_priv) +{ + struct socfpga_dwmac *dwmac = bsp_priv; + + return dwmac->ops->set_phy_mode(dwmac); +} + static int socfpga_dwmac_probe(struct platform_device *pdev) { struct plat_stmmacenet_data *plat_dat; @@ -442,8 +447,6 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; int ret; struct socfpga_dwmac *dwmac; - struct net_device *ndev; - struct stmmac_priv *stpriv; const struct socfpga_dwmac_ops *ops; ops = device_get_match_data(&pdev->dev); @@ -479,9 +482,17 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) return ret; } + /* The socfpga driver needs to control the stmmac reset to set the phy + * mode. Create a copy of the core reset handle so it can be used by + * the driver later. + */ + dwmac->stmmac_rst = plat_dat->stmmac_rst; dwmac->ops = ops; + dwmac->plat_dat = plat_dat; + plat_dat->bsp_priv = dwmac; plat_dat->fix_mac_speed = socfpga_dwmac_fix_mac_speed; + plat_dat->init = socfpga_dwmac_init; plat_dat->pcs_init = socfpga_dwmac_pcs_init; plat_dat->pcs_exit = socfpga_dwmac_pcs_exit; plat_dat->select_pcs = socfpga_dwmac_select_pcs; @@ -489,66 +500,8 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) plat_dat->riwt_off = 1; - ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); - if (ret) - return ret; - - ndev = platform_get_drvdata(pdev); - stpriv = netdev_priv(ndev); - - /* The socfpga driver needs to control the stmmac reset to set the phy - * mode. Create a copy of the core reset handle so it can be used by - * the driver later. - */ - dwmac->stmmac_rst = stpriv->plat->stmmac_rst; - - ret = ops->set_phy_mode(dwmac); - if (ret) - goto err_dvr_remove; - - return 0; - -err_dvr_remove: - stmmac_dvr_remove(&pdev->dev); - - return ret; -} - -#ifdef CONFIG_PM_SLEEP -static int socfpga_dwmac_resume(struct device *dev) -{ - struct net_device *ndev = dev_get_drvdata(dev); - struct stmmac_priv *priv = netdev_priv(ndev); - struct socfpga_dwmac *dwmac_priv = get_stmmac_bsp_priv(dev); - - dwmac_priv->ops->set_phy_mode(priv->plat->bsp_priv); - - return stmmac_resume(dev); + return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); } -#endif /* CONFIG_PM_SLEEP */ - -static int __maybe_unused socfpga_dwmac_runtime_suspend(struct device *dev) -{ - struct net_device *ndev = dev_get_drvdata(dev); - struct stmmac_priv *priv = netdev_priv(ndev); - - stmmac_bus_clks_config(priv, false); - - return 0; -} - -static int __maybe_unused socfpga_dwmac_runtime_resume(struct device *dev) -{ - struct net_device *ndev = dev_get_drvdata(dev); - struct stmmac_priv *priv = netdev_priv(ndev); - - return stmmac_bus_clks_config(priv, true); -} - -static const struct dev_pm_ops socfpga_dwmac_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(stmmac_suspend, socfpga_dwmac_resume) - SET_RUNTIME_PM_OPS(socfpga_dwmac_runtime_suspend, socfpga_dwmac_runtime_resume, NULL) -}; static const struct socfpga_dwmac_ops socfpga_gen5_ops = { .set_phy_mode = socfpga_gen5_set_phy_mode, @@ -567,10 +520,9 @@ MODULE_DEVICE_TABLE(of, socfpga_dwmac_match); static struct platform_driver socfpga_dwmac_driver = { .probe = socfpga_dwmac_probe, - .remove = stmmac_pltfr_remove, .driver = { .name = "socfpga-dwmac", - .pm = &socfpga_dwmac_pm_ops, + .pm = &stmmac_pltfr_pm_ops, .of_match_table = socfpga_dwmac_match, }, }; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c index be57c6c12c1c2..53d5ce1f6dc69 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c @@ -23,12 +23,7 @@ #define DWMAC_50MHZ 50000000 -#define IS_PHY_IF_MODE_RGMII(iface) (iface == PHY_INTERFACE_MODE_RGMII || \ - iface == PHY_INTERFACE_MODE_RGMII_ID || \ - iface == PHY_INTERFACE_MODE_RGMII_RXID || \ - iface == PHY_INTERFACE_MODE_RGMII_TXID) - -#define IS_PHY_IF_MODE_GBIT(iface) (IS_PHY_IF_MODE_RGMII(iface) || \ +#define IS_PHY_IF_MODE_GBIT(iface) (phy_interface_mode_is_rgmii(iface) || \ iface == PHY_INTERFACE_MODE_GMII) /* STiH4xx register definitions (STiH407/STiH410 families) @@ -148,7 +143,7 @@ static void stih4xx_fix_retime_src(void *priv, int spd, unsigned int mode) src = TX_RETIME_SRC_CLKGEN; freq = DWMAC_50MHZ; } - } else if (IS_PHY_IF_MODE_RGMII(dwmac->interface)) { + } else if (phy_interface_mode_is_rgmii(dwmac->interface)) { /* On GiGa clk source can be either ext or from clkgen */ freq = rgmii_clock(spd); @@ -238,6 +233,29 @@ static int sti_dwmac_parse_data(struct sti_dwmac *dwmac, return 0; } +static int sti_dwmac_init(struct platform_device *pdev, void *bsp_priv) +{ + struct sti_dwmac *dwmac = bsp_priv; + int ret; + + ret = clk_prepare_enable(dwmac->clk); + if (ret) + return ret; + + ret = sti_dwmac_set_mode(dwmac); + if (ret) + clk_disable_unprepare(dwmac->clk); + + return ret; +} + +static void sti_dwmac_exit(struct platform_device *pdev, void *bsp_priv) +{ + struct sti_dwmac *dwmac = bsp_priv; + + clk_disable_unprepare(dwmac->clk); +} + static int sti_dwmac_probe(struct platform_device *pdev) { struct plat_stmmacenet_data *plat_dat; @@ -274,59 +292,12 @@ static int sti_dwmac_probe(struct platform_device *pdev) plat_dat->bsp_priv = dwmac; plat_dat->fix_mac_speed = data->fix_retime_src; + plat_dat->init = sti_dwmac_init; + plat_dat->exit = sti_dwmac_exit; - ret = clk_prepare_enable(dwmac->clk); - if (ret) - return ret; - - ret = sti_dwmac_set_mode(dwmac); - if (ret) - goto disable_clk; - - ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); - if (ret) - goto disable_clk; - - return 0; - -disable_clk: - clk_disable_unprepare(dwmac->clk); - - return ret; -} - -static void sti_dwmac_remove(struct platform_device *pdev) -{ - struct sti_dwmac *dwmac = get_stmmac_bsp_priv(&pdev->dev); - - stmmac_dvr_remove(&pdev->dev); - - clk_disable_unprepare(dwmac->clk); + return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); } -static int sti_dwmac_suspend(struct device *dev) -{ - struct sti_dwmac *dwmac = get_stmmac_bsp_priv(dev); - int ret = stmmac_suspend(dev); - - clk_disable_unprepare(dwmac->clk); - - return ret; -} - -static int sti_dwmac_resume(struct device *dev) -{ - struct sti_dwmac *dwmac = get_stmmac_bsp_priv(dev); - - clk_prepare_enable(dwmac->clk); - sti_dwmac_set_mode(dwmac); - - return stmmac_resume(dev); -} - -static DEFINE_SIMPLE_DEV_PM_OPS(sti_dwmac_pm_ops, sti_dwmac_suspend, - sti_dwmac_resume); - static const struct sti_dwmac_of_data stih4xx_dwmac_data = { .fix_retime_src = stih4xx_fix_retime_src, }; @@ -339,10 +310,9 @@ MODULE_DEVICE_TABLE(of, sti_dwmac_match); static struct platform_driver sti_dwmac_driver = { .probe = sti_dwmac_probe, - .remove = sti_dwmac_remove, .driver = { .name = "sti-dwmac", - .pm = pm_sleep_ptr(&sti_dwmac_pm_ops), + .pm = &stmmac_pltfr_pm_ops, .of_match_table = sti_dwmac_match, }, }; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c index c3d321192581f..1eb16eec9c0d2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c @@ -119,7 +119,7 @@ struct stm32_ops { u32 syscfg_clr_off; }; -static int stm32_dwmac_clk_enable(struct stm32_dwmac *dwmac, bool resume) +static int stm32_dwmac_clk_enable(struct stm32_dwmac *dwmac) { int ret; @@ -127,11 +127,9 @@ static int stm32_dwmac_clk_enable(struct stm32_dwmac *dwmac, bool resume) if (ret) goto err_clk_tx; - if (!dwmac->ops->clk_rx_enable_in_suspend || !resume) { - ret = clk_prepare_enable(dwmac->clk_rx); - if (ret) - goto err_clk_rx; - } + ret = clk_prepare_enable(dwmac->clk_rx); + if (ret) + goto err_clk_rx; ret = clk_prepare_enable(dwmac->syscfg_clk); if (ret) @@ -148,15 +146,14 @@ static int stm32_dwmac_clk_enable(struct stm32_dwmac *dwmac, bool resume) err_clk_eth_ck: clk_disable_unprepare(dwmac->syscfg_clk); err_syscfg_clk: - if (!dwmac->ops->clk_rx_enable_in_suspend || !resume) - clk_disable_unprepare(dwmac->clk_rx); + clk_disable_unprepare(dwmac->clk_rx); err_clk_rx: clk_disable_unprepare(dwmac->clk_tx); err_clk_tx: return ret; } -static int stm32_dwmac_init(struct plat_stmmacenet_data *plat_dat, bool resume) +static int stm32_dwmac_init(struct plat_stmmacenet_data *plat_dat) { struct stm32_dwmac *dwmac = plat_dat->bsp_priv; int ret; @@ -167,7 +164,7 @@ static int stm32_dwmac_init(struct plat_stmmacenet_data *plat_dat, bool resume) return ret; } - return stm32_dwmac_clk_enable(dwmac, resume); + return stm32_dwmac_clk_enable(dwmac); } static int stm32mp1_select_ethck_external(struct plat_stmmacenet_data *plat_dat) @@ -382,12 +379,10 @@ static int stm32mcu_set_mode(struct plat_stmmacenet_data *plat_dat) SYSCFG_MCU_ETH_MASK, val << 23); } -static void stm32_dwmac_clk_disable(struct stm32_dwmac *dwmac, bool suspend) +static void stm32_dwmac_clk_disable(struct stm32_dwmac *dwmac) { clk_disable_unprepare(dwmac->clk_tx); - if (!dwmac->ops->clk_rx_enable_in_suspend || !suspend) - clk_disable_unprepare(dwmac->clk_rx); - + clk_disable_unprepare(dwmac->clk_rx); clk_disable_unprepare(dwmac->syscfg_clk); if (dwmac->enable_eth_ck) clk_disable_unprepare(dwmac->clk_eth_ck); @@ -541,18 +536,32 @@ static int stm32_dwmac_probe(struct platform_device *pdev) plat_dat->flags |= STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP; plat_dat->bsp_priv = dwmac; - ret = stm32_dwmac_init(plat_dat, false); + ret = stm32_dwmac_init(plat_dat); if (ret) return ret; + /* If this platform requires the clock to be running in suspend, + * prepare and enable the receive clock an additional time to keep + * it running. + */ + if (dwmac->ops->clk_rx_enable_in_suspend) { + ret = clk_prepare_enable(dwmac->clk_rx); + if (ret) + goto err_clk_disable; + } + ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); if (ret) - goto err_clk_disable; + goto err_clk_disable_suspend; return 0; +err_clk_disable_suspend: + if (dwmac->ops->clk_rx_enable_in_suspend) + clk_disable_unprepare(dwmac->clk_rx); + err_clk_disable: - stm32_dwmac_clk_disable(dwmac, false); + stm32_dwmac_clk_disable(dwmac); return ret; } @@ -565,7 +574,15 @@ static void stm32_dwmac_remove(struct platform_device *pdev) stmmac_dvr_remove(&pdev->dev); - stm32_dwmac_clk_disable(dwmac, false); + /* If this platform requires the clock to be running in suspend, + * we need to disable and unprepare the receive clock an additional + * time to balance the extra clk_prepare_enable() in the probe + * function. + */ + if (dwmac->ops->clk_rx_enable_in_suspend) + clk_disable_unprepare(dwmac->clk_rx); + + stm32_dwmac_clk_disable(dwmac); if (dwmac->irq_pwr_wakeup >= 0) { dev_pm_clear_wake_irq(&pdev->dev); @@ -596,7 +613,7 @@ static int stm32_dwmac_suspend(struct device *dev) if (ret) return ret; - stm32_dwmac_clk_disable(dwmac, true); + stm32_dwmac_clk_disable(dwmac); if (dwmac->ops->suspend) ret = dwmac->ops->suspend(dwmac); @@ -614,7 +631,7 @@ static int stm32_dwmac_resume(struct device *dev) if (dwmac->ops->resume) dwmac->ops->resume(dwmac); - ret = stm32_dwmac_init(priv->plat, true); + ret = stm32_dwmac_init(priv->plat); if (ret) return ret; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 85723a78793ab..fd6518e252e38 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1239,14 +1239,10 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - ret = sun8i_dwmac_init(pdev, plat_dat->bsp_priv); + ret = stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); if (ret) goto dwmac_syscon; - ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); - if (ret) - goto dwmac_exit; - ndev = dev_get_drvdata(&pdev->dev); priv = netdev_priv(ndev); @@ -1283,9 +1279,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) clk_put(gmac->ephy_clk); dwmac_remove: pm_runtime_put_noidle(&pdev->dev); - stmmac_dvr_remove(&pdev->dev); -dwmac_exit: - sun8i_dwmac_exit(pdev, gmac); + stmmac_pltfr_remove(pdev); dwmac_syscon: sun8i_dwmac_unset_syscon(gmac); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c index 9f098ff0ff058..1eadcf5d1ad63 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c @@ -72,28 +72,28 @@ static void sun7i_gmac_exit(struct platform_device *pdev, void *priv) regulator_disable(gmac->regulator); } -static void sun7i_fix_speed(void *priv, int speed, unsigned int mode) +static int sun7i_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, + phy_interface_t interface, int speed) { - struct sunxi_priv_data *gmac = priv; - - /* only GMII mode requires us to reconfigure the clock lines */ - if (gmac->interface != PHY_INTERFACE_MODE_GMII) - return; - - if (gmac->clk_enabled) { - clk_disable(gmac->tx_clk); - gmac->clk_enabled = 0; - } - clk_unprepare(gmac->tx_clk); - - if (speed == 1000) { - clk_set_rate(gmac->tx_clk, SUN7I_GMAC_GMII_RGMII_RATE); - clk_prepare_enable(gmac->tx_clk); - gmac->clk_enabled = 1; - } else { - clk_set_rate(gmac->tx_clk, SUN7I_GMAC_MII_RATE); - clk_prepare(gmac->tx_clk); + struct sunxi_priv_data *gmac = bsp_priv; + + if (interface == PHY_INTERFACE_MODE_GMII) { + if (gmac->clk_enabled) { + clk_disable(gmac->tx_clk); + gmac->clk_enabled = 0; + } + clk_unprepare(gmac->tx_clk); + + if (speed == 1000) { + clk_set_rate(gmac->tx_clk, SUN7I_GMAC_GMII_RGMII_RATE); + clk_prepare_enable(gmac->tx_clk); + gmac->clk_enabled = 1; + } else { + clk_set_rate(gmac->tx_clk, SUN7I_GMAC_MII_RATE); + clk_prepare(gmac->tx_clk); + } } + return 0; } static int sun7i_gmac_probe(struct platform_device *pdev) @@ -140,24 +140,11 @@ static int sun7i_gmac_probe(struct platform_device *pdev) plat_dat->bsp_priv = gmac; plat_dat->init = sun7i_gmac_init; plat_dat->exit = sun7i_gmac_exit; - plat_dat->fix_mac_speed = sun7i_fix_speed; + plat_dat->set_clk_tx_rate = sun7i_set_clk_tx_rate; plat_dat->tx_fifo_size = 4096; plat_dat->rx_fifo_size = 16384; - ret = sun7i_gmac_init(pdev, plat_dat->bsp_priv); - if (ret) - return ret; - - ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); - if (ret) - goto err_gmac_exit; - - return 0; - -err_gmac_exit: - sun7i_gmac_exit(pdev, plat_dat->bsp_priv); - - return ret; + return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); } static const struct of_device_id sun7i_dwmac_match[] = { @@ -168,7 +155,6 @@ MODULE_DEVICE_TABLE(of, sun7i_dwmac_match); static struct platform_driver sun7i_dwmac_driver = { .probe = sun7i_gmac_probe, - .remove = stmmac_pltfr_remove, .driver = { .name = "sun7i-dwmac", .pm = &stmmac_pltfr_pm_ops, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c index 33cf99797df53..5e6ac82a89b9f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -51,21 +51,14 @@ struct visconti_eth { u32 phy_intf_sel; struct clk *phy_ref_clk; struct device *dev; - spinlock_t lock; /* lock to protect register update */ }; -static void visconti_eth_fix_mac_speed(void *priv, int speed, unsigned int mode) +static int visconti_eth_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, + phy_interface_t interface, int speed) { - struct visconti_eth *dwmac = priv; + struct visconti_eth *dwmac = bsp_priv; struct net_device *netdev = dev_get_drvdata(dwmac->dev); unsigned int val, clk_sel_val = 0; - unsigned long flags; - - spin_lock_irqsave(&dwmac->lock, flags); - - /* adjust link */ - val = readl(dwmac->reg + MAC_CTRL_REG); - val &= ~(GMAC_CONFIG_PS | GMAC_CONFIG_FES); switch (speed) { case SPEED_1000: @@ -77,24 +70,19 @@ static void visconti_eth_fix_mac_speed(void *priv, int speed, unsigned int mode) clk_sel_val = ETHER_CLK_SEL_FREQ_SEL_25M; if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RMII) clk_sel_val = ETHER_CLK_SEL_DIV_SEL_2; - val |= GMAC_CONFIG_PS | GMAC_CONFIG_FES; break; case SPEED_10: if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RGMII) clk_sel_val = ETHER_CLK_SEL_FREQ_SEL_2P5M; if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RMII) clk_sel_val = ETHER_CLK_SEL_DIV_SEL_20; - val |= GMAC_CONFIG_PS; break; default: /* No bit control */ netdev_err(netdev, "Unsupported speed request (%d)", speed); - spin_unlock_irqrestore(&dwmac->lock, flags); - return; + return -EINVAL; } - writel(val, dwmac->reg + MAC_CTRL_REG); - /* Stop internal clock */ val = readl(dwmac->reg + REG_ETHER_CLOCK_SEL); val &= ~(ETHER_CLK_SEL_RMII_CLK_EN | ETHER_CLK_SEL_RX_TX_CLK_EN); @@ -136,7 +124,7 @@ static void visconti_eth_fix_mac_speed(void *priv, int speed, unsigned int mode) break; } - spin_unlock_irqrestore(&dwmac->lock, flags); + return 0; } static int visconti_eth_init_hw(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat) @@ -228,11 +216,10 @@ static int visconti_eth_dwmac_probe(struct platform_device *pdev) if (!dwmac) return -ENOMEM; - spin_lock_init(&dwmac->lock); dwmac->reg = stmmac_res.addr; dwmac->dev = &pdev->dev; plat_dat->bsp_priv = dwmac; - plat_dat->fix_mac_speed = visconti_eth_fix_mac_speed; + plat_dat->set_clk_tx_rate = visconti_eth_set_clk_tx_rate; ret = visconti_eth_clock_probe(pdev, plat_dat); if (ret) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 42fe29a4e300b..5f387ec27c8cc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -31,7 +31,6 @@ #define GMAC_RXQ_CTRL3 0x000000ac #define GMAC_INT_STATUS 0x000000b0 #define GMAC_INT_EN 0x000000b4 -#define GMAC_1US_TIC_COUNTER 0x000000dc #define GMAC_PCS_BASE 0x000000e0 #define GMAC_PHYIF_CONTROL_STATUS 0x000000f8 #define GMAC_PMT 0x000000c0 diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 918a32f8fda80..844f7d516a40b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -37,7 +37,7 @@ #define ETHTOOL_DMA_OFFSET 55 struct stmmac_stats { - char stat_string[ETH_GSTRING_LEN]; + char stat_string[ETH_GSTRING_LEN] __nonstring; int sizeof_stat; int stat_offset; }; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index c73eff6a56b87..43c869f64c39c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -709,6 +709,17 @@ devm_stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) #endif /* CONFIG_OF */ EXPORT_SYMBOL_GPL(devm_stmmac_probe_config_dt); +struct clk *stmmac_pltfr_find_clk(struct plat_stmmacenet_data *plat_dat, + const char *name) +{ + for (int i = 0; i < plat_dat->num_clks; i++) + if (strcmp(plat_dat->clks[i].id, name) == 0) + return plat_dat->clks[i].clk; + + return NULL; +} +EXPORT_SYMBOL_GPL(stmmac_pltfr_find_clk); + int stmmac_get_platform_resources(struct platform_device *pdev, struct stmmac_resources *stmmac_res) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h index 72dc1a32e46d5..6e6561e29d6e3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h @@ -14,6 +14,9 @@ struct plat_stmmacenet_data * devm_stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac); +struct clk *stmmac_pltfr_find_clk(struct plat_stmmacenet_data *plat_dat, + const char *name); + int stmmac_get_platform_resources(struct platform_device *pdev, struct stmmac_resources *stmmac_res); diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 1e6d2335293df..988ce91193060 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -427,7 +427,7 @@ static void am65_cpsw_nuss_ndo_host_tx_timeout(struct net_device *ndev, if (netif_tx_queue_stopped(netif_txq)) { /* try recover if stopped by us */ - txq_trans_update(netif_txq); + txq_trans_update(ndev, netif_txq); netif_tx_wake_queue(netif_txq); } } @@ -2679,7 +2679,9 @@ static int am65_cpsw_nuss_init_slave_ports(struct am65_cpsw_common *common) goto of_node_put; ret = of_get_mac_address(port_np, port->slave.mac_addr); - if (ret) { + if (ret == -EPROBE_DEFER) { + goto of_node_put; + } else if (ret) { am65_cpsw_am654_get_efuse_macid(port_np, port->port_id, port->slave.mac_addr); @@ -2760,7 +2762,7 @@ am65_cpsw_nuss_init_port_ndev(struct am65_cpsw_common *common, u32 port_idx) mutex_init(&ndev_priv->mm_lock); port->qos.link_speed = SPEED_UNKNOWN; SET_NETDEV_DEV(port->ndev, dev); - port->ndev->dev.of_node = port->slave.port_np; + device_set_node(&port->ndev->dev, of_fwnode_handle(port->slave.port_np)); eth_hw_addr_set(port->ndev, port->slave.mac_addr); @@ -3561,6 +3563,16 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) return ret; } + am65_cpsw_nuss_get_ver(common); + + ret = am65_cpsw_nuss_init_host_p(common); + if (ret) + goto err_pm_clear; + + ret = am65_cpsw_nuss_init_slave_ports(common); + if (ret) + goto err_pm_clear; + node = of_get_child_by_name(dev->of_node, "mdio"); if (!node) { dev_warn(dev, "MDIO node not found\n"); @@ -3577,16 +3589,6 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) } of_node_put(node); - am65_cpsw_nuss_get_ver(common); - - ret = am65_cpsw_nuss_init_host_p(common); - if (ret) - goto err_of_clear; - - ret = am65_cpsw_nuss_init_slave_ports(common); - if (ret) - goto err_of_clear; - /* init common data */ ale_params.dev = dev; ale_params.ale_ageout = AM65_CPSW_ALE_AGEOUT_DEFAULT; diff --git a/drivers/net/ethernet/wangxun/Kconfig b/drivers/net/ethernet/wangxun/Kconfig index 47e3e8434b9ef..e5fc942c28ccc 100644 --- a/drivers/net/ethernet/wangxun/Kconfig +++ b/drivers/net/ethernet/wangxun/Kconfig @@ -40,7 +40,7 @@ config NGBE will be called ngbe. config TXGBE - tristate "Wangxun(R) 10GbE PCI Express adapters support" + tristate "Wangxun(R) 10/25/40GbE PCI Express adapters support" depends on PCI depends on COMMON_CLK depends on I2C_DESIGNWARE_PLATFORM @@ -55,7 +55,7 @@ config TXGBE select PCS_XPCS select LIBWX help - This driver supports Wangxun(R) 10GbE PCI Express family of + This driver supports Wangxun(R) 10/25/40GbE PCI Express family of adapters. More specific information on configuring the driver is in diff --git a/drivers/net/ethernet/wangxun/libwx/Makefile b/drivers/net/ethernet/wangxun/libwx/Makefile index e9f0f1f2309bf..9b78b604a94e4 100644 --- a/drivers/net/ethernet/wangxun/libwx/Makefile +++ b/drivers/net/ethernet/wangxun/libwx/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_LIBWX) += libwx.o -libwx-objs := wx_hw.o wx_lib.o wx_ethtool.o wx_ptp.o +libwx-objs := wx_hw.o wx_lib.o wx_ethtool.o wx_ptp.o wx_mbx.o wx_sriov.o diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index aed45abafb1b7..3c3aa5f4ebbff 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -10,6 +10,7 @@ #include "wx_type.h" #include "wx_lib.h" +#include "wx_sriov.h" #include "wx_hw.h" static int wx_phy_read_reg_mdi(struct mii_bus *bus, int phy_addr, int devnum, int regnum) @@ -964,11 +965,28 @@ static void wx_sync_mac_table(struct wx *wx) } } +static void wx_full_sync_mac_table(struct wx *wx) +{ + int i; + + for (i = 0; i < wx->mac.num_rar_entries; i++) { + if (wx->mac_table[i].state & WX_MAC_STATE_IN_USE) { + wx_set_rar(wx, i, + wx->mac_table[i].addr, + wx->mac_table[i].pools, + WX_PSR_MAC_SWC_AD_H_AV); + } else { + wx_clear_rar(wx, i); + } + wx->mac_table[i].state &= ~(WX_MAC_STATE_MODIFIED); + } +} + /* this function destroys the first RAR entry */ void wx_mac_set_default_filter(struct wx *wx, u8 *addr) { memcpy(&wx->mac_table[0].addr, addr, ETH_ALEN); - wx->mac_table[0].pools = 1ULL; + wx->mac_table[0].pools = BIT(VMDQ_P(0)); wx->mac_table[0].state = (WX_MAC_STATE_DEFAULT | WX_MAC_STATE_IN_USE); wx_set_rar(wx, 0, wx->mac_table[0].addr, wx->mac_table[0].pools, @@ -993,7 +1011,7 @@ void wx_flush_sw_mac_table(struct wx *wx) } EXPORT_SYMBOL(wx_flush_sw_mac_table); -static int wx_add_mac_filter(struct wx *wx, u8 *addr, u16 pool) +int wx_add_mac_filter(struct wx *wx, u8 *addr, u16 pool) { u32 i; @@ -1024,7 +1042,7 @@ static int wx_add_mac_filter(struct wx *wx, u8 *addr, u16 pool) return -ENOMEM; } -static int wx_del_mac_filter(struct wx *wx, u8 *addr, u16 pool) +int wx_del_mac_filter(struct wx *wx, u8 *addr, u16 pool) { u32 i; @@ -1206,6 +1224,35 @@ static void wx_update_mc_addr_list(struct wx *wx, struct net_device *netdev) wx_dbg(wx, "Update mc addr list Complete\n"); } +static void wx_restore_vf_multicasts(struct wx *wx) +{ + u32 i, j, vector_bit, vector_reg; + struct vf_data_storage *vfinfo; + + for (i = 0; i < wx->num_vfs; i++) { + u32 vmolr = rd32(wx, WX_PSR_VM_L2CTL(i)); + + vfinfo = &wx->vfinfo[i]; + for (j = 0; j < vfinfo->num_vf_mc_hashes; j++) { + wx->addr_ctrl.mta_in_use++; + vector_reg = WX_PSR_MC_TBL_REG(vfinfo->vf_mc_hashes[j]); + vector_bit = WX_PSR_MC_TBL_BIT(vfinfo->vf_mc_hashes[j]); + wr32m(wx, WX_PSR_MC_TBL(vector_reg), + BIT(vector_bit), BIT(vector_bit)); + /* errata 5: maintain a copy of the reg table conf */ + wx->mac.mta_shadow[vector_reg] |= BIT(vector_bit); + } + if (vfinfo->num_vf_mc_hashes) + vmolr |= WX_PSR_VM_L2CTL_ROMPE; + else + vmolr &= ~WX_PSR_VM_L2CTL_ROMPE; + wr32(wx, WX_PSR_VM_L2CTL(i), vmolr); + } + + /* Restore any VF macvlans */ + wx_full_sync_mac_table(wx); +} + /** * wx_write_mc_addr_list - write multicast addresses to MTA * @netdev: network interface device structure @@ -1223,6 +1270,9 @@ static int wx_write_mc_addr_list(struct net_device *netdev) wx_update_mc_addr_list(wx, netdev); + if (test_bit(WX_FLAG_SRIOV_ENABLED, wx->flags)) + wx_restore_vf_multicasts(wx); + return netdev_mc_count(netdev); } @@ -1243,7 +1293,7 @@ int wx_set_mac(struct net_device *netdev, void *p) if (retval) return retval; - wx_del_mac_filter(wx, wx->mac.addr, 0); + wx_del_mac_filter(wx, wx->mac.addr, VMDQ_P(0)); eth_hw_addr_set(netdev, addr->sa_data); memcpy(wx->mac.addr, addr->sa_data, netdev->addr_len); @@ -1345,6 +1395,10 @@ static int wx_hpbthresh(struct wx *wx) /* Calculate delay value for device */ dv_id = WX_DV(link, tc); + /* Loopback switch introduces additional latency */ + if (test_bit(WX_FLAG_SRIOV_ENABLED, wx->flags)) + dv_id += WX_B2BT(tc); + /* Delay value is calculated in bit times convert to KB */ kb = WX_BT2KB(dv_id); rx_pba = rd32(wx, WX_RDB_PB_SZ(0)) >> WX_RDB_PB_SZ_SHIFT; @@ -1400,12 +1454,107 @@ static void wx_pbthresh_setup(struct wx *wx) wx->fc.low_water = 0; } +static void wx_set_ethertype_anti_spoofing(struct wx *wx, bool enable, int vf) +{ + u32 pfvfspoof, reg_offset, vf_shift; + + vf_shift = WX_VF_IND_SHIFT(vf); + reg_offset = WX_VF_REG_OFFSET(vf); + + pfvfspoof = rd32(wx, WX_TDM_ETYPE_AS(reg_offset)); + if (enable) + pfvfspoof |= BIT(vf_shift); + else + pfvfspoof &= ~BIT(vf_shift); + wr32(wx, WX_TDM_ETYPE_AS(reg_offset), pfvfspoof); +} + +int wx_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting) +{ + u32 index = WX_VF_REG_OFFSET(vf), vf_bit = WX_VF_IND_SHIFT(vf); + struct wx *wx = netdev_priv(netdev); + u32 regval; + + if (vf >= wx->num_vfs) + return -EINVAL; + + wx->vfinfo[vf].spoofchk_enabled = setting; + + regval = (setting << vf_bit); + wr32m(wx, WX_TDM_MAC_AS(index), regval | BIT(vf_bit), regval); + + if (wx->vfinfo[vf].vlan_count) + wr32m(wx, WX_TDM_VLAN_AS(index), regval | BIT(vf_bit), regval); + + return 0; +} + +static void wx_configure_virtualization(struct wx *wx) +{ + u16 pool = wx->num_rx_pools; + u32 reg_offset, vf_shift; + u32 i; + + if (!test_bit(WX_FLAG_SRIOV_ENABLED, wx->flags)) + return; + + wr32m(wx, WX_PSR_VM_CTL, + WX_PSR_VM_CTL_POOL_MASK | WX_PSR_VM_CTL_REPLEN, + FIELD_PREP(WX_PSR_VM_CTL_POOL_MASK, VMDQ_P(0)) | + WX_PSR_VM_CTL_REPLEN); + while (pool--) + wr32m(wx, WX_PSR_VM_L2CTL(pool), + WX_PSR_VM_L2CTL_AUPE, WX_PSR_VM_L2CTL_AUPE); + + if (wx->mac.type == wx_mac_em) { + vf_shift = BIT(VMDQ_P(0)); + /* Enable only the PF pools for Tx/Rx */ + wr32(wx, WX_RDM_VF_RE(0), vf_shift); + wr32(wx, WX_TDM_VF_TE(0), vf_shift); + } else { + vf_shift = WX_VF_IND_SHIFT(VMDQ_P(0)); + reg_offset = WX_VF_REG_OFFSET(VMDQ_P(0)); + + /* Enable only the PF pools for Tx/Rx */ + wr32(wx, WX_RDM_VF_RE(reg_offset), GENMASK(31, vf_shift)); + wr32(wx, WX_RDM_VF_RE(reg_offset ^ 1), reg_offset - 1); + wr32(wx, WX_TDM_VF_TE(reg_offset), GENMASK(31, vf_shift)); + wr32(wx, WX_TDM_VF_TE(reg_offset ^ 1), reg_offset - 1); + } + + /* clear VLAN promisc flag so VFTA will be updated if necessary */ + clear_bit(WX_FLAG_VLAN_PROMISC, wx->flags); + + for (i = 0; i < wx->num_vfs; i++) { + if (!wx->vfinfo[i].spoofchk_enabled) + wx_set_vf_spoofchk(wx->netdev, i, false); + /* enable ethertype anti spoofing if hw supports it */ + wx_set_ethertype_anti_spoofing(wx, true, i); + } +} + static void wx_configure_port(struct wx *wx) { u32 value, i; - value = WX_CFG_PORT_CTL_D_VLAN | WX_CFG_PORT_CTL_QINQ; + if (wx->mac.type == wx_mac_em) { + value = (wx->num_vfs == 0) ? + WX_CFG_PORT_CTL_NUM_VT_NONE : + WX_CFG_PORT_CTL_NUM_VT_8; + } else { + if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) { + if (wx->ring_feature[RING_F_RSS].indices == 4) + value = WX_CFG_PORT_CTL_NUM_VT_32; + else + value = WX_CFG_PORT_CTL_NUM_VT_64; + } else { + value = 0; + } + } + + value |= WX_CFG_PORT_CTL_D_VLAN | WX_CFG_PORT_CTL_QINQ; wr32m(wx, WX_CFG_PORT_CTL, + WX_CFG_PORT_CTL_NUM_VT_MASK | WX_CFG_PORT_CTL_D_VLAN | WX_CFG_PORT_CTL_QINQ, value); @@ -1466,6 +1615,83 @@ static void wx_vlan_strip_control(struct wx *wx, bool enable) } } +static void wx_vlan_promisc_enable(struct wx *wx) +{ + u32 vlnctrl, i, vind, bits, reg_idx; + + vlnctrl = rd32(wx, WX_PSR_VLAN_CTL); + if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) { + /* we need to keep the VLAN filter on in SRIOV */ + vlnctrl |= WX_PSR_VLAN_CTL_VFE; + wr32(wx, WX_PSR_VLAN_CTL, vlnctrl); + } else { + vlnctrl &= ~WX_PSR_VLAN_CTL_VFE; + wr32(wx, WX_PSR_VLAN_CTL, vlnctrl); + return; + } + /* We are already in VLAN promisc, nothing to do */ + if (test_bit(WX_FLAG_VLAN_PROMISC, wx->flags)) + return; + /* Set flag so we don't redo unnecessary work */ + set_bit(WX_FLAG_VLAN_PROMISC, wx->flags); + /* Add PF to all active pools */ + for (i = WX_PSR_VLAN_SWC_ENTRIES; --i;) { + wr32(wx, WX_PSR_VLAN_SWC_IDX, i); + vind = WX_VF_IND_SHIFT(VMDQ_P(0)); + reg_idx = WX_VF_REG_OFFSET(VMDQ_P(0)); + bits = rd32(wx, WX_PSR_VLAN_SWC_VM(reg_idx)); + bits |= BIT(vind); + wr32(wx, WX_PSR_VLAN_SWC_VM(reg_idx), bits); + } + /* Set all bits in the VLAN filter table array */ + for (i = 0; i < wx->mac.vft_size; i++) + wr32(wx, WX_PSR_VLAN_TBL(i), U32_MAX); +} + +static void wx_scrub_vfta(struct wx *wx) +{ + u32 i, vid, bits, vfta, vind, vlvf, reg_idx; + + for (i = WX_PSR_VLAN_SWC_ENTRIES; --i;) { + wr32(wx, WX_PSR_VLAN_SWC_IDX, i); + vlvf = rd32(wx, WX_PSR_VLAN_SWC_IDX); + /* pull VLAN ID from VLVF */ + vid = vlvf & ~WX_PSR_VLAN_SWC_VIEN; + if (vlvf & WX_PSR_VLAN_SWC_VIEN) { + /* if PF is part of this then continue */ + if (test_bit(vid, wx->active_vlans)) + continue; + } + /* remove PF from the pool */ + vind = WX_VF_IND_SHIFT(VMDQ_P(0)); + reg_idx = WX_VF_REG_OFFSET(VMDQ_P(0)); + bits = rd32(wx, WX_PSR_VLAN_SWC_VM(reg_idx)); + bits &= ~BIT(vind); + wr32(wx, WX_PSR_VLAN_SWC_VM(reg_idx), bits); + } + /* extract values from vft_shadow and write back to VFTA */ + for (i = 0; i < wx->mac.vft_size; i++) { + vfta = wx->mac.vft_shadow[i]; + wr32(wx, WX_PSR_VLAN_TBL(i), vfta); + } +} + +static void wx_vlan_promisc_disable(struct wx *wx) +{ + u32 vlnctrl; + + /* configure vlan filtering */ + vlnctrl = rd32(wx, WX_PSR_VLAN_CTL); + vlnctrl |= WX_PSR_VLAN_CTL_VFE; + wr32(wx, WX_PSR_VLAN_CTL, vlnctrl); + /* We are not in VLAN promisc, nothing to do */ + if (!test_bit(WX_FLAG_VLAN_PROMISC, wx->flags)) + return; + /* Set flag so we don't redo unnecessary work */ + clear_bit(WX_FLAG_VLAN_PROMISC, wx->flags); + wx_scrub_vfta(wx); +} + void wx_set_rx_mode(struct net_device *netdev) { struct wx *wx = netdev_priv(netdev); @@ -1478,7 +1704,7 @@ void wx_set_rx_mode(struct net_device *netdev) /* Check for Promiscuous and All Multicast modes */ fctrl = rd32(wx, WX_PSR_CTL); fctrl &= ~(WX_PSR_CTL_UPE | WX_PSR_CTL_MPE); - vmolr = rd32(wx, WX_PSR_VM_L2CTL(0)); + vmolr = rd32(wx, WX_PSR_VM_L2CTL(VMDQ_P(0))); vmolr &= ~(WX_PSR_VM_L2CTL_UPE | WX_PSR_VM_L2CTL_MPE | WX_PSR_VM_L2CTL_ROPE | @@ -1499,7 +1725,10 @@ void wx_set_rx_mode(struct net_device *netdev) fctrl |= WX_PSR_CTL_UPE | WX_PSR_CTL_MPE; /* pf don't want packets routing to vf, so clear UPE */ vmolr |= WX_PSR_VM_L2CTL_MPE; - vlnctrl &= ~WX_PSR_VLAN_CTL_VFE; + if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags) && + test_bit(WX_FLAG_SRIOV_ENABLED, wx->flags)) + vlnctrl |= WX_PSR_VLAN_CTL_VFE; + features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; } if (netdev->flags & IFF_ALLMULTI) { @@ -1522,7 +1751,7 @@ void wx_set_rx_mode(struct net_device *netdev) * sufficient space to store all the addresses then enable * unicast promiscuous mode */ - count = wx_write_uc_addr_list(netdev, 0); + count = wx_write_uc_addr_list(netdev, VMDQ_P(0)); if (count < 0) { vmolr &= ~WX_PSR_VM_L2CTL_ROPE; vmolr |= WX_PSR_VM_L2CTL_UPE; @@ -1540,7 +1769,7 @@ void wx_set_rx_mode(struct net_device *netdev) wr32(wx, WX_PSR_VLAN_CTL, vlnctrl); wr32(wx, WX_PSR_CTL, fctrl); - wr32(wx, WX_PSR_VM_L2CTL(0), vmolr); + wr32(wx, WX_PSR_VM_L2CTL(VMDQ_P(0)), vmolr); if ((features & NETIF_F_HW_VLAN_CTAG_RX) && (features & NETIF_F_HW_VLAN_STAG_RX)) @@ -1548,6 +1777,10 @@ void wx_set_rx_mode(struct net_device *netdev) else wx_vlan_strip_control(wx, false); + if (features & NETIF_F_HW_VLAN_CTAG_FILTER) + wx_vlan_promisc_disable(wx); + else + wx_vlan_promisc_enable(wx); } EXPORT_SYMBOL(wx_set_rx_mode); @@ -1797,6 +2030,13 @@ static void wx_setup_reta(struct wx *wx) u32 random_key_size = WX_RSS_KEY_SIZE / 4; u32 i, j; + if (test_bit(WX_FLAG_SRIOV_ENABLED, wx->flags)) { + if (wx->mac.type == wx_mac_em) + rss_i = 1; + else + rss_i = rss_i < 4 ? 4 : rss_i; + } + /* Fill out hash function seeds */ for (i = 0; i < random_key_size; i++) wr32(wx, WX_RDB_RSSRK(i), wx->rss_key[i]); @@ -1814,10 +2054,42 @@ static void wx_setup_reta(struct wx *wx) wx_store_reta(wx); } +#define WX_RDB_RSS_PL_2 FIELD_PREP(GENMASK(31, 29), 1) +#define WX_RDB_RSS_PL_4 FIELD_PREP(GENMASK(31, 29), 2) +static void wx_setup_psrtype(struct wx *wx) +{ + int rss_i = wx->ring_feature[RING_F_RSS].indices; + u32 psrtype; + int pool; + + psrtype = WX_RDB_PL_CFG_L4HDR | + WX_RDB_PL_CFG_L3HDR | + WX_RDB_PL_CFG_L2HDR | + WX_RDB_PL_CFG_TUN_OUTL2HDR | + WX_RDB_PL_CFG_TUN_TUNHDR; + + if (wx->mac.type == wx_mac_em) { + for_each_set_bit(pool, &wx->fwd_bitmask, 8) + wr32(wx, WX_RDB_PL_CFG(VMDQ_P(pool)), psrtype); + } else { + if (rss_i > 3) + psrtype |= WX_RDB_RSS_PL_4; + else if (rss_i > 1) + psrtype |= WX_RDB_RSS_PL_2; + + for_each_set_bit(pool, &wx->fwd_bitmask, 32) + wr32(wx, WX_RDB_PL_CFG(VMDQ_P(pool)), psrtype); + } +} + static void wx_setup_mrqc(struct wx *wx) { u32 rss_field = 0; + /* VT, and RSS do not coexist at the same time */ + if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) + return; + /* Disable indicating checksum in descriptor, enables RSS hash */ wr32m(wx, WX_PSR_CTL, WX_PSR_CTL_PCSD, WX_PSR_CTL_PCSD); @@ -1847,16 +2119,11 @@ static void wx_setup_mrqc(struct wx *wx) **/ void wx_configure_rx(struct wx *wx) { - u32 psrtype, i; int ret; + u32 i; wx_disable_rx(wx); - - psrtype = WX_RDB_PL_CFG_L4HDR | - WX_RDB_PL_CFG_L3HDR | - WX_RDB_PL_CFG_L2HDR | - WX_RDB_PL_CFG_TUN_TUNHDR; - wr32(wx, WX_RDB_PL_CFG(0), psrtype); + wx_setup_psrtype(wx); /* enable hw crc stripping */ wr32m(wx, WX_RSC_CTL, WX_RSC_CTL_CRC_STRIP, WX_RSC_CTL_CRC_STRIP); @@ -1904,6 +2171,7 @@ void wx_configure(struct wx *wx) { wx_set_rxpba(wx); wx_pbthresh_setup(wx); + wx_configure_virtualization(wx); wx_configure_port(wx); wx_set_rx_mode(wx->netdev); @@ -2262,7 +2530,7 @@ static int wx_set_vlvf(struct wx *wx, u32 vlan, u32 vind, bool vlan_on, * * Turn on/off specified VLAN in the VLAN filter table. **/ -static int wx_set_vfta(struct wx *wx, u32 vlan, u32 vind, bool vlan_on) +int wx_set_vfta(struct wx *wx, u32 vlan, u32 vind, bool vlan_on) { u32 bitindex, vfta, targetbit; bool vfta_changed = false; diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.h b/drivers/net/ethernet/wangxun/libwx/wx_hw.h index b883342bb5767..91c1d6135045b 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.h @@ -26,9 +26,12 @@ void wx_init_eeprom_params(struct wx *wx); void wx_get_mac_addr(struct wx *wx, u8 *mac_addr); void wx_init_rx_addrs(struct wx *wx); void wx_mac_set_default_filter(struct wx *wx, u8 *addr); +int wx_add_mac_filter(struct wx *wx, u8 *addr, u16 pool); +int wx_del_mac_filter(struct wx *wx, u8 *addr, u16 pool); void wx_flush_sw_mac_table(struct wx *wx); int wx_set_mac(struct net_device *netdev, void *p); void wx_disable_rx(struct wx *wx); +int wx_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting); int wx_disable_sec_rx_path(struct wx *wx); void wx_enable_sec_rx_path(struct wx *wx); void wx_set_rx_mode(struct net_device *netdev); @@ -42,6 +45,7 @@ int wx_stop_adapter(struct wx *wx); void wx_reset_misc(struct wx *wx); int wx_get_pcie_msix_counts(struct wx *wx, u16 *msix_count, u16 max_msix_count); int wx_sw_init(struct wx *wx); +int wx_set_vfta(struct wx *wx, u32 vlan, u32 vind, bool vlan_on); int wx_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid); int wx_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid); int wx_fc_enable(struct wx *wx, bool tx_pause, bool rx_pause); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index e69eaa65e0de8..2a808afeb4142 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -1620,6 +1620,65 @@ void wx_napi_disable_all(struct wx *wx) } EXPORT_SYMBOL(wx_napi_disable_all); +static bool wx_set_vmdq_queues(struct wx *wx) +{ + u16 vmdq_i = wx->ring_feature[RING_F_VMDQ].limit; + u16 rss_i = wx->ring_feature[RING_F_RSS].limit; + u16 rss_m = WX_RSS_DISABLED_MASK; + u16 vmdq_m = 0; + + /* only proceed if VMDq is enabled */ + if (!test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) + return false; + /* Add starting offset to total pool count */ + vmdq_i += wx->ring_feature[RING_F_VMDQ].offset; + + if (wx->mac.type == wx_mac_sp) { + /* double check we are limited to maximum pools */ + vmdq_i = min_t(u16, 64, vmdq_i); + + /* 64 pool mode with 2 queues per pool, or + * 16/32/64 pool mode with 1 queue per pool + */ + if (vmdq_i > 32 || rss_i < 4) { + vmdq_m = WX_VMDQ_2Q_MASK; + rss_m = WX_RSS_2Q_MASK; + rss_i = min_t(u16, rss_i, 2); + /* 32 pool mode with 4 queues per pool */ + } else { + vmdq_m = WX_VMDQ_4Q_MASK; + rss_m = WX_RSS_4Q_MASK; + rss_i = 4; + } + } else { + /* double check we are limited to maximum pools */ + vmdq_i = min_t(u16, 8, vmdq_i); + + /* when VMDQ on, disable RSS */ + rss_i = 1; + } + + /* remove the starting offset from the pool count */ + vmdq_i -= wx->ring_feature[RING_F_VMDQ].offset; + + /* save features for later use */ + wx->ring_feature[RING_F_VMDQ].indices = vmdq_i; + wx->ring_feature[RING_F_VMDQ].mask = vmdq_m; + + /* limit RSS based on user input and save for later use */ + wx->ring_feature[RING_F_RSS].indices = rss_i; + wx->ring_feature[RING_F_RSS].mask = rss_m; + + wx->queues_per_pool = rss_i;/*maybe same to num_rx_queues_per_pool*/ + wx->num_rx_pools = vmdq_i; + wx->num_rx_queues_per_pool = rss_i; + + wx->num_rx_queues = vmdq_i * rss_i; + wx->num_tx_queues = vmdq_i * rss_i; + + return true; +} + /** * wx_set_rss_queues: Allocate queues for RSS * @wx: board private structure to initialize @@ -1634,6 +1693,10 @@ static void wx_set_rss_queues(struct wx *wx) /* set mask for 16 queue limit of RSS */ f = &wx->ring_feature[RING_F_RSS]; + if (wx->mac.type == wx_mac_sp) + f->mask = WX_RSS_64Q_MASK; + else + f->mask = WX_RSS_8Q_MASK; f->indices = f->limit; if (!(test_bit(WX_FLAG_FDIR_CAPABLE, wx->flags))) @@ -1666,6 +1729,9 @@ static void wx_set_num_queues(struct wx *wx) wx->num_tx_queues = 1; wx->queues_per_pool = 1; + if (wx_set_vmdq_queues(wx)) + return; + wx_set_rss_queues(wx); } @@ -1746,6 +1812,10 @@ static int wx_set_interrupt_capability(struct wx *wx) if (ret == 0 || (ret == -ENOMEM)) return ret; + /* Disable VMDq support */ + dev_warn(&wx->pdev->dev, "Disabling VMQQ support\n"); + clear_bit(WX_FLAG_VMDQ_ENABLED, wx->flags); + /* Disable RSS */ dev_warn(&wx->pdev->dev, "Disabling RSS support\n"); wx->ring_feature[RING_F_RSS].limit = 1; @@ -1772,6 +1842,49 @@ static int wx_set_interrupt_capability(struct wx *wx) return 0; } +static bool wx_cache_ring_vmdq(struct wx *wx) +{ + struct wx_ring_feature *vmdq = &wx->ring_feature[RING_F_VMDQ]; + struct wx_ring_feature *rss = &wx->ring_feature[RING_F_RSS]; + u16 reg_idx; + int i; + + /* only proceed if VMDq is enabled */ + if (!test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) + return false; + + if (wx->mac.type == wx_mac_sp) { + /* start at VMDq register offset for SR-IOV enabled setups */ + reg_idx = vmdq->offset * __ALIGN_MASK(1, ~vmdq->mask); + for (i = 0; i < wx->num_rx_queues; i++, reg_idx++) { + /* If we are greater than indices move to next pool */ + if ((reg_idx & ~vmdq->mask) >= rss->indices) + reg_idx = __ALIGN_MASK(reg_idx, ~vmdq->mask); + wx->rx_ring[i]->reg_idx = reg_idx; + } + reg_idx = vmdq->offset * __ALIGN_MASK(1, ~vmdq->mask); + for (i = 0; i < wx->num_tx_queues; i++, reg_idx++) { + /* If we are greater than indices move to next pool */ + if ((reg_idx & rss->mask) >= rss->indices) + reg_idx = __ALIGN_MASK(reg_idx, ~vmdq->mask); + wx->tx_ring[i]->reg_idx = reg_idx; + } + } else { + /* start at VMDq register offset for SR-IOV enabled setups */ + reg_idx = vmdq->offset; + for (i = 0; i < wx->num_rx_queues; i++) + /* If we are greater than indices move to next pool */ + wx->rx_ring[i]->reg_idx = reg_idx + i; + + reg_idx = vmdq->offset; + for (i = 0; i < wx->num_tx_queues; i++) + /* If we are greater than indices move to next pool */ + wx->tx_ring[i]->reg_idx = reg_idx + i; + } + + return true; +} + /** * wx_cache_ring_rss - Descriptor ring to register mapping for RSS * @wx: board private structure to initialize @@ -1783,6 +1896,9 @@ static void wx_cache_ring_rss(struct wx *wx) { u16 i; + if (wx_cache_ring_vmdq(wx)) + return; + for (i = 0; i < wx->num_rx_queues; i++) wx->rx_ring[i]->reg_idx = i; @@ -2181,7 +2297,8 @@ static void wx_set_ivar(struct wx *wx, s8 direction, wr32(wx, WX_PX_MISC_IVAR, ivar); } else { /* tx or rx causes */ - msix_vector += 1; /* offset for queue vectors */ + if (!(wx->mac.type == wx_mac_em && wx->num_vfs == 7)) + msix_vector += 1; /* offset for queue vectors */ msix_vector |= WX_PX_IVAR_ALLOC_VAL; index = ((16 * (queue & 1)) + (8 * direction)); ivar = rd32(wx, WX_PX_IVAR(queue >> 1)); @@ -2233,10 +2350,17 @@ void wx_configure_vectors(struct wx *wx) { struct pci_dev *pdev = wx->pdev; u32 eitrsel = 0; - u16 v_idx; + u16 v_idx, i; if (pdev->msix_enabled) { /* Populate MSIX to EITR Select */ + if (wx->mac.type == wx_mac_sp) { + if (wx->num_vfs >= 32) + eitrsel = BIT(wx->num_vfs % 32) - 1; + } else if (wx->mac.type == wx_mac_em) { + for (i = 0; i < wx->num_vfs; i++) + eitrsel |= BIT(i); + } wr32(wx, WX_PX_ITRSEL, eitrsel); /* use EIAM to auto-mask when MSI-X interrupt is asserted * this saves a register write for every interrupt @@ -2876,6 +3000,33 @@ netdev_features_t wx_fix_features(struct net_device *netdev, } EXPORT_SYMBOL(wx_fix_features); +#define WX_MAX_TUNNEL_HDR_LEN 80 +netdev_features_t wx_features_check(struct sk_buff *skb, + struct net_device *netdev, + netdev_features_t features) +{ + struct wx *wx = netdev_priv(netdev); + + if (!skb->encapsulation) + return features; + + if (wx->mac.type == wx_mac_em) + return features & ~NETIF_F_CSUM_MASK; + + if (unlikely(skb_inner_mac_header(skb) - skb_transport_header(skb) > + WX_MAX_TUNNEL_HDR_LEN)) + return features & ~NETIF_F_CSUM_MASK; + + if (skb->inner_protocol_type == ENCAP_TYPE_ETHER && + skb->inner_protocol != htons(ETH_P_IP) && + skb->inner_protocol != htons(ETH_P_IPV6) && + skb->inner_protocol != htons(ETH_P_TEB)) + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); + + return features; +} +EXPORT_SYMBOL(wx_features_check); + void wx_set_ring(struct wx *wx, u32 new_tx_count, u32 new_rx_count, struct wx_ring *temp_ring) { diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.h b/drivers/net/ethernet/wangxun/libwx/wx_lib.h index fdeb0c315b755..919f499993089 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.h @@ -33,6 +33,9 @@ void wx_get_stats64(struct net_device *netdev, int wx_set_features(struct net_device *netdev, netdev_features_t features); netdev_features_t wx_fix_features(struct net_device *netdev, netdev_features_t features); +netdev_features_t wx_features_check(struct sk_buff *skb, + struct net_device *netdev, + netdev_features_t features); void wx_set_ring(struct wx *wx, u32 new_tx_count, u32 new_rx_count, struct wx_ring *temp_ring); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_mbx.c b/drivers/net/ethernet/wangxun/libwx/wx_mbx.c new file mode 100644 index 0000000000000..73af5f11c3bdc --- /dev/null +++ b/drivers/net/ethernet/wangxun/libwx/wx_mbx.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2015 - 2025 Beijing WangXun Technology Co., Ltd. */ + +#include +#include "wx_type.h" +#include "wx_mbx.h" + +/** + * wx_obtain_mbx_lock_pf - obtain mailbox lock + * @wx: pointer to the HW structure + * @vf: the VF index + * + * Return: return 0 on success and -EBUSY on failure + **/ +static int wx_obtain_mbx_lock_pf(struct wx *wx, u16 vf) +{ + int count = 5; + u32 mailbox; + + while (count--) { + /* Take ownership of the buffer */ + wr32(wx, WX_PXMAILBOX(vf), WX_PXMAILBOX_PFU); + + /* reserve mailbox for vf use */ + mailbox = rd32(wx, WX_PXMAILBOX(vf)); + if (mailbox & WX_PXMAILBOX_PFU) + return 0; + else if (count) + udelay(10); + } + wx_err(wx, "Failed to obtain mailbox lock for PF%d", vf); + + return -EBUSY; +} + +static int wx_check_for_bit_pf(struct wx *wx, u32 mask, int index) +{ + u32 mbvficr = rd32(wx, WX_MBVFICR(index)); + + if (!(mbvficr & mask)) + return -EBUSY; + wr32(wx, WX_MBVFICR(index), mask); + + return 0; +} + +/** + * wx_check_for_ack_pf - checks to see if the VF has acked + * @wx: pointer to the HW structure + * @vf: the VF index + * + * Return: return 0 if the VF has set the status bit or else -EBUSY + **/ +int wx_check_for_ack_pf(struct wx *wx, u16 vf) +{ + u32 index = vf / 16, vf_bit = vf % 16; + + return wx_check_for_bit_pf(wx, + FIELD_PREP(WX_MBVFICR_VFACK_MASK, + BIT(vf_bit)), + index); +} + +/** + * wx_check_for_msg_pf - checks to see if the VF has sent mail + * @wx: pointer to the HW structure + * @vf: the VF index + * + * Return: return 0 if the VF has got req bit or else -EBUSY + **/ +int wx_check_for_msg_pf(struct wx *wx, u16 vf) +{ + u32 index = vf / 16, vf_bit = vf % 16; + + return wx_check_for_bit_pf(wx, + FIELD_PREP(WX_MBVFICR_VFREQ_MASK, + BIT(vf_bit)), + index); +} + +/** + * wx_write_mbx_pf - Places a message in the mailbox + * @wx: pointer to the HW structure + * @msg: The message buffer + * @size: Length of buffer + * @vf: the VF index + * + * Return: return 0 on success and -EINVAL/-EBUSY on failure + **/ +int wx_write_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf) +{ + struct wx_mbx_info *mbx = &wx->mbx; + int ret, i; + + /* mbx->size is up to 15 */ + if (size > mbx->size) { + wx_err(wx, "Invalid mailbox message size %d", size); + return -EINVAL; + } + + /* lock the mailbox to prevent pf/vf race condition */ + ret = wx_obtain_mbx_lock_pf(wx, vf); + if (ret) + return ret; + + /* flush msg and acks as we are overwriting the message buffer */ + wx_check_for_msg_pf(wx, vf); + wx_check_for_ack_pf(wx, vf); + + /* copy the caller specified message to the mailbox memory buffer */ + for (i = 0; i < size; i++) + wr32a(wx, WX_PXMBMEM(vf), i, msg[i]); + + /* Interrupt VF to tell it a message has been sent and release buffer */ + /* set mirrored mailbox flags */ + wr32a(wx, WX_PXMBMEM(vf), WX_VXMAILBOX_SIZE, WX_PXMAILBOX_STS); + wr32(wx, WX_PXMAILBOX(vf), WX_PXMAILBOX_STS); + + return 0; +} + +/** + * wx_read_mbx_pf - Read a message from the mailbox + * @wx: pointer to the HW structure + * @msg: The message buffer + * @size: Length of buffer + * @vf: the VF index + * + * Return: return 0 on success and -EBUSY on failure + **/ +int wx_read_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf) +{ + struct wx_mbx_info *mbx = &wx->mbx; + int ret; + u16 i; + + /* limit read to size of mailbox and mbx->size is up to 15 */ + if (size > mbx->size) + size = mbx->size; + + /* lock the mailbox to prevent pf/vf race condition */ + ret = wx_obtain_mbx_lock_pf(wx, vf); + if (ret) + return ret; + + for (i = 0; i < size; i++) + msg[i] = rd32a(wx, WX_PXMBMEM(vf), i); + + /* Acknowledge the message and release buffer */ + /* set mirrored mailbox flags */ + wr32a(wx, WX_PXMBMEM(vf), WX_VXMAILBOX_SIZE, WX_PXMAILBOX_ACK); + wr32(wx, WX_PXMAILBOX(vf), WX_PXMAILBOX_ACK); + + return 0; +} + +/** + * wx_check_for_rst_pf - checks to see if the VF has reset + * @wx: pointer to the HW structure + * @vf: the VF index + * + * Return: return 0 on success and -EBUSY on failure + **/ +int wx_check_for_rst_pf(struct wx *wx, u16 vf) +{ + u32 reg_offset = WX_VF_REG_OFFSET(vf); + u32 vf_shift = WX_VF_IND_SHIFT(vf); + u32 vflre = 0; + + vflre = rd32(wx, WX_VFLRE(reg_offset)); + if (!(vflre & BIT(vf_shift))) + return -EBUSY; + wr32(wx, WX_VFLREC(reg_offset), BIT(vf_shift)); + + return 0; +} diff --git a/drivers/net/ethernet/wangxun/libwx/wx_mbx.h b/drivers/net/ethernet/wangxun/libwx/wx_mbx.h new file mode 100644 index 0000000000000..05aae138dbc31 --- /dev/null +++ b/drivers/net/ethernet/wangxun/libwx/wx_mbx.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2015 - 2025 Beijing WangXun Technology Co., Ltd. */ +#ifndef _WX_MBX_H_ +#define _WX_MBX_H_ + +#define WX_VXMAILBOX_SIZE 15 + +/* PF Registers */ +#define WX_PXMAILBOX(i) (0x600 + (4 * (i))) /* i=[0,63] */ +#define WX_PXMAILBOX_STS BIT(0) /* Initiate message send to VF */ +#define WX_PXMAILBOX_ACK BIT(1) /* Ack message recv'd from VF */ +#define WX_PXMAILBOX_PFU BIT(3) /* PF owns the mailbox buffer */ + +#define WX_PXMBMEM(i) (0x5000 + (64 * (i))) /* i=[0,63] */ + +#define WX_VFLRE(i) (0x4A0 + (4 * (i))) /* i=[0,1] */ +#define WX_VFLREC(i) (0x4A8 + (4 * (i))) /* i=[0,1] */ + +/* SR-IOV specific macros */ +#define WX_MBVFICR(i) (0x480 + (4 * (i))) /* i=[0,3] */ +#define WX_MBVFICR_VFREQ_MASK GENMASK(15, 0) +#define WX_MBVFICR_VFACK_MASK GENMASK(31, 16) + +#define WX_VT_MSGTYPE_ACK BIT(31) +#define WX_VT_MSGTYPE_NACK BIT(30) +#define WX_VT_MSGTYPE_CTS BIT(29) +#define WX_VT_MSGINFO_SHIFT 16 +#define WX_VT_MSGINFO_MASK GENMASK(23, 16) + +enum wx_pfvf_api_rev { + wx_mbox_api_null, + wx_mbox_api_13 = 4, /* API version 1.3 */ + wx_mbox_api_unknown, /* indicates that API version is not known */ +}; + +/* mailbox API */ +#define WX_VF_RESET 0x01 /* VF requests reset */ +#define WX_VF_SET_MAC_ADDR 0x02 /* VF requests PF to set MAC addr */ +#define WX_VF_SET_MULTICAST 0x03 /* VF requests PF to set MC addr */ +#define WX_VF_SET_VLAN 0x04 /* VF requests PF to set VLAN */ +#define WX_VF_SET_LPE 0x05 /* VF requests PF to set VMOLR.LPE */ +#define WX_VF_SET_MACVLAN 0x06 /* VF requests PF unicast filter */ +#define WX_VF_API_NEGOTIATE 0x08 /* negotiate API version */ +#define WX_VF_GET_QUEUES 0x09 /* get queue configuration */ +#define WX_VF_GET_RETA 0x0a /* VF request for RETA */ +#define WX_VF_GET_RSS_KEY 0x0b /* get RSS key */ +#define WX_VF_UPDATE_XCAST_MODE 0x0c +#define WX_VF_GET_LINK_STATE 0x10 /* get vf link state */ +#define WX_VF_GET_FW_VERSION 0x11 /* get fw version */ + +#define WX_VF_BACKUP 0x8001 /* VF requests backup */ + +#define WX_PF_CONTROL_MSG BIT(8) /* PF control message */ +#define WX_PF_NOFITY_VF_LINK_STATUS 0x1 +#define WX_PF_NOFITY_VF_NET_NOT_RUNNING BIT(31) + +#define WX_VF_TX_QUEUES 1 /* number of Tx queues supported */ +#define WX_VF_RX_QUEUES 2 /* number of Rx queues supported */ +#define WX_VF_TRANS_VLAN 3 /* Indication of port vlan */ +#define WX_VF_DEF_QUEUE 4 /* Default queue offset */ + +#define WX_VF_PERMADDR_MSG_LEN 4 + +enum wxvf_xcast_modes { + WXVF_XCAST_MODE_NONE = 0, + WXVF_XCAST_MODE_MULTI, + WXVF_XCAST_MODE_ALLMULTI, + WXVF_XCAST_MODE_PROMISC, +}; + +int wx_write_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf); +int wx_read_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf); +int wx_check_for_rst_pf(struct wx *wx, u16 mbx_id); +int wx_check_for_msg_pf(struct wx *wx, u16 mbx_id); +int wx_check_for_ack_pf(struct wx *wx, u16 mbx_id); + +#endif /* _WX_MBX_H_ */ diff --git a/drivers/net/ethernet/wangxun/libwx/wx_sriov.c b/drivers/net/ethernet/wangxun/libwx/wx_sriov.c new file mode 100644 index 0000000000000..52e6a6faf7153 --- /dev/null +++ b/drivers/net/ethernet/wangxun/libwx/wx_sriov.c @@ -0,0 +1,909 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2015 - 2025 Beijing WangXun Technology Co., Ltd. */ + +#include +#include + +#include "wx_type.h" +#include "wx_hw.h" +#include "wx_mbx.h" +#include "wx_sriov.h" + +static void wx_vf_configuration(struct pci_dev *pdev, int event_mask) +{ + bool enable = !!WX_VF_ENABLE_CHECK(event_mask); + struct wx *wx = pci_get_drvdata(pdev); + u32 vfn = WX_VF_NUM_GET(event_mask); + + if (enable) + eth_zero_addr(wx->vfinfo[vfn].vf_mac_addr); +} + +static int wx_alloc_vf_macvlans(struct wx *wx, u8 num_vfs) +{ + struct vf_macvlans *mv_list; + int num_vf_macvlans, i; + + /* Initialize list of VF macvlans */ + INIT_LIST_HEAD(&wx->vf_mvs.mvlist); + + num_vf_macvlans = wx->mac.num_rar_entries - + (WX_MAX_PF_MACVLANS + 1 + num_vfs); + if (!num_vf_macvlans) + return -EINVAL; + + mv_list = kcalloc(num_vf_macvlans, sizeof(struct vf_macvlans), + GFP_KERNEL); + if (!mv_list) + return -ENOMEM; + + for (i = 0; i < num_vf_macvlans; i++) { + mv_list[i].vf = -1; + mv_list[i].free = true; + list_add(&mv_list[i].mvlist, &wx->vf_mvs.mvlist); + } + wx->mv_list = mv_list; + + return 0; +} + +static void wx_sriov_clear_data(struct wx *wx) +{ + /* set num VFs to 0 to prevent access to vfinfo */ + wx->num_vfs = 0; + + /* free VF control structures */ + kfree(wx->vfinfo); + wx->vfinfo = NULL; + + /* free macvlan list */ + kfree(wx->mv_list); + wx->mv_list = NULL; + + /* set default pool back to 0 */ + wr32m(wx, WX_PSR_VM_CTL, WX_PSR_VM_CTL_POOL_MASK, 0); + wx->ring_feature[RING_F_VMDQ].offset = 0; + + clear_bit(WX_FLAG_SRIOV_ENABLED, wx->flags); + /* Disable VMDq flag so device will be set in NM mode */ + if (wx->ring_feature[RING_F_VMDQ].limit == 1) + clear_bit(WX_FLAG_VMDQ_ENABLED, wx->flags); +} + +static int __wx_enable_sriov(struct wx *wx, u8 num_vfs) +{ + int i, ret = 0; + u32 value = 0; + + set_bit(WX_FLAG_SRIOV_ENABLED, wx->flags); + wx_err(wx, "SR-IOV enabled with %d VFs\n", num_vfs); + + /* Enable VMDq flag so device will be set in VM mode */ + set_bit(WX_FLAG_VMDQ_ENABLED, wx->flags); + if (!wx->ring_feature[RING_F_VMDQ].limit) + wx->ring_feature[RING_F_VMDQ].limit = 1; + wx->ring_feature[RING_F_VMDQ].offset = num_vfs; + + wx->vfinfo = kcalloc(num_vfs, sizeof(struct vf_data_storage), + GFP_KERNEL); + if (!wx->vfinfo) + return -ENOMEM; + + ret = wx_alloc_vf_macvlans(wx, num_vfs); + if (ret) + return ret; + + /* Initialize default switching mode VEB */ + wr32m(wx, WX_PSR_CTL, WX_PSR_CTL_SW_EN, WX_PSR_CTL_SW_EN); + + for (i = 0; i < num_vfs; i++) { + /* enable spoof checking for all VFs */ + wx->vfinfo[i].spoofchk_enabled = true; + wx->vfinfo[i].link_enable = true; + /* untrust all VFs */ + wx->vfinfo[i].trusted = false; + /* set the default xcast mode */ + wx->vfinfo[i].xcast_mode = WXVF_XCAST_MODE_NONE; + } + + if (wx->mac.type == wx_mac_em) { + value = WX_CFG_PORT_CTL_NUM_VT_8; + } else { + if (num_vfs < 32) + value = WX_CFG_PORT_CTL_NUM_VT_32; + else + value = WX_CFG_PORT_CTL_NUM_VT_64; + } + wr32m(wx, WX_CFG_PORT_CTL, + WX_CFG_PORT_CTL_NUM_VT_MASK, + value); + + return ret; +} + +static void wx_sriov_reinit(struct wx *wx) +{ + rtnl_lock(); + wx->setup_tc(wx->netdev, netdev_get_num_tc(wx->netdev)); + rtnl_unlock(); +} + +void wx_disable_sriov(struct wx *wx) +{ + if (!pci_vfs_assigned(wx->pdev)) + pci_disable_sriov(wx->pdev); + else + wx_err(wx, "Unloading driver while VFs are assigned.\n"); + + /* clear flags and free allloced data */ + wx_sriov_clear_data(wx); +} +EXPORT_SYMBOL(wx_disable_sriov); + +static int wx_pci_sriov_enable(struct pci_dev *dev, + int num_vfs) +{ + struct wx *wx = pci_get_drvdata(dev); + int err = 0, i; + + err = __wx_enable_sriov(wx, num_vfs); + if (err) + return err; + + wx->num_vfs = num_vfs; + for (i = 0; i < wx->num_vfs; i++) + wx_vf_configuration(dev, (i | WX_VF_ENABLE)); + + /* reset before enabling SRIOV to avoid mailbox issues */ + wx_sriov_reinit(wx); + + err = pci_enable_sriov(dev, num_vfs); + if (err) { + wx_err(wx, "Failed to enable PCI sriov: %d\n", err); + goto err_out; + } + + return num_vfs; +err_out: + wx_sriov_clear_data(wx); + return err; +} + +static void wx_pci_sriov_disable(struct pci_dev *dev) +{ + struct wx *wx = pci_get_drvdata(dev); + + wx_disable_sriov(wx); + wx_sriov_reinit(wx); +} + +int wx_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct wx *wx = pci_get_drvdata(pdev); + int err; + + if (!num_vfs) { + if (!pci_vfs_assigned(pdev)) { + wx_pci_sriov_disable(pdev); + return 0; + } + + wx_err(wx, "can't free VFs because some are assigned to VMs.\n"); + return -EBUSY; + } + + err = wx_pci_sriov_enable(pdev, num_vfs); + if (err) + return err; + + return num_vfs; +} +EXPORT_SYMBOL(wx_pci_sriov_configure); + +static int wx_set_vf_mac(struct wx *wx, u16 vf, const u8 *mac_addr) +{ + u8 hw_addr[ETH_ALEN]; + int ret = 0; + + ether_addr_copy(hw_addr, mac_addr); + wx_del_mac_filter(wx, wx->vfinfo[vf].vf_mac_addr, vf); + ret = wx_add_mac_filter(wx, hw_addr, vf); + if (ret >= 0) + ether_addr_copy(wx->vfinfo[vf].vf_mac_addr, mac_addr); + else + eth_zero_addr(wx->vfinfo[vf].vf_mac_addr); + + return ret; +} + +static void wx_set_vmolr(struct wx *wx, u16 vf, bool aupe) +{ + u32 vmolr = rd32(wx, WX_PSR_VM_L2CTL(vf)); + + vmolr |= WX_PSR_VM_L2CTL_BAM; + if (aupe) + vmolr |= WX_PSR_VM_L2CTL_AUPE; + else + vmolr &= ~WX_PSR_VM_L2CTL_AUPE; + wr32(wx, WX_PSR_VM_L2CTL(vf), vmolr); +} + +static void wx_set_vmvir(struct wx *wx, u16 vid, u16 qos, u16 vf) +{ + u32 vmvir = vid | (qos << VLAN_PRIO_SHIFT) | + WX_TDM_VLAN_INS_VLANA_DEFAULT; + + wr32(wx, WX_TDM_VLAN_INS(vf), vmvir); +} + +static int wx_set_vf_vlan(struct wx *wx, int add, int vid, u16 vf) +{ + if (!vid && !add) + return 0; + + return wx_set_vfta(wx, vid, vf, (bool)add); +} + +static void wx_set_vlan_anti_spoofing(struct wx *wx, bool enable, int vf) +{ + u32 index = WX_VF_REG_OFFSET(vf), vf_bit = WX_VF_IND_SHIFT(vf); + u32 pfvfspoof; + + pfvfspoof = rd32(wx, WX_TDM_VLAN_AS(index)); + if (enable) + pfvfspoof |= BIT(vf_bit); + else + pfvfspoof &= ~BIT(vf_bit); + wr32(wx, WX_TDM_VLAN_AS(index), pfvfspoof); +} + +static void wx_write_qde(struct wx *wx, u32 vf, u32 qde) +{ + struct wx_ring_feature *vmdq = &wx->ring_feature[RING_F_VMDQ]; + u32 q_per_pool = __ALIGN_MASK(1, ~vmdq->mask); + u32 reg = 0, n = vf * q_per_pool / 32; + u32 i = vf * q_per_pool; + + reg = rd32(wx, WX_RDM_PF_QDE(n)); + for (i = (vf * q_per_pool - n * 32); + i < ((vf + 1) * q_per_pool - n * 32); + i++) { + if (qde == 1) + reg |= qde << i; + else + reg &= qde << i; + } + + wr32(wx, WX_RDM_PF_QDE(n), reg); +} + +static void wx_clear_vmvir(struct wx *wx, u32 vf) +{ + wr32(wx, WX_TDM_VLAN_INS(vf), 0); +} + +static void wx_ping_vf(struct wx *wx, int vf) +{ + u32 ping = WX_PF_CONTROL_MSG; + + if (wx->vfinfo[vf].clear_to_send) + ping |= WX_VT_MSGTYPE_CTS; + wx_write_mbx_pf(wx, &ping, 1, vf); +} + +static void wx_set_vf_rx_tx(struct wx *wx, int vf) +{ + u32 index = WX_VF_REG_OFFSET(vf), vf_bit = WX_VF_IND_SHIFT(vf); + u32 reg_cur_tx, reg_cur_rx, reg_req_tx, reg_req_rx; + + reg_cur_tx = rd32(wx, WX_TDM_VF_TE(index)); + reg_cur_rx = rd32(wx, WX_RDM_VF_RE(index)); + + if (wx->vfinfo[vf].link_enable) { + reg_req_tx = reg_cur_tx | BIT(vf_bit); + reg_req_rx = reg_cur_rx | BIT(vf_bit); + /* Enable particular VF */ + if (reg_cur_tx != reg_req_tx) + wr32(wx, WX_TDM_VF_TE(index), reg_req_tx); + if (reg_cur_rx != reg_req_rx) + wr32(wx, WX_RDM_VF_RE(index), reg_req_rx); + } else { + reg_req_tx = BIT(vf_bit); + reg_req_rx = BIT(vf_bit); + /* Disable particular VF */ + if (reg_cur_tx & reg_req_tx) + wr32(wx, WX_TDM_VFTE_CLR(index), reg_req_tx); + if (reg_cur_rx & reg_req_rx) + wr32(wx, WX_RDM_VFRE_CLR(index), reg_req_rx); + } +} + +static int wx_get_vf_queues(struct wx *wx, u32 *msgbuf, u32 vf) +{ + struct wx_ring_feature *vmdq = &wx->ring_feature[RING_F_VMDQ]; + unsigned int default_tc = 0; + + msgbuf[WX_VF_TX_QUEUES] = __ALIGN_MASK(1, ~vmdq->mask); + msgbuf[WX_VF_RX_QUEUES] = __ALIGN_MASK(1, ~vmdq->mask); + + if (wx->vfinfo[vf].pf_vlan || wx->vfinfo[vf].pf_qos) + msgbuf[WX_VF_TRANS_VLAN] = 1; + else + msgbuf[WX_VF_TRANS_VLAN] = 0; + + /* notify VF of default queue */ + msgbuf[WX_VF_DEF_QUEUE] = default_tc; + + return 0; +} + +static void wx_vf_reset_event(struct wx *wx, u16 vf) +{ + struct vf_data_storage *vfinfo = &wx->vfinfo[vf]; + u8 num_tcs = netdev_get_num_tc(wx->netdev); + + /* add PF assigned VLAN */ + wx_set_vf_vlan(wx, true, vfinfo->pf_vlan, vf); + + /* reset offloads to defaults */ + wx_set_vmolr(wx, vf, !vfinfo->pf_vlan); + + /* set outgoing tags for VFs */ + if (!vfinfo->pf_vlan && !vfinfo->pf_qos && !num_tcs) { + wx_clear_vmvir(wx, vf); + } else { + if (vfinfo->pf_qos || !num_tcs) + wx_set_vmvir(wx, vfinfo->pf_vlan, + vfinfo->pf_qos, vf); + else + wx_set_vmvir(wx, vfinfo->pf_vlan, + wx->default_up, vf); + } + + /* reset multicast table array for vf */ + wx->vfinfo[vf].num_vf_mc_hashes = 0; + + /* Flush and reset the mta with the new values */ + wx_set_rx_mode(wx->netdev); + + wx_del_mac_filter(wx, wx->vfinfo[vf].vf_mac_addr, vf); + /* reset VF api back to unknown */ + wx->vfinfo[vf].vf_api = wx_mbox_api_null; +} + +static void wx_vf_reset_msg(struct wx *wx, u16 vf) +{ + const u8 *vf_mac = wx->vfinfo[vf].vf_mac_addr; + struct net_device *dev = wx->netdev; + u32 msgbuf[5] = {0, 0, 0, 0, 0}; + u8 *addr = (u8 *)(&msgbuf[1]); + u32 reg = 0, index, vf_bit; + int pf_max_frame; + + /* reset the filters for the device */ + wx_vf_reset_event(wx, vf); + + /* set vf mac address */ + if (!is_zero_ether_addr(vf_mac)) + wx_set_vf_mac(wx, vf, vf_mac); + + index = WX_VF_REG_OFFSET(vf); + vf_bit = WX_VF_IND_SHIFT(vf); + + /* force drop enable for all VF Rx queues */ + wx_write_qde(wx, vf, 1); + + /* set transmit and receive for vf */ + wx_set_vf_rx_tx(wx, vf); + + pf_max_frame = dev->mtu + ETH_HLEN; + + if (pf_max_frame > ETH_FRAME_LEN) + reg = BIT(vf_bit); + wr32(wx, WX_RDM_VFRE_CLR(index), reg); + + /* enable VF mailbox for further messages */ + wx->vfinfo[vf].clear_to_send = true; + + /* reply to reset with ack and vf mac address */ + msgbuf[0] = WX_VF_RESET; + if (!is_zero_ether_addr(vf_mac)) { + msgbuf[0] |= WX_VT_MSGTYPE_ACK; + memcpy(addr, vf_mac, ETH_ALEN); + } else { + msgbuf[0] |= WX_VT_MSGTYPE_NACK; + wx_err(wx, "VF %d has no MAC address assigned", vf); + } + + msgbuf[3] = wx->mac.mc_filter_type; + wx_write_mbx_pf(wx, msgbuf, WX_VF_PERMADDR_MSG_LEN, vf); +} + +static int wx_set_vf_mac_addr(struct wx *wx, u32 *msgbuf, u16 vf) +{ + const u8 *new_mac = ((u8 *)(&msgbuf[1])); + int ret; + + if (!is_valid_ether_addr(new_mac)) { + wx_err(wx, "VF %d attempted to set invalid mac\n", vf); + return -EINVAL; + } + + if (wx->vfinfo[vf].pf_set_mac && + memcmp(wx->vfinfo[vf].vf_mac_addr, new_mac, ETH_ALEN)) { + wx_err(wx, + "VF %d attempt to set a MAC but it already had a MAC.", + vf); + return -EBUSY; + } + + ret = wx_set_vf_mac(wx, vf, new_mac); + if (ret < 0) + return ret; + + return 0; +} + +static void wx_set_vf_multicasts(struct wx *wx, u32 *msgbuf, u32 vf) +{ + struct vf_data_storage *vfinfo = &wx->vfinfo[vf]; + u16 entries = (msgbuf[0] & WX_VT_MSGINFO_MASK) + >> WX_VT_MSGINFO_SHIFT; + u32 vmolr = rd32(wx, WX_PSR_VM_L2CTL(vf)); + u32 vector_bit, vector_reg, mta_reg, i; + u16 *hash_list = (u16 *)&msgbuf[1]; + + /* only so many hash values supported */ + entries = min_t(u16, entries, WX_MAX_VF_MC_ENTRIES); + vfinfo->num_vf_mc_hashes = entries; + + for (i = 0; i < entries; i++) + vfinfo->vf_mc_hashes[i] = hash_list[i]; + + for (i = 0; i < vfinfo->num_vf_mc_hashes; i++) { + vector_reg = WX_PSR_MC_TBL_REG(vfinfo->vf_mc_hashes[i]); + vector_bit = WX_PSR_MC_TBL_BIT(vfinfo->vf_mc_hashes[i]); + mta_reg = wx->mac.mta_shadow[vector_reg]; + mta_reg |= BIT(vector_bit); + wx->mac.mta_shadow[vector_reg] = mta_reg; + wr32(wx, WX_PSR_MC_TBL(vector_reg), mta_reg); + } + vmolr |= WX_PSR_VM_L2CTL_ROMPE; + wr32(wx, WX_PSR_VM_L2CTL(vf), vmolr); +} + +static void wx_set_vf_lpe(struct wx *wx, u32 max_frame, u32 vf) +{ + u32 index, vf_bit, vfre; + u32 max_frs, reg_val; + + /* determine VF receive enable location */ + index = WX_VF_REG_OFFSET(vf); + vf_bit = WX_VF_IND_SHIFT(vf); + + vfre = rd32(wx, WX_RDM_VF_RE(index)); + vfre |= BIT(vf_bit); + wr32(wx, WX_RDM_VF_RE(index), vfre); + + /* pull current max frame size from hardware */ + max_frs = DIV_ROUND_UP(max_frame, 1024); + reg_val = rd32(wx, WX_MAC_WDG_TIMEOUT) & WX_MAC_WDG_TIMEOUT_WTO_MASK; + if (max_frs > (reg_val + WX_MAC_WDG_TIMEOUT_WTO_DELTA)) + wr32(wx, WX_MAC_WDG_TIMEOUT, + max_frs - WX_MAC_WDG_TIMEOUT_WTO_DELTA); +} + +static int wx_find_vlvf_entry(struct wx *wx, u32 vlan) +{ + int regindex; + u32 vlvf; + + /* short cut the special case */ + if (vlan == 0) + return 0; + + /* Search for the vlan id in the VLVF entries */ + for (regindex = 1; regindex < WX_PSR_VLAN_SWC_ENTRIES; regindex++) { + wr32(wx, WX_PSR_VLAN_SWC_IDX, regindex); + vlvf = rd32(wx, WX_PSR_VLAN_SWC); + if ((vlvf & VLAN_VID_MASK) == vlan) + break; + } + + /* Return a negative value if not found */ + if (regindex >= WX_PSR_VLAN_SWC_ENTRIES) + regindex = -EINVAL; + + return regindex; +} + +static int wx_set_vf_macvlan(struct wx *wx, + u16 vf, int index, unsigned char *mac_addr) +{ + struct vf_macvlans *entry; + struct list_head *pos; + int retval = 0; + + if (index <= 1) { + list_for_each(pos, &wx->vf_mvs.mvlist) { + entry = list_entry(pos, struct vf_macvlans, mvlist); + if (entry->vf == vf) { + entry->vf = -1; + entry->free = true; + entry->is_macvlan = false; + wx_del_mac_filter(wx, entry->vf_macvlan, vf); + } + } + } + + if (!index) + return 0; + + entry = NULL; + list_for_each(pos, &wx->vf_mvs.mvlist) { + entry = list_entry(pos, struct vf_macvlans, mvlist); + if (entry->free) + break; + } + + if (!entry || !entry->free) + return -ENOSPC; + + retval = wx_add_mac_filter(wx, mac_addr, vf); + if (retval >= 0) { + entry->free = false; + entry->is_macvlan = true; + entry->vf = vf; + memcpy(entry->vf_macvlan, mac_addr, ETH_ALEN); + } + + return retval; +} + +static int wx_set_vf_vlan_msg(struct wx *wx, u32 *msgbuf, u16 vf) +{ + int add = (msgbuf[0] & WX_VT_MSGINFO_MASK) >> WX_VT_MSGINFO_SHIFT; + int vid = (msgbuf[1] & WX_PSR_VLAN_SWC_VLANID_MASK); + int ret; + + if (add) + wx->vfinfo[vf].vlan_count++; + else if (wx->vfinfo[vf].vlan_count) + wx->vfinfo[vf].vlan_count--; + + /* in case of promiscuous mode any VLAN filter set for a VF must + * also have the PF pool added to it. + */ + if (add && wx->netdev->flags & IFF_PROMISC) + wx_set_vf_vlan(wx, add, vid, VMDQ_P(0)); + + ret = wx_set_vf_vlan(wx, add, vid, vf); + if (!ret && wx->vfinfo[vf].spoofchk_enabled) + wx_set_vlan_anti_spoofing(wx, true, vf); + + /* Go through all the checks to see if the VLAN filter should + * be wiped completely. + */ + if (!add && wx->netdev->flags & IFF_PROMISC) { + u32 bits = 0, vlvf; + int reg_ndx; + + reg_ndx = wx_find_vlvf_entry(wx, vid); + if (reg_ndx < 0) + return -ENOSPC; + wr32(wx, WX_PSR_VLAN_SWC_IDX, reg_ndx); + vlvf = rd32(wx, WX_PSR_VLAN_SWC); + /* See if any other pools are set for this VLAN filter + * entry other than the PF. + */ + if (VMDQ_P(0) < 32) { + bits = rd32(wx, WX_PSR_VLAN_SWC_VM_L); + bits &= ~BIT(VMDQ_P(0)); + if (wx->mac.type != wx_mac_em) + bits |= rd32(wx, WX_PSR_VLAN_SWC_VM_H); + } else { + if (wx->mac.type != wx_mac_em) + bits = rd32(wx, WX_PSR_VLAN_SWC_VM_H); + bits &= ~BIT(VMDQ_P(0) % 32); + bits |= rd32(wx, WX_PSR_VLAN_SWC_VM_L); + } + /* If the filter was removed then ensure PF pool bit + * is cleared if the PF only added itself to the pool + * because the PF is in promiscuous mode. + */ + if ((vlvf & VLAN_VID_MASK) == vid && !bits) + wx_set_vf_vlan(wx, add, vid, VMDQ_P(0)); + } + + return 0; +} + +static int wx_set_vf_macvlan_msg(struct wx *wx, u32 *msgbuf, u16 vf) +{ + int index = (msgbuf[0] & WX_VT_MSGINFO_MASK) >> + WX_VT_MSGINFO_SHIFT; + u8 *new_mac = ((u8 *)(&msgbuf[1])); + int err; + + if (wx->vfinfo[vf].pf_set_mac && index > 0) { + wx_err(wx, "VF %d request MACVLAN filter but is denied\n", vf); + return -EINVAL; + } + + /* An non-zero index indicates the VF is setting a filter */ + if (index) { + if (!is_valid_ether_addr(new_mac)) { + wx_err(wx, "VF %d attempted to set invalid mac\n", vf); + return -EINVAL; + } + /* If the VF is allowed to set MAC filters then turn off + * anti-spoofing to avoid false positives. + */ + if (wx->vfinfo[vf].spoofchk_enabled) + wx_set_vf_spoofchk(wx->netdev, vf, false); + } + + err = wx_set_vf_macvlan(wx, vf, index, new_mac); + if (err == -ENOSPC) + wx_err(wx, + "VF %d request MACVLAN filter but there is no space\n", + vf); + if (err < 0) + return err; + + return 0; +} + +static int wx_negotiate_vf_api(struct wx *wx, u32 *msgbuf, u32 vf) +{ + int api = msgbuf[1]; + + switch (api) { + case wx_mbox_api_13: + wx->vfinfo[vf].vf_api = api; + return 0; + default: + wx_err(wx, "VF %d requested invalid api version %u\n", vf, api); + return -EINVAL; + } +} + +static int wx_get_vf_link_state(struct wx *wx, u32 *msgbuf, u32 vf) +{ + msgbuf[1] = wx->vfinfo[vf].link_enable; + + return 0; +} + +static int wx_get_fw_version(struct wx *wx, u32 *msgbuf, u32 vf) +{ + unsigned long fw_version = 0ULL; + int ret = 0; + + ret = kstrtoul(wx->eeprom_id, 16, &fw_version); + if (ret) + return -EOPNOTSUPP; + msgbuf[1] = fw_version; + + return 0; +} + +static int wx_update_vf_xcast_mode(struct wx *wx, u32 *msgbuf, u32 vf) +{ + int xcast_mode = msgbuf[1]; + u32 vmolr, disable, enable; + + if (wx->vfinfo[vf].xcast_mode == xcast_mode) + return 0; + + switch (xcast_mode) { + case WXVF_XCAST_MODE_NONE: + disable = WX_PSR_VM_L2CTL_BAM | WX_PSR_VM_L2CTL_ROMPE | + WX_PSR_VM_L2CTL_MPE | WX_PSR_VM_L2CTL_UPE | + WX_PSR_VM_L2CTL_VPE; + enable = 0; + break; + case WXVF_XCAST_MODE_MULTI: + disable = WX_PSR_VM_L2CTL_MPE | WX_PSR_VM_L2CTL_UPE | + WX_PSR_VM_L2CTL_VPE; + enable = WX_PSR_VM_L2CTL_BAM | WX_PSR_VM_L2CTL_ROMPE; + break; + case WXVF_XCAST_MODE_ALLMULTI: + disable = WX_PSR_VM_L2CTL_UPE | WX_PSR_VM_L2CTL_VPE; + enable = WX_PSR_VM_L2CTL_BAM | WX_PSR_VM_L2CTL_ROMPE | + WX_PSR_VM_L2CTL_MPE; + break; + case WXVF_XCAST_MODE_PROMISC: + disable = 0; + enable = WX_PSR_VM_L2CTL_BAM | WX_PSR_VM_L2CTL_ROMPE | + WX_PSR_VM_L2CTL_MPE | WX_PSR_VM_L2CTL_UPE | + WX_PSR_VM_L2CTL_VPE; + break; + default: + return -EOPNOTSUPP; + } + + vmolr = rd32(wx, WX_PSR_VM_L2CTL(vf)); + vmolr &= ~disable; + vmolr |= enable; + wr32(wx, WX_PSR_VM_L2CTL(vf), vmolr); + + wx->vfinfo[vf].xcast_mode = xcast_mode; + msgbuf[1] = xcast_mode; + + return 0; +} + +static void wx_rcv_msg_from_vf(struct wx *wx, u16 vf) +{ + u16 mbx_size = WX_VXMAILBOX_SIZE; + u32 msgbuf[WX_VXMAILBOX_SIZE]; + int retval; + + retval = wx_read_mbx_pf(wx, msgbuf, mbx_size, vf); + if (retval) { + wx_err(wx, "Error receiving message from VF\n"); + return; + } + + /* this is a message we already processed, do nothing */ + if (msgbuf[0] & (WX_VT_MSGTYPE_ACK | WX_VT_MSGTYPE_NACK)) + return; + + if (msgbuf[0] == WX_VF_RESET) { + wx_vf_reset_msg(wx, vf); + return; + } + + /* until the vf completes a virtual function reset it should not be + * allowed to start any configuration. + */ + if (!wx->vfinfo[vf].clear_to_send) { + msgbuf[0] |= WX_VT_MSGTYPE_NACK; + wx_write_mbx_pf(wx, msgbuf, 1, vf); + return; + } + + switch ((msgbuf[0] & U16_MAX)) { + case WX_VF_SET_MAC_ADDR: + retval = wx_set_vf_mac_addr(wx, msgbuf, vf); + break; + case WX_VF_SET_MULTICAST: + wx_set_vf_multicasts(wx, msgbuf, vf); + retval = 0; + break; + case WX_VF_SET_VLAN: + retval = wx_set_vf_vlan_msg(wx, msgbuf, vf); + break; + case WX_VF_SET_LPE: + wx_set_vf_lpe(wx, msgbuf[1], vf); + retval = 0; + break; + case WX_VF_SET_MACVLAN: + retval = wx_set_vf_macvlan_msg(wx, msgbuf, vf); + break; + case WX_VF_API_NEGOTIATE: + retval = wx_negotiate_vf_api(wx, msgbuf, vf); + break; + case WX_VF_GET_QUEUES: + retval = wx_get_vf_queues(wx, msgbuf, vf); + break; + case WX_VF_GET_LINK_STATE: + retval = wx_get_vf_link_state(wx, msgbuf, vf); + break; + case WX_VF_GET_FW_VERSION: + retval = wx_get_fw_version(wx, msgbuf, vf); + break; + case WX_VF_UPDATE_XCAST_MODE: + retval = wx_update_vf_xcast_mode(wx, msgbuf, vf); + break; + case WX_VF_BACKUP: + break; + default: + wx_err(wx, "Unhandled Msg %8.8x\n", msgbuf[0]); + break; + } + + /* notify the VF of the results of what it sent us */ + if (retval) + msgbuf[0] |= WX_VT_MSGTYPE_NACK; + else + msgbuf[0] |= WX_VT_MSGTYPE_ACK; + + msgbuf[0] |= WX_VT_MSGTYPE_CTS; + + wx_write_mbx_pf(wx, msgbuf, mbx_size, vf); +} + +static void wx_rcv_ack_from_vf(struct wx *wx, u16 vf) +{ + u32 msg = WX_VT_MSGTYPE_NACK; + + /* if device isn't clear to send it shouldn't be reading either */ + if (!wx->vfinfo[vf].clear_to_send) + wx_write_mbx_pf(wx, &msg, 1, vf); +} + +void wx_msg_task(struct wx *wx) +{ + u16 vf; + + for (vf = 0; vf < wx->num_vfs; vf++) { + /* process any reset requests */ + if (!wx_check_for_rst_pf(wx, vf)) + wx_vf_reset_event(wx, vf); + + /* process any messages pending */ + if (!wx_check_for_msg_pf(wx, vf)) + wx_rcv_msg_from_vf(wx, vf); + + /* process any acks */ + if (!wx_check_for_ack_pf(wx, vf)) + wx_rcv_ack_from_vf(wx, vf); + } +} +EXPORT_SYMBOL(wx_msg_task); + +void wx_disable_vf_rx_tx(struct wx *wx) +{ + wr32(wx, WX_TDM_VFTE_CLR(0), U32_MAX); + wr32(wx, WX_RDM_VFRE_CLR(0), U32_MAX); + if (wx->mac.type != wx_mac_em) { + wr32(wx, WX_TDM_VFTE_CLR(1), U32_MAX); + wr32(wx, WX_RDM_VFRE_CLR(1), U32_MAX); + } +} +EXPORT_SYMBOL(wx_disable_vf_rx_tx); + +void wx_ping_all_vfs_with_link_status(struct wx *wx, bool link_up) +{ + u32 msgbuf[2] = {0, 0}; + u16 i; + + if (!wx->num_vfs) + return; + msgbuf[0] = WX_PF_NOFITY_VF_LINK_STATUS | WX_PF_CONTROL_MSG; + if (link_up) + msgbuf[1] = FIELD_PREP(GENMASK(31, 1), wx->speed) | link_up; + if (wx->notify_down) + msgbuf[1] |= WX_PF_NOFITY_VF_NET_NOT_RUNNING; + for (i = 0; i < wx->num_vfs; i++) { + if (wx->vfinfo[i].clear_to_send) + msgbuf[0] |= WX_VT_MSGTYPE_CTS; + wx_write_mbx_pf(wx, msgbuf, 2, i); + } +} +EXPORT_SYMBOL(wx_ping_all_vfs_with_link_status); + +static void wx_set_vf_link_state(struct wx *wx, int vf, int state) +{ + wx->vfinfo[vf].link_state = state; + switch (state) { + case IFLA_VF_LINK_STATE_AUTO: + if (netif_running(wx->netdev)) + wx->vfinfo[vf].link_enable = true; + else + wx->vfinfo[vf].link_enable = false; + break; + case IFLA_VF_LINK_STATE_ENABLE: + wx->vfinfo[vf].link_enable = true; + break; + case IFLA_VF_LINK_STATE_DISABLE: + wx->vfinfo[vf].link_enable = false; + break; + } + /* restart the VF */ + wx->vfinfo[vf].clear_to_send = false; + wx_ping_vf(wx, vf); + + wx_set_vf_rx_tx(wx, vf); +} + +void wx_set_all_vfs(struct wx *wx) +{ + int i; + + for (i = 0; i < wx->num_vfs; i++) + wx_set_vf_link_state(wx, i, wx->vfinfo[i].link_state); +} +EXPORT_SYMBOL(wx_set_all_vfs); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_sriov.h b/drivers/net/ethernet/wangxun/libwx/wx_sriov.h new file mode 100644 index 0000000000000..8a3a47bb58155 --- /dev/null +++ b/drivers/net/ethernet/wangxun/libwx/wx_sriov.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2015 - 2025 Beijing WangXun Technology Co., Ltd. */ + +#ifndef _WX_SRIOV_H_ +#define _WX_SRIOV_H_ + +#define WX_VF_ENABLE_CHECK(_m) FIELD_GET(BIT(31), (_m)) +#define WX_VF_NUM_GET(_m) FIELD_GET(GENMASK(5, 0), (_m)) +#define WX_VF_ENABLE BIT(31) + +void wx_disable_sriov(struct wx *wx); +int wx_pci_sriov_configure(struct pci_dev *pdev, int num_vfs); +void wx_msg_task(struct wx *wx); +void wx_disable_vf_rx_tx(struct wx *wx); +void wx_ping_all_vfs_with_link_status(struct wx *wx, bool link_up); +void wx_set_all_vfs(struct wx *wx); + +#endif /* _WX_SRIOV_H_ */ diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index 4c545b2aa997c..8cfed5d4ebf7e 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -20,8 +20,13 @@ /* MSI-X capability fields masks */ #define WX_PCIE_MSIX_TBL_SZ_MASK 0x7FF #define WX_PCI_LINK_STATUS 0xB2 +#define WX_MAX_PF_MACVLANS 15 +#define WX_MAX_VF_MC_ENTRIES 30 /**************** Global Registers ****************************/ +#define WX_VF_REG_OFFSET(_v) FIELD_GET(GENMASK(15, 5), (_v)) +#define WX_VF_IND_SHIFT(_v) FIELD_GET(GENMASK(4, 0), (_v)) + /* chip control Registers */ #define WX_MIS_PWR 0x10000 #define WX_MIS_RST 0x1000C @@ -76,6 +81,9 @@ #define WX_MAC_LXONOFFRXC 0x11E0C /*********************** Receive DMA registers **************************/ +#define WX_RDM_VF_RE(_i) (0x12004 + ((_i) * 4)) +#define WX_RDM_PF_QDE(_i) (0x12080 + ((_i) * 4)) +#define WX_RDM_VFRE_CLR(_i) (0x120A0 + ((_i) * 4)) #define WX_RDM_DRP_PKT 0x12500 #define WX_RDM_PKT_CNT 0x12504 #define WX_RDM_BYTE_CNT_LSB 0x12508 @@ -84,12 +92,17 @@ /************************* Port Registers ************************************/ /* port cfg Registers */ #define WX_CFG_PORT_CTL 0x14400 +#define WX_CFG_PORT_CTL_PFRSTD BIT(14) #define WX_CFG_PORT_CTL_DRV_LOAD BIT(3) #define WX_CFG_PORT_CTL_QINQ BIT(2) #define WX_CFG_PORT_CTL_D_VLAN BIT(0) /* double vlan*/ #define WX_CFG_TAG_TPID(_i) (0x14430 + ((_i) * 4)) #define WX_CFG_PORT_CTL_NUM_VT_MASK GENMASK(13, 12) /* number of TVs */ +#define WX_CFG_PORT_CTL_NUM_VT_NONE 0 +#define WX_CFG_PORT_CTL_NUM_VT_8 FIELD_PREP(GENMASK(13, 12), 1) +#define WX_CFG_PORT_CTL_NUM_VT_32 FIELD_PREP(GENMASK(13, 12), 2) +#define WX_CFG_PORT_CTL_NUM_VT_64 FIELD_PREP(GENMASK(13, 12), 3) /* GPIO Registers */ #define WX_GPIO_DR 0x14800 @@ -112,6 +125,11 @@ /*********************** Transmit DMA registers **************************/ /* transmit global control */ #define WX_TDM_CTL 0x18000 +#define WX_TDM_VF_TE(_i) (0x18004 + ((_i) * 4)) +#define WX_TDM_MAC_AS(_i) (0x18060 + ((_i) * 4)) +#define WX_TDM_VLAN_AS(_i) (0x18070 + ((_i) * 4)) +#define WX_TDM_VFTE_CLR(_i) (0x180A0 + ((_i) * 4)) + /* TDM CTL BIT */ #define WX_TDM_CTL_TE BIT(0) /* Transmit Enable */ #define WX_TDM_PB_THRE(_i) (0x18020 + ((_i) * 4)) @@ -165,6 +183,7 @@ /******************************* PSR Registers *******************************/ /* psr control */ #define WX_PSR_CTL 0x15000 +#define WX_PSR_VM_CTL 0x151B0 /* Header split receive */ #define WX_PSR_CTL_SW_EN BIT(18) #define WX_PSR_CTL_RSC_ACK BIT(17) @@ -201,12 +220,17 @@ #define WX_PSR_1588_CTL_VALID BIT(0) /* mcasst/ucast overflow tbl */ #define WX_PSR_MC_TBL(_i) (0x15200 + ((_i) * 4)) +#define WX_PSR_MC_TBL_REG(_i) FIELD_GET(GENMASK(11, 5), (_i)) +#define WX_PSR_MC_TBL_BIT(_i) FIELD_GET(GENMASK(4, 0), (_i)) #define WX_PSR_UC_TBL(_i) (0x15400 + ((_i) * 4)) +#define WX_PSR_VM_CTL_REPLEN BIT(30) /* replication enabled */ +#define WX_PSR_VM_CTL_POOL_MASK GENMASK(12, 7) /* VM L2 contorl */ #define WX_PSR_VM_L2CTL(_i) (0x15600 + ((_i) * 4)) #define WX_PSR_VM_L2CTL_UPE BIT(4) /* unicast promiscuous */ #define WX_PSR_VM_L2CTL_VACC BIT(6) /* accept nomatched vlan */ +#define WX_PSR_VM_L2CTL_VPE BIT(7) /* vlan promiscuous mode */ #define WX_PSR_VM_L2CTL_AUPE BIT(8) /* accept untagged packets */ #define WX_PSR_VM_L2CTL_ROMPE BIT(9) /* accept packets in MTA tbl */ #define WX_PSR_VM_L2CTL_ROPE BIT(10) /* accept packets in UC tbl */ @@ -245,10 +269,12 @@ #define WX_PSR_VLAN_SWC 0x16220 #define WX_PSR_VLAN_SWC_VM_L 0x16224 #define WX_PSR_VLAN_SWC_VM_H 0x16228 +#define WX_PSR_VLAN_SWC_VM(_i) (0x16224 + ((_i) * 4)) #define WX_PSR_VLAN_SWC_IDX 0x16230 /* 64 vlan entries */ /* VLAN pool filtering masks */ #define WX_PSR_VLAN_SWC_VIEN BIT(31) /* filter is valid */ #define WX_PSR_VLAN_SWC_ENTRIES 64 +#define WX_PSR_VLAN_SWC_VLANID_MASK GENMASK(11, 0) /********************************* RSEC **************************************/ /* general rsec */ @@ -259,6 +285,13 @@ #define WX_RSC_ST 0x17004 #define WX_RSC_ST_RSEC_RDY BIT(0) +/*********************** Transmit DMA registers **************************/ +/* transmit global control */ +#define WX_TDM_ETYPE_AS(_i) (0x18058 + ((_i) * 4)) +#define WX_TDM_VLAN_INS(_i) (0x18100 + ((_i) * 4)) +/* Per VF Port VLAN insertion rules */ +#define WX_TDM_VLAN_INS_VLANA_DEFAULT BIT(30) /* Always use default VLAN*/ + /****************************** TDB ******************************************/ #define WX_TDB_PB_SZ(_i) (0x1CC00 + ((_i) * 4)) #define WX_TXPKT_SIZE_MAX 0xA /* Max Tx Packet size */ @@ -328,6 +361,9 @@ #define WX_MAC_WDG_TIMEOUT 0x1100C #define WX_MAC_RX_FLOW_CTRL 0x11090 #define WX_MAC_RX_FLOW_CTRL_RFE BIT(0) /* receive fc enable */ + +#define WX_MAC_WDG_TIMEOUT_WTO_MASK GENMASK(3, 0) +#define WX_MAC_WDG_TIMEOUT_WTO_DELTA 2 /* MDIO Registers */ #define WX_MSCA 0x11200 #define WX_MSCA_RA(v) FIELD_PREP(U16_MAX, v) @@ -417,6 +453,15 @@ enum WX_MSCA_CMD_value { /* Number of 80 microseconds we wait for PCI Express master disable */ #define WX_PCI_MASTER_DISABLE_TIMEOUT 80000 +#define WX_RSS_64Q_MASK 0x3F +#define WX_RSS_8Q_MASK 0x7 +#define WX_RSS_4Q_MASK 0x3 +#define WX_RSS_2Q_MASK 0x1 +#define WX_RSS_DISABLED_MASK 0x0 + +#define WX_VMDQ_4Q_MASK 0x7C +#define WX_VMDQ_2Q_MASK 0x7E + /****************** Manageablility Host Interface defines ********************/ #define WX_HI_MAX_BLOCK_BYTE_LENGTH 256 /* Num of bytes in range */ #define WX_HI_COMMAND_TIMEOUT 1000 /* Process HI command limit */ @@ -484,7 +529,7 @@ enum WX_MSCA_CMD_value { #define WX_REQ_TX_DESCRIPTOR_MULTIPLE 128 #define WX_MAX_JUMBO_FRAME_SIZE 9432 /* max payload 9414 */ -#define VMDQ_P(p) p +#define VMDQ_P(p) ((p) + wx->ring_feature[RING_F_VMDQ].offset) /* Supported Rx Buffer Sizes */ #define WX_RXBUFFER_256 256 /* Used for skb receive header */ @@ -778,6 +823,10 @@ struct wx_bus_info { u16 device; }; +struct wx_mbx_info { + u16 size; +}; + struct wx_thermal_sensor_data { s16 temp; s16 alarm_thresh; @@ -1051,6 +1100,7 @@ struct wx_ring_feature { enum wx_ring_f_enum { RING_F_NONE = 0, + RING_F_VMDQ, RING_F_RSS, RING_F_FDIR, RING_F_ARRAY_SIZE /* must be last in enum set */ @@ -1106,8 +1156,38 @@ enum wx_state { WX_STATE_NBITS /* must be last */ }; +struct vf_data_storage { + struct pci_dev *vfdev; + unsigned char vf_mac_addr[ETH_ALEN]; + bool spoofchk_enabled; + bool link_enable; + bool trusted; + int xcast_mode; + unsigned int vf_api; + bool clear_to_send; + u16 pf_vlan; /* When set, guest VLAN config not allowed. */ + u16 pf_qos; + bool pf_set_mac; + + u16 vf_mc_hashes[WX_MAX_VF_MC_ENTRIES]; + u16 num_vf_mc_hashes; + u16 vlan_count; + int link_state; +}; + +struct vf_macvlans { + struct list_head mvlist; + int vf; + bool free; + bool is_macvlan; + u8 vf_macvlan[ETH_ALEN]; +}; + enum wx_pf_flags { WX_FLAG_SWFW_RING, + WX_FLAG_VMDQ_ENABLED, + WX_FLAG_VLAN_PROMISC, + WX_FLAG_SRIOV_ENABLED, WX_FLAG_FDIR_CAPABLE, WX_FLAG_FDIR_HASH, WX_FLAG_FDIR_PERFECT, @@ -1128,6 +1208,7 @@ struct wx { struct pci_dev *pdev; struct net_device *netdev; struct wx_bus_info bus; + struct wx_mbx_info mbx; struct wx_mac_info mac; enum em_mac_type mac_type; enum sp_media_type media_type; @@ -1151,6 +1232,7 @@ struct wx { u8 swfw_index; /* PHY stuff */ + bool notify_down; unsigned int link; int speed; int duplex; @@ -1182,6 +1264,8 @@ struct wx { struct wx_ring *tx_ring[64] ____cacheline_aligned_in_smp; struct wx_ring *rx_ring[64]; struct wx_q_vector *q_vector[64]; + int num_rx_pools; + int num_rx_queues_per_pool; unsigned int queues_per_pool; struct msix_entry *msix_q_entries; @@ -1203,6 +1287,7 @@ struct wx { u32 wol; u16 bd_number; + bool default_up; struct wx_hw_stats stats; u64 tx_busy; @@ -1211,10 +1296,16 @@ struct wx { u64 hw_csum_rx_good; u64 hw_csum_rx_error; u64 alloc_rx_buff_failed; + unsigned int num_vfs; + struct vf_data_storage *vfinfo; + struct vf_macvlans vf_mvs; + struct vf_macvlans *mv_list; + unsigned long fwd_bitmask; u32 atr_sample_rate; void (*atr)(struct wx_ring *ring, struct wx_tx_buffer *first, u8 ptype); void (*configure_fdir)(struct wx *wx); + int (*setup_tc)(struct net_device *netdev, u8 tc); void (*do_reset)(struct net_device *netdev); int (*ptp_setup_sdp)(struct wx *wx); diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c index 91b3055a5a9f5..b5022c49dc5ee 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c @@ -15,6 +15,8 @@ #include "../libwx/wx_hw.h" #include "../libwx/wx_lib.h" #include "../libwx/wx_ptp.h" +#include "../libwx/wx_mbx.h" +#include "../libwx/wx_sriov.h" #include "ngbe_type.h" #include "ngbe_mdio.h" #include "ngbe_hw.h" @@ -129,6 +131,10 @@ static int ngbe_sw_init(struct wx *wx) wx->tx_work_limit = NGBE_DEFAULT_TX_WORK; wx->rx_work_limit = NGBE_DEFAULT_RX_WORK; + wx->mbx.size = WX_VXMAILBOX_SIZE; + wx->setup_tc = ngbe_setup_tc; + set_bit(0, &wx->fwd_bitmask); + return 0; } @@ -200,12 +206,10 @@ static irqreturn_t ngbe_intr(int __always_unused irq, void *data) return IRQ_HANDLED; } -static irqreturn_t ngbe_msix_other(int __always_unused irq, void *data) +static irqreturn_t __ngbe_msix_misc(struct wx *wx, u32 eicr) { - struct wx *wx = data; - u32 eicr; - - eicr = wx_misc_isb(wx, WX_ISB_MISC); + if (eicr & NGBE_PX_MISC_IC_VF_MBOX) + wx_msg_task(wx); if (unlikely(eicr & NGBE_PX_MISC_IC_TIMESYNC)) wx_ptp_check_pps_event(wx); @@ -217,6 +221,35 @@ static irqreturn_t ngbe_msix_other(int __always_unused irq, void *data) return IRQ_HANDLED; } +static irqreturn_t ngbe_msix_misc(int __always_unused irq, void *data) +{ + struct wx *wx = data; + u32 eicr; + + eicr = wx_misc_isb(wx, WX_ISB_MISC); + + return __ngbe_msix_misc(wx, eicr); +} + +static irqreturn_t ngbe_misc_and_queue(int __always_unused irq, void *data) +{ + struct wx_q_vector *q_vector; + struct wx *wx = data; + u32 eicr; + + eicr = wx_misc_isb(wx, WX_ISB_MISC); + if (!eicr) { + /* queue */ + q_vector = wx->q_vector[0]; + napi_schedule_irqoff(&q_vector->napi); + if (netif_running(wx->netdev)) + ngbe_irq_enable(wx, true); + return IRQ_HANDLED; + } + + return __ngbe_msix_misc(wx, eicr); +} + /** * ngbe_request_msix_irqs - Initialize MSI-X interrupts * @wx: board private structure @@ -249,8 +282,16 @@ static int ngbe_request_msix_irqs(struct wx *wx) } } - err = request_irq(wx->msix_entry->vector, - ngbe_msix_other, 0, netdev->name, wx); + /* Due to hardware design, when num_vfs < 7, pf can use 0 for misc and 1 + * for queue. But when num_vfs == 7, vector[1] is assigned to vf6. + * Misc and queue should reuse interrupt vector[0]. + */ + if (wx->num_vfs == 7) + err = request_irq(wx->msix_entry->vector, + ngbe_misc_and_queue, 0, netdev->name, wx); + else + err = request_irq(wx->msix_entry->vector, + ngbe_msix_misc, 0, netdev->name, wx); if (err) { wx_err(wx, "request_irq for msix_other failed: %d\n", err); @@ -302,6 +343,22 @@ static void ngbe_disable_device(struct wx *wx) struct net_device *netdev = wx->netdev; u32 i; + if (wx->num_vfs) { + /* Clear EITR Select mapping */ + wr32(wx, WX_PX_ITRSEL, 0); + + /* Mark all the VFs as inactive */ + for (i = 0; i < wx->num_vfs; i++) + wx->vfinfo[i].clear_to_send = 0; + wx->notify_down = true; + /* ping all the active vfs to let them know we are going down */ + wx_ping_all_vfs_with_link_status(wx, false); + wx->notify_down = false; + + /* Disable all VFTE/VFRE TX/RX */ + wx_disable_vf_rx_tx(wx); + } + /* disable all enabled rx queues */ for (i = 0; i < wx->num_rx_queues; i++) /* this call also flushes the previous write */ @@ -324,12 +381,19 @@ static void ngbe_disable_device(struct wx *wx) wx_update_stats(wx); } +static void ngbe_reset(struct wx *wx) +{ + wx_flush_sw_mac_table(wx); + wx_mac_set_default_filter(wx, wx->mac.addr); + if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) + wx_ptp_reset(wx); +} + void ngbe_down(struct wx *wx) { phylink_stop(wx->phylink); ngbe_disable_device(wx); - if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) - wx_ptp_reset(wx); + ngbe_reset(wx); wx_clean_all_tx_rings(wx); wx_clean_all_rx_rings(wx); } @@ -352,6 +416,11 @@ void ngbe_up(struct wx *wx) ngbe_sfp_modules_txrx_powerctl(wx, true); phylink_start(wx->phylink); + /* Set PF Reset Done bit so PF/VF Mail Ops can work */ + wr32m(wx, WX_CFG_PORT_CTL, + WX_CFG_PORT_CTL_PFRSTD, WX_CFG_PORT_CTL_PFRSTD); + if (wx->num_vfs) + wx_ping_all_vfs_with_link_status(wx, false); } /** @@ -518,6 +587,7 @@ static const struct net_device_ops ngbe_netdev_ops = { .ndo_set_rx_mode = wx_set_rx_mode, .ndo_set_features = wx_set_features, .ndo_fix_features = wx_fix_features, + .ndo_features_check = wx_features_check, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = wx_set_mac, .ndo_get_stats64 = wx_get_stats64, @@ -596,6 +666,10 @@ static int ngbe_probe(struct pci_dev *pdev, goto err_pci_release_regions; } + /* The emerald supports up to 8 VFs per pf, but physical + * function also need one pool for basic networking. + */ + pci_sriov_set_totalvfs(pdev, NGBE_MAX_VFS_DRV_LIMIT); wx->driver_name = ngbe_driver_name; ngbe_set_ethtool_ops(netdev); netdev->netdev_ops = &ngbe_netdev_ops; @@ -744,6 +818,7 @@ static void ngbe_remove(struct pci_dev *pdev) struct net_device *netdev; netdev = wx->netdev; + wx_disable_sriov(wx); unregister_netdev(netdev); phylink_destroy(wx->phylink); pci_release_selected_regions(pdev, @@ -803,6 +878,7 @@ static struct pci_driver ngbe_driver = { .suspend = ngbe_suspend, .resume = ngbe_resume, .shutdown = ngbe_shutdown, + .sriov_configure = wx_pci_sriov_configure, }; module_pci_driver(ngbe_driver); diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c index ea1d7e9a91f3d..c63bb6e6f4056 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c @@ -9,6 +9,7 @@ #include "../libwx/wx_type.h" #include "../libwx/wx_ptp.h" #include "../libwx/wx_hw.h" +#include "../libwx/wx_sriov.h" #include "ngbe_type.h" #include "ngbe_mdio.h" @@ -70,6 +71,8 @@ static void ngbe_mac_link_down(struct phylink_config *config, wx->speed = SPEED_UNKNOWN; if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) wx_ptp_reset_cyclecounter(wx); + /* ping all the active vfs to let them know we are going down */ + wx_ping_all_vfs_with_link_status(wx, false); } static void ngbe_mac_link_up(struct phylink_config *config, @@ -114,6 +117,8 @@ static void ngbe_mac_link_up(struct phylink_config *config, wx->last_rx_ptp_check = jiffies; if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) wx_ptp_reset_cyclecounter(wx); + /* ping all the active vfs to let them know we are going up */ + wx_ping_all_vfs_with_link_status(wx, true); } static const struct phylink_mac_ops ngbe_mac_ops = { diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_type.h b/drivers/net/ethernet/wangxun/ngbe/ngbe_type.h index 992adbb98c7d1..bb74263f04985 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_type.h +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_type.h @@ -73,12 +73,14 @@ #define NGBE_PX_MISC_IEN_TIMESYNC BIT(11) #define NGBE_PX_MISC_IEN_ETH_LK BIT(18) #define NGBE_PX_MISC_IEN_INT_ERR BIT(20) +#define NGBE_PX_MISC_IC_VF_MBOX BIT(23) #define NGBE_PX_MISC_IEN_GPIO BIT(26) #define NGBE_PX_MISC_IEN_MASK ( \ NGBE_PX_MISC_IEN_DEV_RST | \ NGBE_PX_MISC_IEN_TIMESYNC | \ NGBE_PX_MISC_IEN_ETH_LK | \ NGBE_PX_MISC_IEN_INT_ERR | \ + NGBE_PX_MISC_IC_VF_MBOX | \ NGBE_PX_MISC_IEN_GPIO) /* Extended Interrupt Cause Read */ @@ -134,6 +136,7 @@ #define NGBE_MAX_RXD 8192 #define NGBE_MIN_RXD 128 +#define NGBE_MAX_VFS_DRV_LIMIT 7 extern char ngbe_driver_name[]; void ngbe_down(struct wx *wx); diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c index 8658a51ee8106..3b9e831cf0ef3 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c @@ -7,6 +7,7 @@ #include "../libwx/wx_type.h" #include "../libwx/wx_lib.h" #include "../libwx/wx_hw.h" +#include "../libwx/wx_sriov.h" #include "txgbe_type.h" #include "txgbe_phy.h" #include "txgbe_irq.h" @@ -109,8 +110,17 @@ static irqreturn_t txgbe_misc_irq_handle(int irq, void *data) struct wx *wx = txgbe->wx; u32 eicr; - if (wx->pdev->msix_enabled) + if (wx->pdev->msix_enabled) { + eicr = wx_misc_isb(wx, WX_ISB_MISC); + if (!eicr) + return IRQ_NONE; + txgbe->eicr = eicr; + if (eicr & TXGBE_PX_MISC_IC_VF_MBOX) { + wx_msg_task(txgbe->wx); + wx_intr_enable(wx, TXGBE_INTR_MISC); + } return IRQ_WAKE_THREAD; + } eicr = wx_misc_isb(wx, WX_ISB_VEC0); if (!eicr) { @@ -129,6 +139,11 @@ static irqreturn_t txgbe_misc_irq_handle(int irq, void *data) q_vector = wx->q_vector[0]; napi_schedule_irqoff(&q_vector->napi); + eicr = wx_misc_isb(wx, WX_ISB_MISC); + if (!eicr) + return IRQ_NONE; + txgbe->eicr = eicr; + return IRQ_WAKE_THREAD; } @@ -140,7 +155,7 @@ static irqreturn_t txgbe_misc_irq_thread_fn(int irq, void *data) unsigned int sub_irq; u32 eicr; - eicr = wx_misc_isb(wx, WX_ISB_MISC); + eicr = txgbe->eicr; if (eicr & (TXGBE_PX_MISC_ETH_LK | TXGBE_PX_MISC_ETH_LKDN | TXGBE_PX_MISC_ETH_AN)) { sub_irq = irq_find_mapping(txgbe->misc.domain, TXGBE_IRQ_LINK); @@ -183,7 +198,7 @@ int txgbe_setup_misc_irq(struct txgbe *txgbe) if (wx->mac.type == wx_mac_aml) goto skip_sp_irq; - txgbe->misc.nirqs = 1; + txgbe->misc.nirqs = TXGBE_IRQ_MAX; txgbe->misc.domain = irq_domain_add_simple(NULL, txgbe->misc.nirqs, 0, &txgbe_misc_irq_domain_ops, txgbe); if (!txgbe->misc.domain) diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index 38206a46693bc..f57d84628e071 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,8 @@ #include "../libwx/wx_lib.h" #include "../libwx/wx_ptp.h" #include "../libwx/wx_hw.h" +#include "../libwx/wx_mbx.h" +#include "../libwx/wx_sriov.h" #include "txgbe_type.h" #include "txgbe_hw.h" #include "txgbe_phy.h" @@ -117,6 +120,12 @@ static void txgbe_up_complete(struct wx *wx) /* enable transmits */ netif_tx_start_all_queues(netdev); + + /* Set PF Reset Done bit so PF/VF Mail Ops can work */ + wr32m(wx, WX_CFG_PORT_CTL, WX_CFG_PORT_CTL_PFRSTD, + WX_CFG_PORT_CTL_PFRSTD); + /* update setting rx tx for all active vfs */ + wx_set_all_vfs(wx); } static void txgbe_reset(struct wx *wx) @@ -165,6 +174,16 @@ static void txgbe_disable_device(struct wx *wx) wx_err(wx, "%s: invalid bus lan id %d\n", __func__, wx->bus.func); + if (wx->num_vfs) { + /* Clear EITR Select mapping */ + wr32(wx, WX_PX_ITRSEL, 0); + /* Mark all the VFs as inactive */ + for (i = 0; i < wx->num_vfs; i++) + wx->vfinfo[i].clear_to_send = 0; + /* update setting rx tx for all active vfs */ + wx_set_all_vfs(wx); + } + if (!(((wx->subsystem_device_id & WX_NCSI_MASK) == WX_NCSI_SUP) || ((wx->subsystem_device_id & WX_WOL_MASK) == WX_WOL_SUP))) { /* disable mac transmiter */ @@ -307,12 +326,15 @@ static int txgbe_sw_init(struct wx *wx) /* set default ring sizes */ wx->tx_ring_count = TXGBE_DEFAULT_TXD; wx->rx_ring_count = TXGBE_DEFAULT_RXD; + wx->mbx.size = WX_VXMAILBOX_SIZE; /* set default work limits */ wx->tx_work_limit = TXGBE_DEFAULT_TX_WORK; wx->rx_work_limit = TXGBE_DEFAULT_RX_WORK; + wx->setup_tc = txgbe_setup_tc; wx->do_reset = txgbe_do_reset; + set_bit(0, &wx->fwd_bitmask); switch (wx->mac.type) { case wx_mac_sp: @@ -516,6 +538,39 @@ void txgbe_do_reset(struct net_device *netdev) txgbe_reset(wx); } +static int txgbe_udp_tunnel_sync(struct net_device *dev, unsigned int table) +{ + struct wx *wx = netdev_priv(dev); + struct udp_tunnel_info ti; + + udp_tunnel_nic_get_port(dev, table, 0, &ti); + switch (ti.type) { + case UDP_TUNNEL_TYPE_VXLAN: + wr32(wx, TXGBE_CFG_VXLAN, ntohs(ti.port)); + break; + case UDP_TUNNEL_TYPE_VXLAN_GPE: + wr32(wx, TXGBE_CFG_VXLAN_GPE, ntohs(ti.port)); + break; + case UDP_TUNNEL_TYPE_GENEVE: + wr32(wx, TXGBE_CFG_GENEVE, ntohs(ti.port)); + break; + default: + break; + } + + return 0; +} + +static const struct udp_tunnel_nic_info txgbe_udp_tunnels = { + .sync_table = txgbe_udp_tunnel_sync, + .flags = UDP_TUNNEL_NIC_INFO_OPEN_ONLY, + .tables = { + { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, + { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN_GPE, }, + { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, }, + }, +}; + static const struct net_device_ops txgbe_netdev_ops = { .ndo_open = txgbe_open, .ndo_stop = txgbe_close, @@ -524,6 +579,7 @@ static const struct net_device_ops txgbe_netdev_ops = { .ndo_set_rx_mode = wx_set_rx_mode, .ndo_set_features = wx_set_features, .ndo_fix_features = wx_fix_features, + .ndo_features_check = wx_features_check, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = wx_set_mac, .ndo_get_stats64 = wx_get_stats64, @@ -604,9 +660,14 @@ static int txgbe_probe(struct pci_dev *pdev, goto err_pci_release_regions; } + /* The sapphire supports up to 63 VFs per pf, but physical + * function also need one pool for basic networking. + */ + pci_sriov_set_totalvfs(pdev, TXGBE_MAX_VFS_DRV_LIMIT); wx->driver_name = txgbe_driver_name; txgbe_set_ethtool_ops(netdev); netdev->netdev_ops = &txgbe_netdev_ops; + netdev->udp_tunnel_nic_info = &txgbe_udp_tunnels; /* setup the private structure */ err = txgbe_sw_init(wx); @@ -652,6 +713,7 @@ static int txgbe_probe(struct pci_dev *pdev, netdev->features |= NETIF_F_HIGHDMA; netdev->hw_features |= NETIF_F_GRO; netdev->features |= NETIF_F_GRO; + netdev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; netdev->priv_flags |= IFF_UNICAST_FLT; netdev->priv_flags |= IFF_SUPP_NOFCS; @@ -795,6 +857,7 @@ static void txgbe_remove(struct pci_dev *pdev) struct net_device *netdev; netdev = wx->netdev; + wx_disable_sriov(wx); unregister_netdev(netdev); txgbe_remove_phy(txgbe); @@ -817,11 +880,12 @@ static struct pci_driver txgbe_driver = { .probe = txgbe_probe, .remove = txgbe_remove, .shutdown = txgbe_shutdown, + .sriov_configure = wx_pci_sriov_configure, }; module_pci_driver(txgbe_driver); MODULE_DEVICE_TABLE(pci, txgbe_pci_tbl); MODULE_AUTHOR("Beijing WangXun Technology Co., Ltd, "); -MODULE_DESCRIPTION("WangXun(R) 10 Gigabit PCI Express Network Driver"); +MODULE_DESCRIPTION("WangXun(R) 10/25/40 Gigabit PCI Express Network Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c index 85f022ceef4fe..1863cfd27ee79 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c @@ -16,6 +16,8 @@ #include "../libwx/wx_type.h" #include "../libwx/wx_lib.h" #include "../libwx/wx_ptp.h" +#include "../libwx/wx_sriov.h" +#include "../libwx/wx_mbx.h" #include "../libwx/wx_hw.h" #include "txgbe_type.h" #include "txgbe_phy.h" @@ -184,6 +186,8 @@ static void txgbe_mac_link_down(struct phylink_config *config, wx->speed = SPEED_UNKNOWN; if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) wx_ptp_reset_cyclecounter(wx); + /* ping all the active vfs to let them know we are going down */ + wx_ping_all_vfs_with_link_status(wx, false); } static void txgbe_mac_link_up(struct phylink_config *config, @@ -225,6 +229,8 @@ static void txgbe_mac_link_up(struct phylink_config *config, wx->last_rx_ptp_check = jiffies; if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) wx_ptp_reset_cyclecounter(wx); + /* ping all the active vfs to let them know we are going up */ + wx_ping_all_vfs_with_link_status(wx, true); } static int txgbe_mac_prepare(struct phylink_config *config, unsigned int mode, diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h index 9c1c26234cad9..cb553318641da 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h @@ -77,15 +77,20 @@ #define TXGBE_PX_MISC_ETH_LK BIT(18) #define TXGBE_PX_MISC_ETH_AN BIT(19) #define TXGBE_PX_MISC_INT_ERR BIT(20) +#define TXGBE_PX_MISC_IC_VF_MBOX BIT(23) #define TXGBE_PX_MISC_GPIO BIT(26) #define TXGBE_PX_MISC_IEN_MASK \ (TXGBE_PX_MISC_ETH_LKDN | TXGBE_PX_MISC_DEV_RST | \ TXGBE_PX_MISC_ETH_EVENT | TXGBE_PX_MISC_ETH_LK | \ - TXGBE_PX_MISC_ETH_AN | TXGBE_PX_MISC_INT_ERR) + TXGBE_PX_MISC_ETH_AN | TXGBE_PX_MISC_INT_ERR | \ + TXGBE_PX_MISC_IC_VF_MBOX) /* Port cfg registers */ #define TXGBE_CFG_PORT_ST 0x14404 #define TXGBE_CFG_PORT_ST_LINK_UP BIT(0) +#define TXGBE_CFG_VXLAN 0x14410 +#define TXGBE_CFG_VXLAN_GPE 0x14414 +#define TXGBE_CFG_GENEVE 0x14418 /* I2C registers */ #define TXGBE_I2C_BASE 0x14900 @@ -174,6 +179,8 @@ #define TXGBE_SP_RX_PB_SIZE 512 #define TXGBE_SP_TDB_PB_SZ (160 * 1024) /* 160KB Packet Buffer */ +#define TXGBE_MAX_VFS_DRV_LIMIT 63 + #define TXGBE_DEFAULT_ATR_SAMPLE_RATE 20 /* Software ATR hash keys */ @@ -348,6 +355,7 @@ struct txgbe { struct clk *clk; struct gpio_chip *gpio; unsigned int link_irq; + u32 eicr; /* flow director */ struct hlist_head fdir_filter_list; diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 054abf283ab33..1b7a653c1f4ed 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2980,7 +2980,7 @@ static int axienet_probe(struct platform_device *pdev) } } if (!IS_ENABLED(CONFIG_64BIT) && lp->features & XAE_FEATURE_DMA_64BIT) { - dev_err(&pdev->dev, "64-bit addressable DMA is not compatible with 32-bit archecture\n"); + dev_err(&pdev->dev, "64-bit addressable DMA is not compatible with 32-bit architecture\n"); ret = -EINVAL; goto cleanup_clk; } diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 66e38ce9cd1d3..ffc15a4326899 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1946,22 +1946,14 @@ static __net_init int geneve_init_net(struct net *net) return 0; } -static void geneve_destroy_tunnels(struct net *net, struct list_head *head) +static void __net_exit geneve_exit_rtnl_net(struct net *net, + struct list_head *dev_to_kill) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *geneve, *next; list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) - geneve_dellink(geneve->dev, head); -} - -static void __net_exit geneve_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - list_for_each_entry(net, net_list, exit_list) - geneve_destroy_tunnels(net, dev_to_kill); + geneve_dellink(geneve->dev, dev_to_kill); } static void __net_exit geneve_exit_net(struct net *net) @@ -1973,7 +1965,7 @@ static void __net_exit geneve_exit_net(struct net *net) static struct pernet_operations geneve_net_ops = { .init = geneve_init_net, - .exit_batch_rtnl = geneve_exit_batch_rtnl, + .exit_rtnl = geneve_exit_rtnl_net, .exit = geneve_exit_net, .id = &geneve_net_id, .size = sizeof(struct geneve_net), diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index ef793607890d9..d4dec741c7f44 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -2475,23 +2475,19 @@ static int __net_init gtp_net_init(struct net *net) return 0; } -static void __net_exit gtp_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit gtp_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; - - list_for_each_entry(net, net_list, exit_list) { - struct gtp_net *gn = net_generic(net, gtp_net_id); - struct gtp_dev *gtp, *gtp_next; + struct gtp_net *gn = net_generic(net, gtp_net_id); + struct gtp_dev *gtp, *gtp_next; - list_for_each_entry_safe(gtp, gtp_next, &gn->gtp_dev_list, list) - gtp_dellink(gtp->dev, dev_to_kill); - } + list_for_each_entry_safe(gtp, gtp_next, &gn->gtp_dev_list, list) + gtp_dellink(gtp->dev, dev_to_kill); } static struct pernet_operations gtp_net_ops = { .init = gtp_net_init, - .exit_batch_rtnl = gtp_net_exit_batch_rtnl, + .exit_rtnl = gtp_net_exit_rtnl, .id = >p_net_id, .size = sizeof(struct gtp_net), }; diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 9e366f275406d..5fda7a0fcce06 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -1074,7 +1074,7 @@ static int baycom_siocdevprivate(struct net_device *dev, struct ifreq *ifr, return 0; case HDLCDRVCTL_DRIVERNAME: - strscpy_pad(hi.data.drivername, "baycom_epp", sizeof(hi.data.drivername)); + strscpy_pad(hi.data.drivername, "baycom_epp"); break; case HDLCDRVCTL_GETMODE: @@ -1091,8 +1091,7 @@ static int baycom_siocdevprivate(struct net_device *dev, struct ifreq *ifr, return baycom_setmode(bc, hi.data.modename); case HDLCDRVCTL_MODELIST: - strscpy_pad(hi.data.modename, "intclk,extclk,intmodem,extmodem,divider=x", - sizeof(hi.data.modename)); + strscpy_pad(hi.data.modename, "intclk,extclk,intmodem,extmodem,divider=x"); break; case HDLCDRVCTL_MODEMPARMASK: diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index ca62188a317ad..e3e65772c5991 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -219,7 +219,7 @@ void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type) unsigned int ipvlan_mac_hash(const unsigned char *addr) { - u32 hash = jhash_1word(__get_unaligned_cpu32(addr+2), + u32 hash = jhash_1word(get_unaligned((u32 *)(addr + 2)), ipvlan_jhash_secret); return hash & IPVLAN_MAC_FILTER_MASK; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d0dfa6bca6cc2..7045b1d587541 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -254,7 +254,7 @@ static u32 macvlan_hash_mix(const struct macvlan_dev *vlan) static unsigned int mc_hash(const struct macvlan_dev *vlan, const unsigned char *addr) { - u32 val = __get_unaligned_cpu32(addr + 2); + u32 val = get_unaligned((u32 *)(addr + 2)); val ^= macvlan_hash_mix(vlan); return hash_32(val, MACVLAN_MC_FILTER_BITS); diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig index 4a7a303be2f74..058fcdaf6c185 100644 --- a/drivers/net/mdio/Kconfig +++ b/drivers/net/mdio/Kconfig @@ -185,6 +185,13 @@ config MDIO_IPQ8064 This driver supports the MDIO interface found in the network interface units of the IPQ8064 SoC +config MDIO_REALTEK_RTL9300 + tristate "Realtek RTL9300 MDIO interface support" + depends on MACH_REALTEK_RTL || COMPILE_TEST + help + This driver supports the MDIO interface found in the Realtek + RTL9300 family of Ethernet switches with integrated SoC. + config MDIO_REGMAP tristate help diff --git a/drivers/net/mdio/Makefile b/drivers/net/mdio/Makefile index 1015f0db4531d..c23778e73890c 100644 --- a/drivers/net/mdio/Makefile +++ b/drivers/net/mdio/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_MDIO_MOXART) += mdio-moxart.o obj-$(CONFIG_MDIO_MSCC_MIIM) += mdio-mscc-miim.o obj-$(CONFIG_MDIO_MVUSB) += mdio-mvusb.o obj-$(CONFIG_MDIO_OCTEON) += mdio-octeon.o +obj-$(CONFIG_MDIO_REALTEK_RTL9300) += mdio-realtek-rtl9300.o obj-$(CONFIG_MDIO_REGMAP) += mdio-regmap.o obj-$(CONFIG_MDIO_SUN4I) += mdio-sun4i.o obj-$(CONFIG_MDIO_THUNDER) += mdio-thunder.o diff --git a/drivers/net/mdio/mdio-realtek-rtl9300.c b/drivers/net/mdio/mdio-realtek-rtl9300.c new file mode 100644 index 0000000000000..33694c3ff9a71 --- /dev/null +++ b/drivers/net/mdio/mdio-realtek-rtl9300.c @@ -0,0 +1,522 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MDIO controller for RTL9300 switches with integrated SoC. + * + * The MDIO communication is abstracted by the switch. At the software level + * communication uses the switch port to address the PHY. We work out the + * mapping based on the MDIO bus described in device tree and phandles on the + * ethernet-ports property. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SMI_GLB_CTRL 0xca00 +#define GLB_CTRL_INTF_SEL(intf) BIT(16 + (intf)) +#define SMI_PORT0_15_POLLING_SEL 0xca08 +#define SMI_ACCESS_PHY_CTRL_0 0xcb70 +#define SMI_ACCESS_PHY_CTRL_1 0xcb74 +#define PHY_CTRL_REG_ADDR GENMASK(24, 20) +#define PHY_CTRL_PARK_PAGE GENMASK(19, 15) +#define PHY_CTRL_MAIN_PAGE GENMASK(14, 3) +#define PHY_CTRL_WRITE BIT(2) +#define PHY_CTRL_READ 0 +#define PHY_CTRL_TYPE_C45 BIT(1) +#define PHY_CTRL_TYPE_C22 0 +#define PHY_CTRL_CMD BIT(0) +#define PHY_CTRL_FAIL BIT(25) +#define SMI_ACCESS_PHY_CTRL_2 0xcb78 +#define PHY_CTRL_INDATA GENMASK(31, 16) +#define PHY_CTRL_DATA GENMASK(15, 0) +#define SMI_ACCESS_PHY_CTRL_3 0xcb7c +#define PHY_CTRL_MMD_DEVAD GENMASK(20, 16) +#define PHY_CTRL_MMD_REG GENMASK(15, 0) +#define SMI_PORT0_5_ADDR_CTRL 0xcb80 + +#define MAX_PORTS 28 +#define MAX_SMI_BUSSES 4 +#define MAX_SMI_ADDR 0x1f + +struct rtl9300_mdio_priv { + struct regmap *regmap; + struct mutex lock; /* protect HW access */ + DECLARE_BITMAP(valid_ports, MAX_PORTS); + u8 smi_bus[MAX_PORTS]; + u8 smi_addr[MAX_PORTS]; + bool smi_bus_is_c45[MAX_SMI_BUSSES]; + struct mii_bus *bus[MAX_SMI_BUSSES]; +}; + +struct rtl9300_mdio_chan { + struct rtl9300_mdio_priv *priv; + u8 mdio_bus; +}; + +static int rtl9300_mdio_phy_to_port(struct mii_bus *bus, int phy_id) +{ + struct rtl9300_mdio_chan *chan = bus->priv; + struct rtl9300_mdio_priv *priv; + int i; + + priv = chan->priv; + + for_each_set_bit(i, priv->valid_ports, MAX_PORTS) + if (priv->smi_bus[i] == chan->mdio_bus && + priv->smi_addr[i] == phy_id) + return i; + + return -ENOENT; +} + +static int rtl9300_mdio_wait_ready(struct rtl9300_mdio_priv *priv) +{ + struct regmap *regmap = priv->regmap; + u32 val; + + lockdep_assert_held(&priv->lock); + + return regmap_read_poll_timeout(regmap, SMI_ACCESS_PHY_CTRL_1, + val, !(val & PHY_CTRL_CMD), 10, 1000); +} + +static int rtl9300_mdio_read_c22(struct mii_bus *bus, int phy_id, int regnum) +{ + struct rtl9300_mdio_chan *chan = bus->priv; + struct rtl9300_mdio_priv *priv; + struct regmap *regmap; + int port; + u32 val; + int err; + + priv = chan->priv; + regmap = priv->regmap; + + port = rtl9300_mdio_phy_to_port(bus, phy_id); + if (port < 0) + return port; + + mutex_lock(&priv->lock); + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_2, FIELD_PREP(PHY_CTRL_INDATA, port)); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_REG_ADDR, regnum) | + FIELD_PREP(PHY_CTRL_PARK_PAGE, 0x1f) | + FIELD_PREP(PHY_CTRL_MAIN_PAGE, 0xfff) | + PHY_CTRL_READ | PHY_CTRL_TYPE_C22 | PHY_CTRL_CMD; + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_1, val); + if (err) + goto out_err; + + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + err = regmap_read(regmap, SMI_ACCESS_PHY_CTRL_2, &val); + if (err) + goto out_err; + + mutex_unlock(&priv->lock); + return FIELD_GET(PHY_CTRL_DATA, val); + +out_err: + mutex_unlock(&priv->lock); + return err; +} + +static int rtl9300_mdio_write_c22(struct mii_bus *bus, int phy_id, int regnum, u16 value) +{ + struct rtl9300_mdio_chan *chan = bus->priv; + struct rtl9300_mdio_priv *priv; + struct regmap *regmap; + int port; + u32 val; + int err; + + priv = chan->priv; + regmap = priv->regmap; + + port = rtl9300_mdio_phy_to_port(bus, phy_id); + if (port < 0) + return port; + + mutex_lock(&priv->lock); + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_0, BIT(port)); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_2, FIELD_PREP(PHY_CTRL_INDATA, value)); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_REG_ADDR, regnum) | + FIELD_PREP(PHY_CTRL_PARK_PAGE, 0x1f) | + FIELD_PREP(PHY_CTRL_MAIN_PAGE, 0xfff) | + PHY_CTRL_WRITE | PHY_CTRL_TYPE_C22 | PHY_CTRL_CMD; + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_1, val); + if (err) + goto out_err; + + err = regmap_read_poll_timeout(regmap, SMI_ACCESS_PHY_CTRL_1, + val, !(val & PHY_CTRL_CMD), 10, 100); + if (err) + goto out_err; + + if (val & PHY_CTRL_FAIL) { + err = -ENXIO; + goto out_err; + } + + mutex_unlock(&priv->lock); + return 0; + +out_err: + mutex_unlock(&priv->lock); + return err; +} + +static int rtl9300_mdio_read_c45(struct mii_bus *bus, int phy_id, int dev_addr, int regnum) +{ + struct rtl9300_mdio_chan *chan = bus->priv; + struct rtl9300_mdio_priv *priv; + struct regmap *regmap; + int port; + u32 val; + int err; + + priv = chan->priv; + regmap = priv->regmap; + + port = rtl9300_mdio_phy_to_port(bus, phy_id); + if (port < 0) + return port; + + mutex_lock(&priv->lock); + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_INDATA, port); + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_2, val); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_MMD_DEVAD, dev_addr) | + FIELD_PREP(PHY_CTRL_MMD_REG, regnum); + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_3, val); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_1, + PHY_CTRL_READ | PHY_CTRL_TYPE_C45 | PHY_CTRL_CMD); + if (err) + goto out_err; + + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + err = regmap_read(regmap, SMI_ACCESS_PHY_CTRL_2, &val); + if (err) + goto out_err; + + mutex_unlock(&priv->lock); + return FIELD_GET(PHY_CTRL_DATA, val); + +out_err: + mutex_unlock(&priv->lock); + return err; +} + +static int rtl9300_mdio_write_c45(struct mii_bus *bus, int phy_id, int dev_addr, + int regnum, u16 value) +{ + struct rtl9300_mdio_chan *chan = bus->priv; + struct rtl9300_mdio_priv *priv; + struct regmap *regmap; + int port; + u32 val; + int err; + + priv = chan->priv; + regmap = priv->regmap; + + port = rtl9300_mdio_phy_to_port(bus, phy_id); + if (port < 0) + return port; + + mutex_lock(&priv->lock); + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_0, BIT(port)); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_INDATA, value); + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_2, val); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_MMD_DEVAD, dev_addr) | + FIELD_PREP(PHY_CTRL_MMD_REG, regnum); + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_3, val); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_1, + PHY_CTRL_TYPE_C45 | PHY_CTRL_WRITE | PHY_CTRL_CMD); + if (err) + goto out_err; + + err = regmap_read_poll_timeout(regmap, SMI_ACCESS_PHY_CTRL_1, + val, !(val & PHY_CTRL_CMD), 10, 100); + if (err) + goto out_err; + + if (val & PHY_CTRL_FAIL) { + err = -ENXIO; + goto out_err; + } + + mutex_unlock(&priv->lock); + return 0; + +out_err: + mutex_unlock(&priv->lock); + return err; +} + +static int rtl9300_mdiobus_init(struct rtl9300_mdio_priv *priv) +{ + u32 glb_ctrl_mask = 0, glb_ctrl_val = 0; + struct regmap *regmap = priv->regmap; + u32 port_addr[5] = { 0 }; + u32 poll_sel[2] = { 0 }; + int i, err; + + /* Associate the port with the SMI interface and PHY */ + for_each_set_bit(i, priv->valid_ports, MAX_PORTS) { + int pos; + + pos = (i % 6) * 5; + port_addr[i / 6] |= (priv->smi_addr[i] & 0x1f) << pos; + + pos = (i % 16) * 2; + poll_sel[i / 16] |= (priv->smi_bus[i] & 0x3) << pos; + } + + /* Put the interfaces into C45 mode if required */ + glb_ctrl_mask = GENMASK(19, 16); + for (i = 0; i < MAX_SMI_BUSSES; i++) + if (priv->smi_bus_is_c45[i]) + glb_ctrl_val |= GLB_CTRL_INTF_SEL(i); + + err = regmap_bulk_write(regmap, SMI_PORT0_5_ADDR_CTRL, + port_addr, 5); + if (err) + return err; + + err = regmap_bulk_write(regmap, SMI_PORT0_15_POLLING_SEL, + poll_sel, 2); + if (err) + return err; + + err = regmap_update_bits(regmap, SMI_GLB_CTRL, + glb_ctrl_mask, glb_ctrl_val); + if (err) + return err; + + return 0; +} + +static int rtl9300_mdiobus_probe_one(struct device *dev, struct rtl9300_mdio_priv *priv, + struct fwnode_handle *node) +{ + struct rtl9300_mdio_chan *chan; + struct fwnode_handle *child; + struct mii_bus *bus; + u32 mdio_bus; + int err; + + err = fwnode_property_read_u32(node, "reg", &mdio_bus); + if (err) + return err; + + /* The MDIO accesses from the kernel work with the PHY polling unit in + * the switch. We need to tell the PPU to operate either in GPHY (i.e. + * clause 22) or 10GPHY mode (i.e. clause 45). + * + * We select 10GPHY mode if there is at least one PHY that declares + * compatible = "ethernet-phy-ieee802.3-c45". This does mean we can't + * support both c45 and c22 on the same MDIO bus. + */ + fwnode_for_each_child_node(node, child) + if (fwnode_device_is_compatible(child, "ethernet-phy-ieee802.3-c45")) + priv->smi_bus_is_c45[mdio_bus] = true; + + bus = devm_mdiobus_alloc_size(dev, sizeof(*chan)); + if (!bus) + return -ENOMEM; + + bus->name = "Realtek Switch MDIO Bus"; + if (priv->smi_bus_is_c45[mdio_bus]) { + bus->read_c45 = rtl9300_mdio_read_c45; + bus->write_c45 = rtl9300_mdio_write_c45; + } else { + bus->read = rtl9300_mdio_read_c22; + bus->write = rtl9300_mdio_write_c22; + } + bus->parent = dev; + chan = bus->priv; + chan->mdio_bus = mdio_bus; + chan->priv = priv; + + snprintf(bus->id, MII_BUS_ID_SIZE, "%s-%d", dev_name(dev), mdio_bus); + + err = devm_of_mdiobus_register(dev, bus, to_of_node(node)); + if (err) + return dev_err_probe(dev, err, "cannot register MDIO bus\n"); + + return 0; +} + +/* The mdio-controller is part of a switch block so we parse the sibling + * ethernet-ports node and build a mapping of the switch port to MDIO bus/addr + * based on the phy-handle. + */ +static int rtl9300_mdiobus_map_ports(struct device *dev) +{ + struct rtl9300_mdio_priv *priv = dev_get_drvdata(dev); + struct device *parent = dev->parent; + struct fwnode_handle *port; + int err; + + struct fwnode_handle *ports __free(fwnode_handle) = + device_get_named_child_node(parent, "ethernet-ports"); + if (!ports) + return dev_err_probe(dev, -EINVAL, "%pfwP missing ethernet-ports\n", + dev_fwnode(parent)); + + fwnode_for_each_child_node(ports, port) { + struct device_node *mdio_dn; + u32 addr; + u32 bus; + u32 pn; + + struct device_node *phy_dn __free(device_node) = + of_parse_phandle(to_of_node(port), "phy-handle", 0); + /* skip ports without phys */ + if (!phy_dn) + continue; + + mdio_dn = phy_dn->parent; + /* only map ports that are connected to this mdio-controller */ + if (mdio_dn->parent != dev->of_node) + continue; + + err = fwnode_property_read_u32(port, "reg", &pn); + if (err) + return err; + + if (pn >= MAX_PORTS) + return dev_err_probe(dev, -EINVAL, "illegal port number %d\n", pn); + + if (test_bit(pn, priv->valid_ports)) + return dev_err_probe(dev, -EINVAL, "duplicated port number %d\n", pn); + + err = of_property_read_u32(mdio_dn, "reg", &bus); + if (err) + return err; + + if (bus >= MAX_SMI_BUSSES) + return dev_err_probe(dev, -EINVAL, "illegal smi bus number %d\n", bus); + + err = of_property_read_u32(phy_dn, "reg", &addr); + if (err) + return err; + + __set_bit(pn, priv->valid_ports); + priv->smi_bus[pn] = bus; + priv->smi_addr[pn] = addr; + } + + return 0; +} + +static int rtl9300_mdiobus_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rtl9300_mdio_priv *priv; + struct fwnode_handle *child; + int err; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + err = devm_mutex_init(dev, &priv->lock); + if (err) + return err; + + priv->regmap = syscon_node_to_regmap(dev->parent->of_node); + if (IS_ERR(priv->regmap)) + return PTR_ERR(priv->regmap); + + platform_set_drvdata(pdev, priv); + + err = rtl9300_mdiobus_map_ports(dev); + if (err) + return err; + + device_for_each_child_node(dev, child) { + err = rtl9300_mdiobus_probe_one(dev, priv, child); + if (err) + return err; + } + + err = rtl9300_mdiobus_init(priv); + if (err) + return dev_err_probe(dev, err, "failed to initialise MDIO bus controller\n"); + + return 0; +} + +static const struct of_device_id rtl9300_mdio_ids[] = { + { .compatible = "realtek,rtl9301-mdio" }, + {} +}; +MODULE_DEVICE_TABLE(of, rtl9300_mdio_ids); + +static struct platform_driver rtl9300_mdio_driver = { + .probe = rtl9300_mdiobus_probe, + .driver = { + .name = "mdio-rtl9300", + .of_match_table = rtl9300_mdio_ids, + }, +}; + +module_platform_driver(rtl9300_mdio_driver); + +MODULE_DESCRIPTION("RTL9300 MDIO driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile new file mode 100644 index 0000000000000..229be66167e1f --- /dev/null +++ b/drivers/net/ovpn/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# ovpn -- OpenVPN data channel offload in kernel space +# +# Copyright (C) 2020-2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +obj-$(CONFIG_OVPN) := ovpn.o +ovpn-y += bind.o +ovpn-y += crypto.o +ovpn-y += crypto_aead.o +ovpn-y += main.o +ovpn-y += io.o +ovpn-y += netlink.o +ovpn-y += netlink-gen.o +ovpn-y += peer.o +ovpn-y += pktid.o +ovpn-y += socket.o +ovpn-y += stats.o +ovpn-y += tcp.o +ovpn-y += udp.o diff --git a/drivers/net/ovpn/bind.c b/drivers/net/ovpn/bind.c new file mode 100644 index 0000000000000..24d2788a277e6 --- /dev/null +++ b/drivers/net/ovpn/bind.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2012-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include + +#include "ovpnpriv.h" +#include "bind.h" +#include "peer.h" + +/** + * ovpn_bind_from_sockaddr - retrieve binding matching sockaddr + * @ss: the sockaddr to match + * + * Return: the bind matching the passed sockaddr if found, NULL otherwise + */ +struct ovpn_bind *ovpn_bind_from_sockaddr(const struct sockaddr_storage *ss) +{ + struct ovpn_bind *bind; + size_t sa_len; + + if (ss->ss_family == AF_INET) + sa_len = sizeof(struct sockaddr_in); + else if (ss->ss_family == AF_INET6) + sa_len = sizeof(struct sockaddr_in6); + else + return ERR_PTR(-EAFNOSUPPORT); + + bind = kzalloc(sizeof(*bind), GFP_ATOMIC); + if (unlikely(!bind)) + return ERR_PTR(-ENOMEM); + + memcpy(&bind->remote, ss, sa_len); + + return bind; +} + +/** + * ovpn_bind_reset - assign new binding to peer + * @peer: the peer whose binding has to be replaced + * @new: the new bind to assign + */ +void ovpn_bind_reset(struct ovpn_peer *peer, struct ovpn_bind *new) +{ + lockdep_assert_held(&peer->lock); + + kfree_rcu(rcu_replace_pointer(peer->bind, new, + lockdep_is_held(&peer->lock)), rcu); +} diff --git a/drivers/net/ovpn/bind.h b/drivers/net/ovpn/bind.h new file mode 100644 index 0000000000000..4e0b8398bfd9e --- /dev/null +++ b/drivers/net/ovpn/bind.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2012-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPNBIND_H_ +#define _NET_OVPN_OVPNBIND_H_ + +#include +#include +#include +#include +#include +#include + +struct ovpn_peer; + +/** + * union ovpn_sockaddr - basic transport layer address + * @in4: IPv4 address + * @in6: IPv6 address + */ +union ovpn_sockaddr { + struct sockaddr_in in4; + struct sockaddr_in6 in6; +}; + +/** + * struct ovpn_bind - remote peer binding + * @remote: the remote peer sockaddress + * @local: local endpoint used to talk to the peer + * @local.ipv4: local IPv4 used to talk to the peer + * @local.ipv6: local IPv6 used to talk to the peer + * @rcu: used to schedule RCU cleanup job + */ +struct ovpn_bind { + union ovpn_sockaddr remote; /* remote sockaddr */ + + union { + struct in_addr ipv4; + struct in6_addr ipv6; + } local; + + struct rcu_head rcu; +}; + +/** + * ovpn_bind_skb_src_match - match packet source with binding + * @bind: the binding to match + * @skb: the packet to match + * + * Return: true if the packet source matches the remote peer sockaddr + * in the binding + */ +static inline bool ovpn_bind_skb_src_match(const struct ovpn_bind *bind, + const struct sk_buff *skb) +{ + const union ovpn_sockaddr *remote; + + if (unlikely(!bind)) + return false; + + remote = &bind->remote; + + switch (skb->protocol) { + case htons(ETH_P_IP): + if (unlikely(remote->in4.sin_family != AF_INET)) + return false; + + if (unlikely(remote->in4.sin_addr.s_addr != ip_hdr(skb)->saddr)) + return false; + + if (unlikely(remote->in4.sin_port != udp_hdr(skb)->source)) + return false; + break; + case htons(ETH_P_IPV6): + if (unlikely(remote->in6.sin6_family != AF_INET6)) + return false; + + if (unlikely(!ipv6_addr_equal(&remote->in6.sin6_addr, + &ipv6_hdr(skb)->saddr))) + return false; + + if (unlikely(remote->in6.sin6_port != udp_hdr(skb)->source)) + return false; + break; + default: + return false; + } + + return true; +} + +struct ovpn_bind *ovpn_bind_from_sockaddr(const struct sockaddr_storage *sa); +void ovpn_bind_reset(struct ovpn_peer *peer, struct ovpn_bind *bind); + +#endif /* _NET_OVPN_OVPNBIND_H_ */ diff --git a/drivers/net/ovpn/crypto.c b/drivers/net/ovpn/crypto.c new file mode 100644 index 0000000000000..90580e32052fb --- /dev/null +++ b/drivers/net/ovpn/crypto.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "pktid.h" +#include "crypto_aead.h" +#include "crypto.h" + +static void ovpn_ks_destroy_rcu(struct rcu_head *head) +{ + struct ovpn_crypto_key_slot *ks; + + ks = container_of(head, struct ovpn_crypto_key_slot, rcu); + ovpn_aead_crypto_key_slot_destroy(ks); +} + +void ovpn_crypto_key_slot_release(struct kref *kref) +{ + struct ovpn_crypto_key_slot *ks; + + ks = container_of(kref, struct ovpn_crypto_key_slot, refcount); + call_rcu(&ks->rcu, ovpn_ks_destroy_rcu); +} + +/* can only be invoked when all peer references have been dropped (i.e. RCU + * release routine) + */ +void ovpn_crypto_state_release(struct ovpn_crypto_state *cs) +{ + struct ovpn_crypto_key_slot *ks; + + ks = rcu_access_pointer(cs->slots[0]); + if (ks) { + RCU_INIT_POINTER(cs->slots[0], NULL); + ovpn_crypto_key_slot_put(ks); + } + + ks = rcu_access_pointer(cs->slots[1]); + if (ks) { + RCU_INIT_POINTER(cs->slots[1], NULL); + ovpn_crypto_key_slot_put(ks); + } +} + +/* removes the key matching the specified id from the crypto context */ +bool ovpn_crypto_kill_key(struct ovpn_crypto_state *cs, u8 key_id) +{ + struct ovpn_crypto_key_slot *ks = NULL; + + spin_lock_bh(&cs->lock); + if (rcu_access_pointer(cs->slots[0])->key_id == key_id) { + ks = rcu_replace_pointer(cs->slots[0], NULL, + lockdep_is_held(&cs->lock)); + } else if (rcu_access_pointer(cs->slots[1])->key_id == key_id) { + ks = rcu_replace_pointer(cs->slots[1], NULL, + lockdep_is_held(&cs->lock)); + } + spin_unlock_bh(&cs->lock); + + if (ks) + ovpn_crypto_key_slot_put(ks); + + /* let the caller know if a key was actually killed */ + return ks; +} + +/* Reset the ovpn_crypto_state object in a way that is atomic + * to RCU readers. + */ +int ovpn_crypto_state_reset(struct ovpn_crypto_state *cs, + const struct ovpn_peer_key_reset *pkr) +{ + struct ovpn_crypto_key_slot *old = NULL, *new; + u8 idx; + + if (pkr->slot != OVPN_KEY_SLOT_PRIMARY && + pkr->slot != OVPN_KEY_SLOT_SECONDARY) + return -EINVAL; + + new = ovpn_aead_crypto_key_slot_new(&pkr->key); + if (IS_ERR(new)) + return PTR_ERR(new); + + spin_lock_bh(&cs->lock); + idx = cs->primary_idx; + switch (pkr->slot) { + case OVPN_KEY_SLOT_PRIMARY: + old = rcu_replace_pointer(cs->slots[idx], new, + lockdep_is_held(&cs->lock)); + break; + case OVPN_KEY_SLOT_SECONDARY: + old = rcu_replace_pointer(cs->slots[!idx], new, + lockdep_is_held(&cs->lock)); + break; + } + spin_unlock_bh(&cs->lock); + + if (old) + ovpn_crypto_key_slot_put(old); + + return 0; +} + +void ovpn_crypto_key_slot_delete(struct ovpn_crypto_state *cs, + enum ovpn_key_slot slot) +{ + struct ovpn_crypto_key_slot *ks = NULL; + u8 idx; + + if (slot != OVPN_KEY_SLOT_PRIMARY && + slot != OVPN_KEY_SLOT_SECONDARY) { + pr_warn("Invalid slot to release: %u\n", slot); + return; + } + + spin_lock_bh(&cs->lock); + idx = cs->primary_idx; + switch (slot) { + case OVPN_KEY_SLOT_PRIMARY: + ks = rcu_replace_pointer(cs->slots[idx], NULL, + lockdep_is_held(&cs->lock)); + break; + case OVPN_KEY_SLOT_SECONDARY: + ks = rcu_replace_pointer(cs->slots[!idx], NULL, + lockdep_is_held(&cs->lock)); + break; + } + spin_unlock_bh(&cs->lock); + + if (!ks) { + pr_debug("Key slot already released: %u\n", slot); + return; + } + + pr_debug("deleting key slot %u, key_id=%u\n", slot, ks->key_id); + ovpn_crypto_key_slot_put(ks); +} + +void ovpn_crypto_key_slots_swap(struct ovpn_crypto_state *cs) +{ + const struct ovpn_crypto_key_slot *old_primary, *old_secondary; + u8 idx; + + spin_lock_bh(&cs->lock); + idx = cs->primary_idx; + old_primary = rcu_dereference_protected(cs->slots[idx], + lockdep_is_held(&cs->lock)); + old_secondary = rcu_dereference_protected(cs->slots[!idx], + lockdep_is_held(&cs->lock)); + /* perform real swap by switching the index of the primary key */ + WRITE_ONCE(cs->primary_idx, !cs->primary_idx); + + pr_debug("key swapped: (old primary) %d <-> (new primary) %d\n", + old_primary ? old_primary->key_id : -1, + old_secondary ? old_secondary->key_id : -1); + + spin_unlock_bh(&cs->lock); +} + +/** + * ovpn_crypto_config_get - populate keyconf object with non-sensible key data + * @cs: the crypto state to extract the key data from + * @slot: the specific slot to inspect + * @keyconf: the output object to populate + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_crypto_config_get(struct ovpn_crypto_state *cs, + enum ovpn_key_slot slot, + struct ovpn_key_config *keyconf) +{ + struct ovpn_crypto_key_slot *ks; + int idx; + + switch (slot) { + case OVPN_KEY_SLOT_PRIMARY: + idx = cs->primary_idx; + break; + case OVPN_KEY_SLOT_SECONDARY: + idx = !cs->primary_idx; + break; + default: + return -EINVAL; + } + + rcu_read_lock(); + ks = rcu_dereference(cs->slots[idx]); + if (!ks) { + rcu_read_unlock(); + return -ENOENT; + } + + keyconf->cipher_alg = ovpn_aead_crypto_alg(ks); + keyconf->key_id = ks->key_id; + rcu_read_unlock(); + + return 0; +} diff --git a/drivers/net/ovpn/crypto.h b/drivers/net/ovpn/crypto.h new file mode 100644 index 0000000000000..0e284fec3a75a --- /dev/null +++ b/drivers/net/ovpn/crypto.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPNCRYPTO_H_ +#define _NET_OVPN_OVPNCRYPTO_H_ + +#include "pktid.h" +#include "proto.h" + +/* info needed for both encrypt and decrypt directions */ +struct ovpn_key_direction { + const u8 *cipher_key; + size_t cipher_key_size; + const u8 *nonce_tail; /* only needed for GCM modes */ + size_t nonce_tail_size; /* only needed for GCM modes */ +}; + +/* all info for a particular symmetric key (primary or secondary) */ +struct ovpn_key_config { + enum ovpn_cipher_alg cipher_alg; + u8 key_id; + struct ovpn_key_direction encrypt; + struct ovpn_key_direction decrypt; +}; + +/* used to pass settings from netlink to the crypto engine */ +struct ovpn_peer_key_reset { + enum ovpn_key_slot slot; + struct ovpn_key_config key; +}; + +struct ovpn_crypto_key_slot { + u8 key_id; + + struct crypto_aead *encrypt; + struct crypto_aead *decrypt; + u8 nonce_tail_xmit[OVPN_NONCE_TAIL_SIZE]; + u8 nonce_tail_recv[OVPN_NONCE_TAIL_SIZE]; + + struct ovpn_pktid_recv pid_recv ____cacheline_aligned_in_smp; + struct ovpn_pktid_xmit pid_xmit ____cacheline_aligned_in_smp; + struct kref refcount; + struct rcu_head rcu; +}; + +struct ovpn_crypto_state { + struct ovpn_crypto_key_slot __rcu *slots[2]; + u8 primary_idx; + + /* protects primary and secondary slots */ + spinlock_t lock; +}; + +static inline bool ovpn_crypto_key_slot_hold(struct ovpn_crypto_key_slot *ks) +{ + return kref_get_unless_zero(&ks->refcount); +} + +static inline void ovpn_crypto_state_init(struct ovpn_crypto_state *cs) +{ + RCU_INIT_POINTER(cs->slots[0], NULL); + RCU_INIT_POINTER(cs->slots[1], NULL); + cs->primary_idx = 0; + spin_lock_init(&cs->lock); +} + +static inline struct ovpn_crypto_key_slot * +ovpn_crypto_key_id_to_slot(const struct ovpn_crypto_state *cs, u8 key_id) +{ + struct ovpn_crypto_key_slot *ks; + u8 idx; + + if (unlikely(!cs)) + return NULL; + + rcu_read_lock(); + idx = READ_ONCE(cs->primary_idx); + ks = rcu_dereference(cs->slots[idx]); + if (ks && ks->key_id == key_id) { + if (unlikely(!ovpn_crypto_key_slot_hold(ks))) + ks = NULL; + goto out; + } + + ks = rcu_dereference(cs->slots[!idx]); + if (ks && ks->key_id == key_id) { + if (unlikely(!ovpn_crypto_key_slot_hold(ks))) + ks = NULL; + goto out; + } + + /* when both key slots are occupied but no matching key ID is found, ks + * has to be reset to NULL to avoid carrying a stale pointer + */ + ks = NULL; +out: + rcu_read_unlock(); + + return ks; +} + +static inline struct ovpn_crypto_key_slot * +ovpn_crypto_key_slot_primary(const struct ovpn_crypto_state *cs) +{ + struct ovpn_crypto_key_slot *ks; + + rcu_read_lock(); + ks = rcu_dereference(cs->slots[cs->primary_idx]); + if (unlikely(ks && !ovpn_crypto_key_slot_hold(ks))) + ks = NULL; + rcu_read_unlock(); + + return ks; +} + +void ovpn_crypto_key_slot_release(struct kref *kref); + +static inline void ovpn_crypto_key_slot_put(struct ovpn_crypto_key_slot *ks) +{ + kref_put(&ks->refcount, ovpn_crypto_key_slot_release); +} + +int ovpn_crypto_state_reset(struct ovpn_crypto_state *cs, + const struct ovpn_peer_key_reset *pkr); + +void ovpn_crypto_key_slot_delete(struct ovpn_crypto_state *cs, + enum ovpn_key_slot slot); + +void ovpn_crypto_state_release(struct ovpn_crypto_state *cs); + +void ovpn_crypto_key_slots_swap(struct ovpn_crypto_state *cs); + +int ovpn_crypto_config_get(struct ovpn_crypto_state *cs, + enum ovpn_key_slot slot, + struct ovpn_key_config *keyconf); + +bool ovpn_crypto_kill_key(struct ovpn_crypto_state *cs, u8 key_id); + +#endif /* _NET_OVPN_OVPNCRYPTO_H_ */ diff --git a/drivers/net/ovpn/crypto_aead.c b/drivers/net/ovpn/crypto_aead.c new file mode 100644 index 0000000000000..74ee639ac8688 --- /dev/null +++ b/drivers/net/ovpn/crypto_aead.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "io.h" +#include "pktid.h" +#include "crypto_aead.h" +#include "crypto.h" +#include "peer.h" +#include "proto.h" +#include "skb.h" + +#define OVPN_AUTH_TAG_SIZE 16 +#define OVPN_AAD_SIZE (OVPN_OPCODE_SIZE + OVPN_NONCE_WIRE_SIZE) + +#define ALG_NAME_AES "gcm(aes)" +#define ALG_NAME_CHACHAPOLY "rfc7539(chacha20,poly1305)" + +static int ovpn_aead_encap_overhead(const struct ovpn_crypto_key_slot *ks) +{ + return OVPN_OPCODE_SIZE + /* OP header size */ + sizeof(u32) + /* Packet ID */ + crypto_aead_authsize(ks->encrypt); /* Auth Tag */ +} + +int ovpn_aead_encrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, + struct sk_buff *skb) +{ + const unsigned int tag_size = crypto_aead_authsize(ks->encrypt); + struct aead_request *req; + struct sk_buff *trailer; + struct scatterlist *sg; + int nfrags, ret; + u32 pktid, op; + u8 *iv; + + ovpn_skb_cb(skb)->peer = peer; + ovpn_skb_cb(skb)->ks = ks; + + /* Sample AEAD header format: + * 48000001 00000005 7e7046bd 444a7e28 cc6387b1 64a4d6c1 380275a... + * [ OP32 ] [seq # ] [ auth tag ] [ payload ... ] + * [4-byte + * IV head] + */ + + /* check that there's enough headroom in the skb for packet + * encapsulation + */ + if (unlikely(skb_cow_head(skb, OVPN_HEAD_ROOM))) + return -ENOBUFS; + + /* get number of skb frags and ensure that packet data is writable */ + nfrags = skb_cow_data(skb, 0, &trailer); + if (unlikely(nfrags < 0)) + return nfrags; + + if (unlikely(nfrags + 2 > (MAX_SKB_FRAGS + 2))) + return -ENOSPC; + + /* sg may be required by async crypto */ + ovpn_skb_cb(skb)->sg = kmalloc(sizeof(*ovpn_skb_cb(skb)->sg) * + (nfrags + 2), GFP_ATOMIC); + if (unlikely(!ovpn_skb_cb(skb)->sg)) + return -ENOMEM; + + sg = ovpn_skb_cb(skb)->sg; + + /* sg table: + * 0: op, wire nonce (AD, len=OVPN_OP_SIZE_V2+OVPN_NONCE_WIRE_SIZE), + * 1, 2, 3, ..., n: payload, + * n+1: auth_tag (len=tag_size) + */ + sg_init_table(sg, nfrags + 2); + + /* build scatterlist to encrypt packet payload */ + ret = skb_to_sgvec_nomark(skb, sg + 1, 0, skb->len); + if (unlikely(nfrags != ret)) + return -EINVAL; + + /* append auth_tag onto scatterlist */ + __skb_push(skb, tag_size); + sg_set_buf(sg + nfrags + 1, skb->data, tag_size); + + /* obtain packet ID, which is used both as a first + * 4 bytes of nonce and last 4 bytes of associated data. + */ + ret = ovpn_pktid_xmit_next(&ks->pid_xmit, &pktid); + if (unlikely(ret < 0)) + return ret; + + /* iv may be required by async crypto */ + ovpn_skb_cb(skb)->iv = kmalloc(OVPN_NONCE_SIZE, GFP_ATOMIC); + if (unlikely(!ovpn_skb_cb(skb)->iv)) + return -ENOMEM; + + iv = ovpn_skb_cb(skb)->iv; + + /* concat 4 bytes packet id and 8 bytes nonce tail into 12 bytes + * nonce + */ + ovpn_pktid_aead_write(pktid, ks->nonce_tail_xmit, iv); + + /* make space for packet id and push it to the front */ + __skb_push(skb, OVPN_NONCE_WIRE_SIZE); + memcpy(skb->data, iv, OVPN_NONCE_WIRE_SIZE); + + /* add packet op as head of additional data */ + op = ovpn_opcode_compose(OVPN_DATA_V2, ks->key_id, peer->id); + __skb_push(skb, OVPN_OPCODE_SIZE); + BUILD_BUG_ON(sizeof(op) != OVPN_OPCODE_SIZE); + *((__force __be32 *)skb->data) = htonl(op); + + /* AEAD Additional data */ + sg_set_buf(sg, skb->data, OVPN_AAD_SIZE); + + req = aead_request_alloc(ks->encrypt, GFP_ATOMIC); + if (unlikely(!req)) + return -ENOMEM; + + ovpn_skb_cb(skb)->req = req; + + /* setup async crypto operation */ + aead_request_set_tfm(req, ks->encrypt); + aead_request_set_callback(req, 0, ovpn_encrypt_post, skb); + aead_request_set_crypt(req, sg, sg, + skb->len - ovpn_aead_encap_overhead(ks), iv); + aead_request_set_ad(req, OVPN_AAD_SIZE); + + /* encrypt it */ + return crypto_aead_encrypt(req); +} + +int ovpn_aead_decrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, + struct sk_buff *skb) +{ + const unsigned int tag_size = crypto_aead_authsize(ks->decrypt); + int ret, payload_len, nfrags; + unsigned int payload_offset; + struct aead_request *req; + struct sk_buff *trailer; + struct scatterlist *sg; + u8 *iv; + + payload_offset = OVPN_AAD_SIZE + tag_size; + payload_len = skb->len - payload_offset; + + ovpn_skb_cb(skb)->payload_offset = payload_offset; + ovpn_skb_cb(skb)->peer = peer; + ovpn_skb_cb(skb)->ks = ks; + + /* sanity check on packet size, payload size must be >= 0 */ + if (unlikely(payload_len < 0)) + return -EINVAL; + + /* Prepare the skb data buffer to be accessed up until the auth tag. + * This is required because this area is directly mapped into the sg + * list. + */ + if (unlikely(!pskb_may_pull(skb, payload_offset))) + return -ENODATA; + + /* get number of skb frags and ensure that packet data is writable */ + nfrags = skb_cow_data(skb, 0, &trailer); + if (unlikely(nfrags < 0)) + return nfrags; + + if (unlikely(nfrags + 2 > (MAX_SKB_FRAGS + 2))) + return -ENOSPC; + + /* sg may be required by async crypto */ + ovpn_skb_cb(skb)->sg = kmalloc(sizeof(*ovpn_skb_cb(skb)->sg) * + (nfrags + 2), GFP_ATOMIC); + if (unlikely(!ovpn_skb_cb(skb)->sg)) + return -ENOMEM; + + sg = ovpn_skb_cb(skb)->sg; + + /* sg table: + * 0: op, wire nonce (AD, len=OVPN_OPCODE_SIZE+OVPN_NONCE_WIRE_SIZE), + * 1, 2, 3, ..., n: payload, + * n+1: auth_tag (len=tag_size) + */ + sg_init_table(sg, nfrags + 2); + + /* packet op is head of additional data */ + sg_set_buf(sg, skb->data, OVPN_AAD_SIZE); + + /* build scatterlist to decrypt packet payload */ + ret = skb_to_sgvec_nomark(skb, sg + 1, payload_offset, payload_len); + if (unlikely(nfrags != ret)) + return -EINVAL; + + /* append auth_tag onto scatterlist */ + sg_set_buf(sg + nfrags + 1, skb->data + OVPN_AAD_SIZE, tag_size); + + /* iv may be required by async crypto */ + ovpn_skb_cb(skb)->iv = kmalloc(OVPN_NONCE_SIZE, GFP_ATOMIC); + if (unlikely(!ovpn_skb_cb(skb)->iv)) + return -ENOMEM; + + iv = ovpn_skb_cb(skb)->iv; + + /* copy nonce into IV buffer */ + memcpy(iv, skb->data + OVPN_OPCODE_SIZE, OVPN_NONCE_WIRE_SIZE); + memcpy(iv + OVPN_NONCE_WIRE_SIZE, ks->nonce_tail_recv, + OVPN_NONCE_TAIL_SIZE); + + req = aead_request_alloc(ks->decrypt, GFP_ATOMIC); + if (unlikely(!req)) + return -ENOMEM; + + ovpn_skb_cb(skb)->req = req; + + /* setup async crypto operation */ + aead_request_set_tfm(req, ks->decrypt); + aead_request_set_callback(req, 0, ovpn_decrypt_post, skb); + aead_request_set_crypt(req, sg, sg, payload_len + tag_size, iv); + + aead_request_set_ad(req, OVPN_AAD_SIZE); + + /* decrypt it */ + return crypto_aead_decrypt(req); +} + +/* Initialize a struct crypto_aead object */ +static struct crypto_aead *ovpn_aead_init(const char *title, + const char *alg_name, + const unsigned char *key, + unsigned int keylen) +{ + struct crypto_aead *aead; + int ret; + + aead = crypto_alloc_aead(alg_name, 0, 0); + if (IS_ERR(aead)) { + ret = PTR_ERR(aead); + pr_err("%s crypto_alloc_aead failed, err=%d\n", title, ret); + aead = NULL; + goto error; + } + + ret = crypto_aead_setkey(aead, key, keylen); + if (ret) { + pr_err("%s crypto_aead_setkey size=%u failed, err=%d\n", title, + keylen, ret); + goto error; + } + + ret = crypto_aead_setauthsize(aead, OVPN_AUTH_TAG_SIZE); + if (ret) { + pr_err("%s crypto_aead_setauthsize failed, err=%d\n", title, + ret); + goto error; + } + + /* basic AEAD assumption */ + if (crypto_aead_ivsize(aead) != OVPN_NONCE_SIZE) { + pr_err("%s IV size must be %d\n", title, OVPN_NONCE_SIZE); + ret = -EINVAL; + goto error; + } + + pr_debug("********* Cipher %s (%s)\n", alg_name, title); + pr_debug("*** IV size=%u\n", crypto_aead_ivsize(aead)); + pr_debug("*** req size=%u\n", crypto_aead_reqsize(aead)); + pr_debug("*** block size=%u\n", crypto_aead_blocksize(aead)); + pr_debug("*** auth size=%u\n", crypto_aead_authsize(aead)); + pr_debug("*** alignmask=0x%x\n", crypto_aead_alignmask(aead)); + + return aead; + +error: + crypto_free_aead(aead); + return ERR_PTR(ret); +} + +void ovpn_aead_crypto_key_slot_destroy(struct ovpn_crypto_key_slot *ks) +{ + if (!ks) + return; + + crypto_free_aead(ks->encrypt); + crypto_free_aead(ks->decrypt); + kfree(ks); +} + +struct ovpn_crypto_key_slot * +ovpn_aead_crypto_key_slot_new(const struct ovpn_key_config *kc) +{ + struct ovpn_crypto_key_slot *ks = NULL; + const char *alg_name; + int ret; + + /* validate crypto alg */ + switch (kc->cipher_alg) { + case OVPN_CIPHER_ALG_AES_GCM: + alg_name = ALG_NAME_AES; + break; + case OVPN_CIPHER_ALG_CHACHA20_POLY1305: + alg_name = ALG_NAME_CHACHAPOLY; + break; + default: + return ERR_PTR(-EOPNOTSUPP); + } + + if (kc->encrypt.nonce_tail_size != OVPN_NONCE_TAIL_SIZE || + kc->decrypt.nonce_tail_size != OVPN_NONCE_TAIL_SIZE) + return ERR_PTR(-EINVAL); + + /* build the key slot */ + ks = kmalloc(sizeof(*ks), GFP_KERNEL); + if (!ks) + return ERR_PTR(-ENOMEM); + + ks->encrypt = NULL; + ks->decrypt = NULL; + kref_init(&ks->refcount); + ks->key_id = kc->key_id; + + ks->encrypt = ovpn_aead_init("encrypt", alg_name, + kc->encrypt.cipher_key, + kc->encrypt.cipher_key_size); + if (IS_ERR(ks->encrypt)) { + ret = PTR_ERR(ks->encrypt); + ks->encrypt = NULL; + goto destroy_ks; + } + + ks->decrypt = ovpn_aead_init("decrypt", alg_name, + kc->decrypt.cipher_key, + kc->decrypt.cipher_key_size); + if (IS_ERR(ks->decrypt)) { + ret = PTR_ERR(ks->decrypt); + ks->decrypt = NULL; + goto destroy_ks; + } + + memcpy(ks->nonce_tail_xmit, kc->encrypt.nonce_tail, + OVPN_NONCE_TAIL_SIZE); + memcpy(ks->nonce_tail_recv, kc->decrypt.nonce_tail, + OVPN_NONCE_TAIL_SIZE); + + /* init packet ID generation/validation */ + ovpn_pktid_xmit_init(&ks->pid_xmit); + ovpn_pktid_recv_init(&ks->pid_recv); + + return ks; + +destroy_ks: + ovpn_aead_crypto_key_slot_destroy(ks); + return ERR_PTR(ret); +} + +enum ovpn_cipher_alg ovpn_aead_crypto_alg(struct ovpn_crypto_key_slot *ks) +{ + const char *alg_name; + + if (!ks->encrypt) + return OVPN_CIPHER_ALG_NONE; + + alg_name = crypto_tfm_alg_name(crypto_aead_tfm(ks->encrypt)); + + if (!strcmp(alg_name, ALG_NAME_AES)) + return OVPN_CIPHER_ALG_AES_GCM; + else if (!strcmp(alg_name, ALG_NAME_CHACHAPOLY)) + return OVPN_CIPHER_ALG_CHACHA20_POLY1305; + else + return OVPN_CIPHER_ALG_NONE; +} diff --git a/drivers/net/ovpn/crypto_aead.h b/drivers/net/ovpn/crypto_aead.h new file mode 100644 index 0000000000000..65a2ff3078986 --- /dev/null +++ b/drivers/net/ovpn/crypto_aead.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPNAEAD_H_ +#define _NET_OVPN_OVPNAEAD_H_ + +#include "crypto.h" + +#include +#include + +int ovpn_aead_encrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, + struct sk_buff *skb); +int ovpn_aead_decrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, + struct sk_buff *skb); + +struct ovpn_crypto_key_slot * +ovpn_aead_crypto_key_slot_new(const struct ovpn_key_config *kc); +void ovpn_aead_crypto_key_slot_destroy(struct ovpn_crypto_key_slot *ks); + +enum ovpn_cipher_alg ovpn_aead_crypto_alg(struct ovpn_crypto_key_slot *ks); + +#endif /* _NET_OVPN_OVPNAEAD_H_ */ diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c new file mode 100644 index 0000000000000..dd8a8055d9676 --- /dev/null +++ b/drivers/net/ovpn/io.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "peer.h" +#include "io.h" +#include "bind.h" +#include "crypto.h" +#include "crypto_aead.h" +#include "netlink.h" +#include "proto.h" +#include "tcp.h" +#include "udp.h" +#include "skb.h" +#include "socket.h" + +const unsigned char ovpn_keepalive_message[OVPN_KEEPALIVE_SIZE] = { + 0x2a, 0x18, 0x7b, 0xf3, 0x64, 0x1e, 0xb4, 0xcb, + 0x07, 0xed, 0x2d, 0x0a, 0x98, 0x1f, 0xc7, 0x48 +}; + +/** + * ovpn_is_keepalive - check if skb contains a keepalive message + * @skb: packet to check + * + * Assumes that the first byte of skb->data is defined. + * + * Return: true if skb contains a keepalive or false otherwise + */ +static bool ovpn_is_keepalive(struct sk_buff *skb) +{ + if (*skb->data != ovpn_keepalive_message[0]) + return false; + + if (skb->len != OVPN_KEEPALIVE_SIZE) + return false; + + if (!pskb_may_pull(skb, OVPN_KEEPALIVE_SIZE)) + return false; + + return !memcmp(skb->data, ovpn_keepalive_message, OVPN_KEEPALIVE_SIZE); +} + +/* Called after decrypt to write the IP packet to the device. + * This method is expected to manage/free the skb. + */ +static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb) +{ + unsigned int pkt_len; + int ret; + + /* we can't guarantee the packet wasn't corrupted before entering the + * VPN, therefore we give other layers a chance to check that + */ + skb->ip_summed = CHECKSUM_NONE; + + /* skb hash for transport packet no longer valid after decapsulation */ + skb_clear_hash(skb); + + /* post-decrypt scrub -- prepare to inject encapsulated packet onto the + * interface, based on __skb_tunnel_rx() in dst.h + */ + skb->dev = peer->ovpn->dev; + skb_set_queue_mapping(skb, 0); + skb_scrub_packet(skb, true); + + /* network header reset in ovpn_decrypt_post() */ + skb_reset_transport_header(skb); + skb_reset_inner_headers(skb); + + /* cause packet to be "received" by the interface */ + pkt_len = skb->len; + ret = gro_cells_receive(&peer->ovpn->gro_cells, skb); + if (likely(ret == NET_RX_SUCCESS)) { + /* update RX stats with the size of decrypted packet */ + ovpn_peer_stats_increment_rx(&peer->vpn_stats, pkt_len); + dev_dstats_rx_add(peer->ovpn->dev, pkt_len); + } +} + +void ovpn_decrypt_post(void *data, int ret) +{ + struct ovpn_crypto_key_slot *ks; + unsigned int payload_offset = 0; + struct sk_buff *skb = data; + struct ovpn_socket *sock; + struct ovpn_peer *peer; + __be16 proto; + __be32 *pid; + + /* crypto is happening asynchronously. this function will be called + * again later by the crypto callback with a proper return code + */ + if (unlikely(ret == -EINPROGRESS)) + return; + + payload_offset = ovpn_skb_cb(skb)->payload_offset; + ks = ovpn_skb_cb(skb)->ks; + peer = ovpn_skb_cb(skb)->peer; + + /* crypto is done, cleanup skb CB and its members */ + kfree(ovpn_skb_cb(skb)->iv); + kfree(ovpn_skb_cb(skb)->sg); + aead_request_free(ovpn_skb_cb(skb)->req); + + if (unlikely(ret < 0)) + goto drop; + + /* PID sits after the op */ + pid = (__force __be32 *)(skb->data + OVPN_OPCODE_SIZE); + ret = ovpn_pktid_recv(&ks->pid_recv, ntohl(*pid), 0); + if (unlikely(ret < 0)) { + net_err_ratelimited("%s: PKT ID RX error for peer %u: %d\n", + netdev_name(peer->ovpn->dev), peer->id, + ret); + goto drop; + } + + /* keep track of last received authenticated packet for keepalive */ + WRITE_ONCE(peer->last_recv, ktime_get_real_seconds()); + + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (sock && sock->sock->sk->sk_protocol == IPPROTO_UDP) + /* check if this peer changed local or remote endpoint */ + ovpn_peer_endpoints_update(peer, skb); + rcu_read_unlock(); + + /* point to encapsulated IP packet */ + __skb_pull(skb, payload_offset); + + /* check if this is a valid datapacket that has to be delivered to the + * ovpn interface + */ + skb_reset_network_header(skb); + proto = ovpn_ip_check_protocol(skb); + if (unlikely(!proto)) { + /* check if null packet */ + if (unlikely(!pskb_may_pull(skb, 1))) { + net_info_ratelimited("%s: NULL packet received from peer %u\n", + netdev_name(peer->ovpn->dev), + peer->id); + goto drop; + } + + if (ovpn_is_keepalive(skb)) { + net_dbg_ratelimited("%s: ping received from peer %u\n", + netdev_name(peer->ovpn->dev), + peer->id); + /* we drop the packet, but this is not a failure */ + consume_skb(skb); + goto drop_nocount; + } + + net_info_ratelimited("%s: unsupported protocol received from peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + goto drop; + } + skb->protocol = proto; + + /* perform Reverse Path Filtering (RPF) */ + if (unlikely(!ovpn_peer_check_by_src(peer->ovpn, skb, peer))) { + if (skb->protocol == htons(ETH_P_IPV6)) + net_dbg_ratelimited("%s: RPF dropped packet from peer %u, src: %pI6c\n", + netdev_name(peer->ovpn->dev), + peer->id, &ipv6_hdr(skb)->saddr); + else + net_dbg_ratelimited("%s: RPF dropped packet from peer %u, src: %pI4\n", + netdev_name(peer->ovpn->dev), + peer->id, &ip_hdr(skb)->saddr); + goto drop; + } + + ovpn_netdev_write(peer, skb); + /* skb is passed to upper layer - don't free it */ + skb = NULL; +drop: + if (unlikely(skb)) + dev_dstats_rx_dropped(peer->ovpn->dev); + kfree_skb(skb); +drop_nocount: + if (likely(peer)) + ovpn_peer_put(peer); + if (likely(ks)) + ovpn_crypto_key_slot_put(ks); +} + +/* RX path entry point: decrypt packet and forward it to the device */ +void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb) +{ + struct ovpn_crypto_key_slot *ks; + u8 key_id; + + ovpn_peer_stats_increment_rx(&peer->link_stats, skb->len); + + /* get the key slot matching the key ID in the received packet */ + key_id = ovpn_key_id_from_skb(skb); + ks = ovpn_crypto_key_id_to_slot(&peer->crypto, key_id); + if (unlikely(!ks)) { + net_info_ratelimited("%s: no available key for peer %u, key-id: %u\n", + netdev_name(peer->ovpn->dev), peer->id, + key_id); + dev_dstats_rx_dropped(peer->ovpn->dev); + kfree_skb(skb); + ovpn_peer_put(peer); + return; + } + + memset(ovpn_skb_cb(skb), 0, sizeof(struct ovpn_cb)); + ovpn_decrypt_post(skb, ovpn_aead_decrypt(peer, ks, skb)); +} + +void ovpn_encrypt_post(void *data, int ret) +{ + struct ovpn_crypto_key_slot *ks; + struct sk_buff *skb = data; + struct ovpn_socket *sock; + struct ovpn_peer *peer; + unsigned int orig_len; + + /* encryption is happening asynchronously. This function will be + * called later by the crypto callback with a proper return value + */ + if (unlikely(ret == -EINPROGRESS)) + return; + + ks = ovpn_skb_cb(skb)->ks; + peer = ovpn_skb_cb(skb)->peer; + + /* crypto is done, cleanup skb CB and its members */ + kfree(ovpn_skb_cb(skb)->iv); + kfree(ovpn_skb_cb(skb)->sg); + aead_request_free(ovpn_skb_cb(skb)->req); + + if (unlikely(ret == -ERANGE)) { + /* we ran out of IVs and we must kill the key as it can't be + * use anymore + */ + netdev_warn(peer->ovpn->dev, + "killing key %u for peer %u\n", ks->key_id, + peer->id); + if (ovpn_crypto_kill_key(&peer->crypto, ks->key_id)) + /* let userspace know so that a new key must be negotiated */ + ovpn_nl_key_swap_notify(peer, ks->key_id); + + goto err; + } + + if (unlikely(ret < 0)) + goto err; + + skb_mark_not_on_list(skb); + orig_len = skb->len; + + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (unlikely(!sock)) + goto err_unlock; + + switch (sock->sock->sk->sk_protocol) { + case IPPROTO_UDP: + ovpn_udp_send_skb(peer, sock->sock, skb); + break; + case IPPROTO_TCP: + ovpn_tcp_send_skb(peer, sock->sock, skb); + break; + default: + /* no transport configured yet */ + goto err_unlock; + } + + ovpn_peer_stats_increment_tx(&peer->link_stats, orig_len); + /* keep track of last sent packet for keepalive */ + WRITE_ONCE(peer->last_sent, ktime_get_real_seconds()); + /* skb passed down the stack - don't free it */ + skb = NULL; +err_unlock: + rcu_read_unlock(); +err: + if (unlikely(skb)) + dev_dstats_tx_dropped(peer->ovpn->dev); + if (likely(peer)) + ovpn_peer_put(peer); + if (likely(ks)) + ovpn_crypto_key_slot_put(ks); + kfree_skb(skb); +} + +static bool ovpn_encrypt_one(struct ovpn_peer *peer, struct sk_buff *skb) +{ + struct ovpn_crypto_key_slot *ks; + + /* get primary key to be used for encrypting data */ + ks = ovpn_crypto_key_slot_primary(&peer->crypto); + if (unlikely(!ks)) + return false; + + /* take a reference to the peer because the crypto code may run async. + * ovpn_encrypt_post() will release it upon completion + */ + if (unlikely(!ovpn_peer_hold(peer))) { + DEBUG_NET_WARN_ON_ONCE(1); + ovpn_crypto_key_slot_put(ks); + return false; + } + + memset(ovpn_skb_cb(skb), 0, sizeof(struct ovpn_cb)); + ovpn_encrypt_post(skb, ovpn_aead_encrypt(peer, ks, skb)); + return true; +} + +/* send skb to connected peer, if any */ +static void ovpn_send(struct ovpn_priv *ovpn, struct sk_buff *skb, + struct ovpn_peer *peer) +{ + struct sk_buff *curr, *next; + + /* this might be a GSO-segmented skb list: process each skb + * independently + */ + skb_list_walk_safe(skb, curr, next) { + if (unlikely(!ovpn_encrypt_one(peer, curr))) { + dev_dstats_tx_dropped(ovpn->dev); + kfree_skb(curr); + } + } + + ovpn_peer_put(peer); +} + +/* Send user data to the network + */ +netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ovpn_priv *ovpn = netdev_priv(dev); + struct sk_buff *segments, *curr, *next; + struct sk_buff_head skb_list; + struct ovpn_peer *peer; + __be16 proto; + int ret; + + /* reset netfilter state */ + nf_reset_ct(skb); + + /* verify IP header size in network packet */ + proto = ovpn_ip_check_protocol(skb); + if (unlikely(!proto || skb->protocol != proto)) + goto drop; + + if (skb_is_gso(skb)) { + segments = skb_gso_segment(skb, 0); + if (IS_ERR(segments)) { + ret = PTR_ERR(segments); + net_err_ratelimited("%s: cannot segment payload packet: %d\n", + netdev_name(dev), ret); + goto drop; + } + + consume_skb(skb); + skb = segments; + } + + /* from this moment on, "skb" might be a list */ + + __skb_queue_head_init(&skb_list); + skb_list_walk_safe(skb, curr, next) { + skb_mark_not_on_list(curr); + + curr = skb_share_check(curr, GFP_ATOMIC); + if (unlikely(!curr)) { + net_err_ratelimited("%s: skb_share_check failed for payload packet\n", + netdev_name(dev)); + dev_dstats_tx_dropped(ovpn->dev); + continue; + } + + __skb_queue_tail(&skb_list, curr); + } + skb_list.prev->next = NULL; + + /* retrieve peer serving the destination IP of this packet */ + peer = ovpn_peer_get_by_dst(ovpn, skb); + if (unlikely(!peer)) { + net_dbg_ratelimited("%s: no peer to send data to\n", + netdev_name(ovpn->dev)); + goto drop; + } + + ovpn_peer_stats_increment_tx(&peer->vpn_stats, skb->len); + ovpn_send(ovpn, skb_list.next, peer); + + return NETDEV_TX_OK; + +drop: + dev_dstats_tx_dropped(ovpn->dev); + skb_tx_error(skb); + kfree_skb_list(skb); + return NET_XMIT_DROP; +} + +/** + * ovpn_xmit_special - encrypt and transmit an out-of-band message to peer + * @peer: peer to send the message to + * @data: message content + * @len: message length + * + * Assumes that caller holds a reference to peer, which will be + * passed to ovpn_send() + */ +void ovpn_xmit_special(struct ovpn_peer *peer, const void *data, + const unsigned int len) +{ + struct ovpn_priv *ovpn; + struct sk_buff *skb; + + ovpn = peer->ovpn; + if (unlikely(!ovpn)) { + ovpn_peer_put(peer); + return; + } + + skb = alloc_skb(256 + len, GFP_ATOMIC); + if (unlikely(!skb)) { + ovpn_peer_put(peer); + return; + } + + skb_reserve(skb, 128); + skb->priority = TC_PRIO_BESTEFFORT; + __skb_put_data(skb, data, len); + + ovpn_send(ovpn, skb, peer); +} diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h new file mode 100644 index 0000000000000..db9e10f9077c4 --- /dev/null +++ b/drivers/net/ovpn/io.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPN_H_ +#define _NET_OVPN_OVPN_H_ + +/* DATA_V2 header size with AEAD encryption */ +#define OVPN_HEAD_ROOM (OVPN_OPCODE_SIZE + OVPN_NONCE_WIRE_SIZE + \ + 16 /* AEAD TAG length */ + \ + max(sizeof(struct udphdr), sizeof(struct tcphdr)) +\ + max(sizeof(struct ipv6hdr), sizeof(struct iphdr))) + +/* max padding required by encryption */ +#define OVPN_MAX_PADDING 16 + +#define OVPN_KEEPALIVE_SIZE 16 +extern const unsigned char ovpn_keepalive_message[OVPN_KEEPALIVE_SIZE]; + +netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev); + +void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb); +void ovpn_xmit_special(struct ovpn_peer *peer, const void *data, + const unsigned int len); + +void ovpn_encrypt_post(void *data, int ret); +void ovpn_decrypt_post(void *data, int ret); + +#endif /* _NET_OVPN_OVPN_H_ */ diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c new file mode 100644 index 0000000000000..0acb0934c1bea --- /dev/null +++ b/drivers/net/ovpn/main.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + * James Yonan + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "netlink.h" +#include "io.h" +#include "peer.h" +#include "proto.h" +#include "tcp.h" +#include "udp.h" + +static void ovpn_priv_free(struct net_device *net) +{ + struct ovpn_priv *ovpn = netdev_priv(net); + + kfree(ovpn->peers); +} + +static int ovpn_mp_alloc(struct ovpn_priv *ovpn) +{ + struct in_device *dev_v4; + int i; + + if (ovpn->mode != OVPN_MODE_MP) + return 0; + + dev_v4 = __in_dev_get_rtnl(ovpn->dev); + if (dev_v4) { + /* disable redirects as Linux gets confused by ovpn + * handling same-LAN routing. + * This happens because a multipeer interface is used as + * relay point between hosts in the same subnet, while + * in a classic LAN this would not be needed because the + * two hosts would be able to talk directly. + */ + IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false); + IPV4_DEVCONF_ALL(dev_net(ovpn->dev), SEND_REDIRECTS) = false; + } + + /* the peer container is fairly large, therefore we allocate it only in + * MP mode + */ + ovpn->peers = kzalloc(sizeof(*ovpn->peers), GFP_KERNEL); + if (!ovpn->peers) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(ovpn->peers->by_id); i++) { + INIT_HLIST_HEAD(&ovpn->peers->by_id[i]); + INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_vpn_addr4[i], i); + INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_vpn_addr6[i], i); + INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_transp_addr[i], i); + } + + return 0; +} + +static int ovpn_net_init(struct net_device *dev) +{ + struct ovpn_priv *ovpn = netdev_priv(dev); + int err = gro_cells_init(&ovpn->gro_cells, dev); + + if (err < 0) + return err; + + err = ovpn_mp_alloc(ovpn); + if (err < 0) { + gro_cells_destroy(&ovpn->gro_cells); + return err; + } + + return 0; +} + +static void ovpn_net_uninit(struct net_device *dev) +{ + struct ovpn_priv *ovpn = netdev_priv(dev); + + gro_cells_destroy(&ovpn->gro_cells); +} + +static const struct net_device_ops ovpn_netdev_ops = { + .ndo_init = ovpn_net_init, + .ndo_uninit = ovpn_net_uninit, + .ndo_start_xmit = ovpn_net_xmit, +}; + +static const struct device_type ovpn_type = { + .name = OVPN_FAMILY_NAME, +}; + +static const struct nla_policy ovpn_policy[IFLA_OVPN_MAX + 1] = { + [IFLA_OVPN_MODE] = NLA_POLICY_RANGE(NLA_U8, OVPN_MODE_P2P, + OVPN_MODE_MP), +}; + +/** + * ovpn_dev_is_valid - check if the netdevice is of type 'ovpn' + * @dev: the interface to check + * + * Return: whether the netdevice is of type 'ovpn' + */ +bool ovpn_dev_is_valid(const struct net_device *dev) +{ + return dev->netdev_ops == &ovpn_netdev_ops; +} + +static void ovpn_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strscpy(info->driver, "ovpn", sizeof(info->driver)); + strscpy(info->bus_info, "ovpn", sizeof(info->bus_info)); +} + +static const struct ethtool_ops ovpn_ethtool_ops = { + .get_drvinfo = ovpn_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_ts_info = ethtool_op_get_ts_info, +}; + +static void ovpn_setup(struct net_device *dev) +{ + netdev_features_t feat = NETIF_F_SG | NETIF_F_GSO | + NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA; + + dev->needs_free_netdev = true; + + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; + + dev->ethtool_ops = &ovpn_ethtool_ops; + dev->netdev_ops = &ovpn_netdev_ops; + + dev->priv_destructor = ovpn_priv_free; + + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->mtu = ETH_DATA_LEN - OVPN_HEAD_ROOM; + dev->min_mtu = IPV4_MIN_MTU; + dev->max_mtu = IP_MAX_MTU - OVPN_HEAD_ROOM; + + dev->type = ARPHRD_NONE; + dev->flags = IFF_POINTOPOINT | IFF_NOARP; + dev->priv_flags |= IFF_NO_QUEUE; + + dev->lltx = true; + dev->features |= feat; + dev->hw_features |= feat; + dev->hw_enc_features |= feat; + + dev->needed_headroom = ALIGN(OVPN_HEAD_ROOM, 4); + dev->needed_tailroom = OVPN_MAX_PADDING; + + SET_NETDEV_DEVTYPE(dev, &ovpn_type); +} + +static int ovpn_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, + struct netlink_ext_ack *extack) +{ + struct ovpn_priv *ovpn = netdev_priv(dev); + struct nlattr **data = params->data; + enum ovpn_mode mode = OVPN_MODE_P2P; + + if (data && data[IFLA_OVPN_MODE]) { + mode = nla_get_u8(data[IFLA_OVPN_MODE]); + netdev_dbg(dev, "setting device mode: %u\n", mode); + } + + ovpn->dev = dev; + ovpn->mode = mode; + spin_lock_init(&ovpn->lock); + INIT_DELAYED_WORK(&ovpn->keepalive_work, ovpn_peer_keepalive_work); + + /* Set carrier explicitly after registration, this way state is + * clearly defined. + * + * In case of MP interfaces we keep the carrier always on. + * + * Carrier for P2P interfaces is initially off and it is then + * switched on and off when the remote peer is added or deleted. + */ + if (ovpn->mode == OVPN_MODE_MP) + netif_carrier_on(dev); + else + netif_carrier_off(dev); + + return register_netdevice(dev); +} + +static void ovpn_dellink(struct net_device *dev, struct list_head *head) +{ + struct ovpn_priv *ovpn = netdev_priv(dev); + + cancel_delayed_work_sync(&ovpn->keepalive_work); + ovpn_peers_free(ovpn, NULL, OVPN_DEL_PEER_REASON_TEARDOWN); + unregister_netdevice_queue(dev, head); +} + +static int ovpn_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ovpn_priv *ovpn = netdev_priv(dev); + + if (nla_put_u8(skb, IFLA_OVPN_MODE, ovpn->mode)) + return -EMSGSIZE; + + return 0; +} + +static struct rtnl_link_ops ovpn_link_ops = { + .kind = "ovpn", + .netns_refund = false, + .priv_size = sizeof(struct ovpn_priv), + .setup = ovpn_setup, + .policy = ovpn_policy, + .maxtype = IFLA_OVPN_MAX, + .newlink = ovpn_newlink, + .dellink = ovpn_dellink, + .fill_info = ovpn_fill_info, +}; + +static int __init ovpn_init(void) +{ + int err = rtnl_link_register(&ovpn_link_ops); + + if (err) { + pr_err("ovpn: can't register rtnl link ops: %d\n", err); + return err; + } + + err = ovpn_nl_register(); + if (err) { + pr_err("ovpn: can't register netlink family: %d\n", err); + goto unreg_rtnl; + } + + ovpn_tcp_init(); + + return 0; + +unreg_rtnl: + rtnl_link_unregister(&ovpn_link_ops); + return err; +} + +static __exit void ovpn_cleanup(void) +{ + ovpn_nl_unregister(); + rtnl_link_unregister(&ovpn_link_ops); + + rcu_barrier(); +} + +module_init(ovpn_init); +module_exit(ovpn_cleanup); + +MODULE_DESCRIPTION("OpenVPN data channel offload (ovpn)"); +MODULE_AUTHOR("Antonio Quartulli "); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/ovpn/main.h b/drivers/net/ovpn/main.h new file mode 100644 index 0000000000000..017cd01007659 --- /dev/null +++ b/drivers/net/ovpn/main.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#ifndef _NET_OVPN_MAIN_H_ +#define _NET_OVPN_MAIN_H_ + +bool ovpn_dev_is_valid(const struct net_device *dev); + +#endif /* _NET_OVPN_MAIN_H_ */ diff --git a/drivers/net/ovpn/netlink-gen.c b/drivers/net/ovpn/netlink-gen.c new file mode 100644 index 0000000000000..58e1a4342378e --- /dev/null +++ b/drivers/net/ovpn/netlink-gen.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ovpn.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "netlink-gen.h" + +#include + +/* Integer value ranges */ +static const struct netlink_range_validation ovpn_a_peer_id_range = { + .max = 16777215ULL, +}; + +static const struct netlink_range_validation ovpn_a_keyconf_peer_id_range = { + .max = 16777215ULL, +}; + +/* Common nested types */ +const struct nla_policy ovpn_keyconf_nl_policy[OVPN_A_KEYCONF_DECRYPT_DIR + 1] = { + [OVPN_A_KEYCONF_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_keyconf_peer_id_range), + [OVPN_A_KEYCONF_SLOT] = NLA_POLICY_MAX(NLA_U32, 1), + [OVPN_A_KEYCONF_KEY_ID] = NLA_POLICY_MAX(NLA_U32, 7), + [OVPN_A_KEYCONF_CIPHER_ALG] = NLA_POLICY_MAX(NLA_U32, 2), + [OVPN_A_KEYCONF_ENCRYPT_DIR] = NLA_POLICY_NESTED(ovpn_keydir_nl_policy), + [OVPN_A_KEYCONF_DECRYPT_DIR] = NLA_POLICY_NESTED(ovpn_keydir_nl_policy), +}; + +const struct nla_policy ovpn_keydir_nl_policy[OVPN_A_KEYDIR_NONCE_TAIL + 1] = { + [OVPN_A_KEYDIR_CIPHER_KEY] = NLA_POLICY_MAX_LEN(256), + [OVPN_A_KEYDIR_NONCE_TAIL] = NLA_POLICY_EXACT_LEN(OVPN_NONCE_TAIL_SIZE), +}; + +const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1] = { + [OVPN_A_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_id_range), + [OVPN_A_PEER_REMOTE_IPV4] = { .type = NLA_BE32, }, + [OVPN_A_PEER_REMOTE_IPV6] = NLA_POLICY_EXACT_LEN(16), + [OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID] = { .type = NLA_U32, }, + [OVPN_A_PEER_REMOTE_PORT] = NLA_POLICY_MIN(NLA_BE16, 1), + [OVPN_A_PEER_SOCKET] = { .type = NLA_U32, }, + [OVPN_A_PEER_SOCKET_NETNSID] = { .type = NLA_S32, }, + [OVPN_A_PEER_VPN_IPV4] = { .type = NLA_BE32, }, + [OVPN_A_PEER_VPN_IPV6] = NLA_POLICY_EXACT_LEN(16), + [OVPN_A_PEER_LOCAL_IPV4] = { .type = NLA_BE32, }, + [OVPN_A_PEER_LOCAL_IPV6] = NLA_POLICY_EXACT_LEN(16), + [OVPN_A_PEER_LOCAL_PORT] = NLA_POLICY_MIN(NLA_BE16, 1), + [OVPN_A_PEER_KEEPALIVE_INTERVAL] = { .type = NLA_U32, }, + [OVPN_A_PEER_KEEPALIVE_TIMEOUT] = { .type = NLA_U32, }, + [OVPN_A_PEER_DEL_REASON] = NLA_POLICY_MAX(NLA_U32, 4), + [OVPN_A_PEER_VPN_RX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_VPN_TX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_VPN_RX_PACKETS] = { .type = NLA_UINT, }, + [OVPN_A_PEER_VPN_TX_PACKETS] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_RX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_TX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_RX_PACKETS] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_TX_PACKETS] = { .type = NLA_UINT, }, +}; + +/* OVPN_CMD_PEER_NEW - do */ +static const struct nla_policy ovpn_peer_new_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_PEER_SET - do */ +static const struct nla_policy ovpn_peer_set_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_PEER_GET - do */ +static const struct nla_policy ovpn_peer_get_do_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_PEER_GET - dump */ +static const struct nla_policy ovpn_peer_get_dump_nl_policy[OVPN_A_IFINDEX + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, +}; + +/* OVPN_CMD_PEER_DEL - do */ +static const struct nla_policy ovpn_peer_del_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_KEY_NEW - do */ +static const struct nla_policy ovpn_key_new_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* OVPN_CMD_KEY_GET - do */ +static const struct nla_policy ovpn_key_get_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* OVPN_CMD_KEY_SWAP - do */ +static const struct nla_policy ovpn_key_swap_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* OVPN_CMD_KEY_DEL - do */ +static const struct nla_policy ovpn_key_del_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* Ops table for ovpn */ +static const struct genl_split_ops ovpn_nl_ops[] = { + { + .cmd = OVPN_CMD_PEER_NEW, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_new_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_new_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_PEER_SET, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_set_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_set_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_PEER_GET, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_get_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_get_do_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_PEER_GET, + .dumpit = ovpn_nl_peer_get_dumpit, + .policy = ovpn_peer_get_dump_nl_policy, + .maxattr = OVPN_A_IFINDEX, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP, + }, + { + .cmd = OVPN_CMD_PEER_DEL, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_del_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_del_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_NEW, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_new_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_new_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_GET, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_get_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_get_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_SWAP, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_swap_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_swap_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_DEL, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_del_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_del_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group ovpn_nl_mcgrps[] = { + [OVPN_NLGRP_PEERS] = { "peers", }, +}; + +struct genl_family ovpn_nl_family __ro_after_init = { + .name = OVPN_FAMILY_NAME, + .version = OVPN_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = ovpn_nl_ops, + .n_split_ops = ARRAY_SIZE(ovpn_nl_ops), + .mcgrps = ovpn_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(ovpn_nl_mcgrps), +}; diff --git a/drivers/net/ovpn/netlink-gen.h b/drivers/net/ovpn/netlink-gen.h new file mode 100644 index 0000000000000..66a4e4a0a055b --- /dev/null +++ b/drivers/net/ovpn/netlink-gen.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ovpn.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_OVPN_GEN_H +#define _LINUX_OVPN_GEN_H + +#include +#include + +#include + +/* Common nested types */ +extern const struct nla_policy ovpn_keyconf_nl_policy[OVPN_A_KEYCONF_DECRYPT_DIR + 1]; +extern const struct nla_policy ovpn_keydir_nl_policy[OVPN_A_KEYDIR_NONCE_TAIL + 1]; +extern const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1]; + +int ovpn_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +void +ovpn_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int ovpn_nl_peer_new_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_peer_set_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_peer_get_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_peer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_new_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_get_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_swap_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + OVPN_NLGRP_PEERS, +}; + +extern struct genl_family ovpn_nl_family; + +#endif /* _LINUX_OVPN_GEN_H */ diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c new file mode 100644 index 0000000000000..bea03913bfb1e --- /dev/null +++ b/drivers/net/ovpn/netlink.c @@ -0,0 +1,1258 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#include +#include +#include + +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "netlink.h" +#include "netlink-gen.h" +#include "bind.h" +#include "crypto.h" +#include "peer.h" +#include "socket.h" + +MODULE_ALIAS_GENL_FAMILY(OVPN_FAMILY_NAME); + +/** + * ovpn_get_dev_from_attrs - retrieve the ovpn private data from the netdevice + * a netlink message is targeting + * @net: network namespace where to look for the interface + * @info: generic netlink info from the user request + * @tracker: tracker object to be used for the netdev reference acquisition + * + * Return: the ovpn private data, if found, or an error otherwise + */ +static struct ovpn_priv * +ovpn_get_dev_from_attrs(struct net *net, const struct genl_info *info, + netdevice_tracker *tracker) +{ + struct ovpn_priv *ovpn; + struct net_device *dev; + int ifindex; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_IFINDEX)) + return ERR_PTR(-EINVAL); + + ifindex = nla_get_u32(info->attrs[OVPN_A_IFINDEX]); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); + NL_SET_ERR_MSG_MOD(info->extack, + "ifindex does not match any interface"); + return ERR_PTR(-ENODEV); + } + + if (!ovpn_dev_is_valid(dev)) { + rcu_read_unlock(); + NL_SET_ERR_MSG_MOD(info->extack, + "specified interface is not ovpn"); + NL_SET_BAD_ATTR(info->extack, info->attrs[OVPN_A_IFINDEX]); + return ERR_PTR(-EINVAL); + } + + ovpn = netdev_priv(dev); + netdev_hold(dev, tracker, GFP_ATOMIC); + rcu_read_unlock(); + + return ovpn; +} + +int ovpn_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + netdevice_tracker *tracker = (netdevice_tracker *)&info->user_ptr[1]; + struct ovpn_priv *ovpn = ovpn_get_dev_from_attrs(genl_info_net(info), + info, tracker); + + if (IS_ERR(ovpn)) + return PTR_ERR(ovpn); + + info->user_ptr[0] = ovpn; + + return 0; +} + +void ovpn_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + netdevice_tracker *tracker = (netdevice_tracker *)&info->user_ptr[1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + + if (ovpn) + netdev_put(ovpn->dev, tracker); +} + +static bool ovpn_nl_attr_sockaddr_remote(struct nlattr **attrs, + struct sockaddr_storage *ss) +{ + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + struct in6_addr *in6; + __be16 port = 0; + __be32 *in; + + ss->ss_family = AF_UNSPEC; + + if (attrs[OVPN_A_PEER_REMOTE_PORT]) + port = nla_get_be16(attrs[OVPN_A_PEER_REMOTE_PORT]); + + if (attrs[OVPN_A_PEER_REMOTE_IPV4]) { + ss->ss_family = AF_INET; + in = nla_data(attrs[OVPN_A_PEER_REMOTE_IPV4]); + } else if (attrs[OVPN_A_PEER_REMOTE_IPV6]) { + ss->ss_family = AF_INET6; + in6 = nla_data(attrs[OVPN_A_PEER_REMOTE_IPV6]); + } else { + return false; + } + + switch (ss->ss_family) { + case AF_INET6: + /* If this is a regular IPv6 just break and move on, + * otherwise switch to AF_INET and extract the IPv4 accordingly + */ + if (!ipv6_addr_v4mapped(in6)) { + sin6 = (struct sockaddr_in6 *)ss; + sin6->sin6_port = port; + memcpy(&sin6->sin6_addr, in6, sizeof(*in6)); + break; + } + + /* v4-mapped-v6 address */ + ss->ss_family = AF_INET; + in = &in6->s6_addr32[3]; + fallthrough; + case AF_INET: + sin = (struct sockaddr_in *)ss; + sin->sin_port = port; + sin->sin_addr.s_addr = *in; + break; + } + + return true; +} + +static u8 *ovpn_nl_attr_local_ip(struct nlattr **attrs) +{ + u8 *addr6; + + if (!attrs[OVPN_A_PEER_LOCAL_IPV4] && !attrs[OVPN_A_PEER_LOCAL_IPV6]) + return NULL; + + if (attrs[OVPN_A_PEER_LOCAL_IPV4]) + return nla_data(attrs[OVPN_A_PEER_LOCAL_IPV4]); + + addr6 = nla_data(attrs[OVPN_A_PEER_LOCAL_IPV6]); + /* this is an IPv4-mapped IPv6 address, therefore extract the actual + * v4 address from the last 4 bytes + */ + if (ipv6_addr_v4mapped((struct in6_addr *)addr6)) + return addr6 + 12; + + return addr6; +} + +static sa_family_t ovpn_nl_family_get(struct nlattr *addr4, + struct nlattr *addr6) +{ + if (addr4) + return AF_INET; + + if (addr6) { + if (ipv6_addr_v4mapped((struct in6_addr *)nla_data(addr6))) + return AF_INET; + return AF_INET6; + } + + return AF_UNSPEC; +} + +static int ovpn_nl_peer_precheck(struct ovpn_priv *ovpn, + struct genl_info *info, + struct nlattr **attrs) +{ + sa_family_t local_fam, remote_fam; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, + OVPN_A_PEER_ID)) + return -EINVAL; + + if (attrs[OVPN_A_PEER_REMOTE_IPV4] && attrs[OVPN_A_PEER_REMOTE_IPV6]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify both remote IPv4 or IPv6 address"); + return -EINVAL; + } + + if (!attrs[OVPN_A_PEER_REMOTE_IPV4] && + !attrs[OVPN_A_PEER_REMOTE_IPV6] && attrs[OVPN_A_PEER_REMOTE_PORT]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify remote port without IP address"); + return -EINVAL; + } + + if ((attrs[OVPN_A_PEER_REMOTE_IPV4] || + attrs[OVPN_A_PEER_REMOTE_IPV6]) && + !attrs[OVPN_A_PEER_REMOTE_PORT]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify remote IP address without port"); + return -EINVAL; + } + + if (!attrs[OVPN_A_PEER_REMOTE_IPV4] && + attrs[OVPN_A_PEER_LOCAL_IPV4]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify local IPv4 address without remote"); + return -EINVAL; + } + + if (!attrs[OVPN_A_PEER_REMOTE_IPV6] && + attrs[OVPN_A_PEER_LOCAL_IPV6]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify local IPV6 address without remote"); + return -EINVAL; + } + + /* check that local and remote address families are the same even + * after parsing v4mapped IPv6 addresses. + * (if addresses are not provided, family will be AF_UNSPEC and + * the check is skipped) + */ + local_fam = ovpn_nl_family_get(attrs[OVPN_A_PEER_LOCAL_IPV4], + attrs[OVPN_A_PEER_LOCAL_IPV6]); + remote_fam = ovpn_nl_family_get(attrs[OVPN_A_PEER_REMOTE_IPV4], + attrs[OVPN_A_PEER_REMOTE_IPV6]); + if (local_fam != AF_UNSPEC && remote_fam != AF_UNSPEC && + local_fam != remote_fam) { + NL_SET_ERR_MSG_MOD(info->extack, + "mismatching local and remote address families"); + return -EINVAL; + } + + if (remote_fam != AF_INET6 && attrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify scope id without remote IPv6 address"); + return -EINVAL; + } + + /* VPN IPs are needed only in MP mode for selecting the right peer */ + if (ovpn->mode == OVPN_MODE_P2P && (attrs[OVPN_A_PEER_VPN_IPV4] || + attrs[OVPN_A_PEER_VPN_IPV6])) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "unexpected VPN IP in P2P mode"); + return -EINVAL; + } + + if ((attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL] && + !attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]) || + (!attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL] && + attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT])) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "keepalive interval and timeout are required together"); + return -EINVAL; + } + + return 0; +} + +/** + * ovpn_nl_peer_modify - modify the peer attributes according to the incoming msg + * @peer: the peer to modify + * @info: generic netlink info from the user request + * @attrs: the attributes from the user request + * + * Return: a negative error code in case of failure, 0 on success or 1 on + * success and the VPN IPs have been modified (requires rehashing in MP + * mode) + */ +static int ovpn_nl_peer_modify(struct ovpn_peer *peer, struct genl_info *info, + struct nlattr **attrs) +{ + struct sockaddr_storage ss = {}; + void *local_ip = NULL; + u32 interv, timeout; + bool rehash = false; + int ret; + + spin_lock_bh(&peer->lock); + + if (ovpn_nl_attr_sockaddr_remote(attrs, &ss)) { + /* we carry the local IP in a generic container. + * ovpn_peer_reset_sockaddr() will properly interpret it + * based on ss.ss_family + */ + local_ip = ovpn_nl_attr_local_ip(attrs); + + /* set peer sockaddr */ + ret = ovpn_peer_reset_sockaddr(peer, &ss, local_ip); + if (ret < 0) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot set peer sockaddr: %d", + ret); + goto err_unlock; + } + dst_cache_reset(&peer->dst_cache); + } + + if (attrs[OVPN_A_PEER_VPN_IPV4]) { + rehash = true; + peer->vpn_addrs.ipv4.s_addr = + nla_get_in_addr(attrs[OVPN_A_PEER_VPN_IPV4]); + } + + if (attrs[OVPN_A_PEER_VPN_IPV6]) { + rehash = true; + peer->vpn_addrs.ipv6 = + nla_get_in6_addr(attrs[OVPN_A_PEER_VPN_IPV6]); + } + + /* when setting the keepalive, both parameters have to be configured */ + if (attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL] && + attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]) { + interv = nla_get_u32(attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL]); + timeout = nla_get_u32(attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]); + ovpn_peer_keepalive_set(peer, interv, timeout); + } + + netdev_dbg(peer->ovpn->dev, + "modify peer id=%u endpoint=%pIScp VPN-IPv4=%pI4 VPN-IPv6=%pI6c\n", + peer->id, &ss, + &peer->vpn_addrs.ipv4.s_addr, &peer->vpn_addrs.ipv6); + + spin_unlock_bh(&peer->lock); + + return rehash ? 1 : 0; +err_unlock: + spin_unlock_bh(&peer->lock); + return ret; +} + +int ovpn_nl_peer_new_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_socket *ovpn_sock; + struct socket *sock = NULL; + struct ovpn_peer *peer; + u32 sockfd, peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], + ovpn_peer_nl_policy, info->extack); + if (ret) + return ret; + + ret = ovpn_nl_peer_precheck(ovpn, info, attrs); + if (ret < 0) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, + OVPN_A_PEER_SOCKET)) + return -EINVAL; + + /* in MP mode VPN IPs are required for selecting the right peer */ + if (ovpn->mode == OVPN_MODE_MP && !attrs[OVPN_A_PEER_VPN_IPV4] && + !attrs[OVPN_A_PEER_VPN_IPV6]) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "VPN IP must be provided in MP mode"); + return -EINVAL; + } + + peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); + peer = ovpn_peer_new(ovpn, peer_id); + if (IS_ERR(peer)) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot create new peer object for peer %u: %ld", + peer_id, PTR_ERR(peer)); + return PTR_ERR(peer); + } + + /* lookup the fd in the kernel table and extract the socket object */ + sockfd = nla_get_u32(attrs[OVPN_A_PEER_SOCKET]); + /* sockfd_lookup() increases sock's refcounter */ + sock = sockfd_lookup(sockfd, &ret); + if (!sock) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot lookup peer socket (fd=%u): %d", + sockfd, ret); + ret = -ENOTSOCK; + goto peer_release; + } + + /* Only when using UDP as transport protocol the remote endpoint + * can be configured so that ovpn knows where to send packets to. + */ + if (sock->sk->sk_protocol == IPPROTO_UDP && + !attrs[OVPN_A_PEER_REMOTE_IPV4] && + !attrs[OVPN_A_PEER_REMOTE_IPV6]) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "missing remote IP address for UDP socket"); + sockfd_put(sock); + ret = -EINVAL; + goto peer_release; + } + + /* In case of TCP, the socket is connected to the peer and ovpn + * will just send bytes over it, without the need to specify a + * destination. + */ + if (sock->sk->sk_protocol == IPPROTO_TCP && + (attrs[OVPN_A_PEER_REMOTE_IPV4] || + attrs[OVPN_A_PEER_REMOTE_IPV6])) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "unexpected remote IP address with TCP socket"); + sockfd_put(sock); + ret = -EINVAL; + goto peer_release; + } + + ovpn_sock = ovpn_socket_new(sock, peer); + /* at this point we unconditionally drop the reference to the socket: + * - in case of error, the socket has to be dropped + * - if case of success, the socket is configured and let + * userspace own the reference, so that the latter can + * trigger the final close() + */ + sockfd_put(sock); + if (IS_ERR(ovpn_sock)) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot encapsulate socket: %ld", + PTR_ERR(ovpn_sock)); + ret = -ENOTSOCK; + goto peer_release; + } + + rcu_assign_pointer(peer->sock, ovpn_sock); + + ret = ovpn_nl_peer_modify(peer, info, attrs); + if (ret < 0) + goto sock_release; + + ret = ovpn_peer_add(ovpn, peer); + if (ret < 0) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot add new peer (id=%u) to hashtable: %d", + peer->id, ret); + goto sock_release; + } + + return 0; + +sock_release: + ovpn_socket_release(peer); +peer_release: + /* release right away because peer was not yet hashed, thus it is not + * used in any context + */ + ovpn_peer_release(peer); + + return ret; +} + +int ovpn_nl_peer_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_socket *sock; + struct ovpn_peer *peer; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], + ovpn_peer_nl_policy, info->extack); + if (ret) + return ret; + + ret = ovpn_nl_peer_precheck(ovpn, info, attrs); + if (ret < 0) + return ret; + + if (attrs[OVPN_A_PEER_SOCKET]) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "socket cannot be modified"); + return -EINVAL; + } + + peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot find peer with id %u", peer_id); + return -ENOENT; + } + + /* when using a TCP socket the remote IP is not expected */ + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (sock && sock->sock->sk->sk_protocol == IPPROTO_TCP && + (attrs[OVPN_A_PEER_REMOTE_IPV4] || + attrs[OVPN_A_PEER_REMOTE_IPV6])) { + rcu_read_unlock(); + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "unexpected remote IP address with TCP socket"); + ovpn_peer_put(peer); + return -EINVAL; + } + rcu_read_unlock(); + + spin_lock_bh(&ovpn->lock); + ret = ovpn_nl_peer_modify(peer, info, attrs); + if (ret < 0) { + spin_unlock_bh(&ovpn->lock); + ovpn_peer_put(peer); + return ret; + } + + /* ret == 1 means that VPN IPv4/6 has been modified and rehashing + * is required + */ + if (ret > 0) + ovpn_peer_hash_vpn_ip(peer); + spin_unlock_bh(&ovpn->lock); + ovpn_peer_put(peer); + + return 0; +} + +static int ovpn_nl_send_peer(struct sk_buff *skb, const struct genl_info *info, + const struct ovpn_peer *peer, u32 portid, u32 seq, + int flags) +{ + const struct ovpn_bind *bind; + struct ovpn_socket *sock; + int ret = -EMSGSIZE; + struct nlattr *attr; + __be16 local_port; + void *hdr; + int id; + + hdr = genlmsg_put(skb, portid, seq, &ovpn_nl_family, flags, + OVPN_CMD_PEER_GET); + if (!hdr) + return -ENOBUFS; + + attr = nla_nest_start(skb, OVPN_A_PEER); + if (!attr) + goto err; + + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (!sock) { + ret = -EINVAL; + goto err_unlock; + } + + if (!net_eq(genl_info_net(info), sock_net(sock->sock->sk))) { + id = peernet2id_alloc(genl_info_net(info), + sock_net(sock->sock->sk), + GFP_ATOMIC); + if (nla_put_s32(skb, OVPN_A_PEER_SOCKET_NETNSID, id)) + goto err_unlock; + } + local_port = inet_sk(sock->sock->sk)->inet_sport; + rcu_read_unlock(); + + if (nla_put_u32(skb, OVPN_A_PEER_ID, peer->id)) + goto err; + + if (peer->vpn_addrs.ipv4.s_addr != htonl(INADDR_ANY)) + if (nla_put_in_addr(skb, OVPN_A_PEER_VPN_IPV4, + peer->vpn_addrs.ipv4.s_addr)) + goto err; + + if (!ipv6_addr_equal(&peer->vpn_addrs.ipv6, &in6addr_any)) + if (nla_put_in6_addr(skb, OVPN_A_PEER_VPN_IPV6, + &peer->vpn_addrs.ipv6)) + goto err; + + if (nla_put_u32(skb, OVPN_A_PEER_KEEPALIVE_INTERVAL, + peer->keepalive_interval) || + nla_put_u32(skb, OVPN_A_PEER_KEEPALIVE_TIMEOUT, + peer->keepalive_timeout)) + goto err; + + rcu_read_lock(); + bind = rcu_dereference(peer->bind); + if (bind) { + if (bind->remote.in4.sin_family == AF_INET) { + if (nla_put_in_addr(skb, OVPN_A_PEER_REMOTE_IPV4, + bind->remote.in4.sin_addr.s_addr) || + nla_put_net16(skb, OVPN_A_PEER_REMOTE_PORT, + bind->remote.in4.sin_port) || + nla_put_in_addr(skb, OVPN_A_PEER_LOCAL_IPV4, + bind->local.ipv4.s_addr)) + goto err_unlock; + } else if (bind->remote.in4.sin_family == AF_INET6) { + if (nla_put_in6_addr(skb, OVPN_A_PEER_REMOTE_IPV6, + &bind->remote.in6.sin6_addr) || + nla_put_u32(skb, OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, + bind->remote.in6.sin6_scope_id) || + nla_put_net16(skb, OVPN_A_PEER_REMOTE_PORT, + bind->remote.in6.sin6_port) || + nla_put_in6_addr(skb, OVPN_A_PEER_LOCAL_IPV6, + &bind->local.ipv6)) + goto err_unlock; + } + } + rcu_read_unlock(); + + if (nla_put_net16(skb, OVPN_A_PEER_LOCAL_PORT, local_port) || + /* VPN RX stats */ + nla_put_uint(skb, OVPN_A_PEER_VPN_RX_BYTES, + atomic64_read(&peer->vpn_stats.rx.bytes)) || + nla_put_uint(skb, OVPN_A_PEER_VPN_RX_PACKETS, + atomic64_read(&peer->vpn_stats.rx.packets)) || + /* VPN TX stats */ + nla_put_uint(skb, OVPN_A_PEER_VPN_TX_BYTES, + atomic64_read(&peer->vpn_stats.tx.bytes)) || + nla_put_uint(skb, OVPN_A_PEER_VPN_TX_PACKETS, + atomic64_read(&peer->vpn_stats.tx.packets)) || + /* link RX stats */ + nla_put_uint(skb, OVPN_A_PEER_LINK_RX_BYTES, + atomic64_read(&peer->link_stats.rx.bytes)) || + nla_put_uint(skb, OVPN_A_PEER_LINK_RX_PACKETS, + atomic64_read(&peer->link_stats.rx.packets)) || + /* link TX stats */ + nla_put_uint(skb, OVPN_A_PEER_LINK_TX_BYTES, + atomic64_read(&peer->link_stats.tx.bytes)) || + nla_put_uint(skb, OVPN_A_PEER_LINK_TX_PACKETS, + atomic64_read(&peer->link_stats.tx.packets))) + goto err; + + nla_nest_end(skb, attr); + genlmsg_end(skb, hdr); + + return 0; +err_unlock: + rcu_read_unlock(); +err: + genlmsg_cancel(skb, hdr); + return ret; +} + +int ovpn_nl_peer_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_peer *peer; + struct sk_buff *msg; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], + ovpn_peer_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, + OVPN_A_PEER_ID)) + return -EINVAL; + + peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot find peer with id %u", peer_id); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + ret = ovpn_nl_send_peer(msg, info, peer, info->snd_portid, + info->snd_seq, 0); + if (ret < 0) { + nlmsg_free(msg); + goto err; + } + + ret = genlmsg_reply(msg, info); +err: + ovpn_peer_put(peer); + return ret; +} + +int ovpn_nl_peer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct genl_info *info = genl_info_dump(cb); + int bkt, last_idx = cb->args[1], dumped = 0; + netdevice_tracker tracker; + struct ovpn_priv *ovpn; + struct ovpn_peer *peer; + + ovpn = ovpn_get_dev_from_attrs(sock_net(cb->skb->sk), info, &tracker); + if (IS_ERR(ovpn)) + return PTR_ERR(ovpn); + + if (ovpn->mode == OVPN_MODE_P2P) { + /* if we already dumped a peer it means we are done */ + if (last_idx) + goto out; + + rcu_read_lock(); + peer = rcu_dereference(ovpn->peer); + if (peer) { + if (ovpn_nl_send_peer(skb, info, peer, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) == 0) + dumped++; + } + rcu_read_unlock(); + } else { + rcu_read_lock(); + hash_for_each_rcu(ovpn->peers->by_id, bkt, peer, + hash_entry_id) { + /* skip already dumped peers that were dumped by + * previous invocations + */ + if (last_idx > 0) { + last_idx--; + continue; + } + + if (ovpn_nl_send_peer(skb, info, peer, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + break; + + /* count peers being dumped during this invocation */ + dumped++; + } + rcu_read_unlock(); + } + +out: + netdev_put(ovpn->dev, &tracker); + + /* sum up peers dumped in this message, so that at the next invocation + * we can continue from where we left + */ + cb->args[1] += dumped; + return skb->len; +} + +int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_peer *peer; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], + ovpn_peer_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, + OVPN_A_PEER_ID)) + return -EINVAL; + + peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot find peer with id %u", peer_id); + return -ENOENT; + } + + netdev_dbg(ovpn->dev, "del peer %u\n", peer->id); + ret = ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_USERSPACE); + ovpn_peer_put(peer); + + return ret; +} + +static int ovpn_nl_get_key_dir(struct genl_info *info, struct nlattr *key, + enum ovpn_cipher_alg cipher, + struct ovpn_key_direction *dir) +{ + struct nlattr *attrs[OVPN_A_KEYDIR_MAX + 1]; + int ret; + + ret = nla_parse_nested(attrs, OVPN_A_KEYDIR_MAX, key, + ovpn_keydir_nl_policy, info->extack); + if (ret) + return ret; + + switch (cipher) { + case OVPN_CIPHER_ALG_AES_GCM: + case OVPN_CIPHER_ALG_CHACHA20_POLY1305: + if (NL_REQ_ATTR_CHECK(info->extack, key, attrs, + OVPN_A_KEYDIR_CIPHER_KEY) || + NL_REQ_ATTR_CHECK(info->extack, key, attrs, + OVPN_A_KEYDIR_NONCE_TAIL)) + return -EINVAL; + + dir->cipher_key = nla_data(attrs[OVPN_A_KEYDIR_CIPHER_KEY]); + dir->cipher_key_size = nla_len(attrs[OVPN_A_KEYDIR_CIPHER_KEY]); + + /* These algorithms require a 96bit nonce, + * Construct it by combining 4-bytes packet id and + * 8-bytes nonce-tail from userspace + */ + dir->nonce_tail = nla_data(attrs[OVPN_A_KEYDIR_NONCE_TAIL]); + dir->nonce_tail_size = nla_len(attrs[OVPN_A_KEYDIR_NONCE_TAIL]); + break; + default: + NL_SET_ERR_MSG_MOD(info->extack, "unsupported cipher"); + return -EINVAL; + } + + return 0; +} + +/** + * ovpn_nl_key_new_doit - configure a new key for the specified peer + * @skb: incoming netlink message + * @info: genetlink metadata + * + * This function allows the user to install a new key in the peer crypto + * state. + * Each peer has two 'slots', namely 'primary' and 'secondary', where + * keys can be installed. The key in the 'primary' slot is used for + * encryption, while both keys can be used for decryption by matching the + * key ID carried in the incoming packet. + * + * The user is responsible for rotating keys when necessary. The user + * may fetch peer traffic statistics via netlink in order to better + * identify the right time to rotate keys. + * The renegotiation follows these steps: + * 1. a new key is computed by the user and is installed in the 'secondary' + * slot + * 2. at user discretion (usually after a predetermined time) 'primary' and + * 'secondary' contents are swapped and the new key starts being used for + * encryption, while the old key is kept around for decryption of late + * packets. + * + * Return: 0 on success or a negative error code otherwise. + */ +int ovpn_nl_key_new_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[OVPN_A_KEYCONF_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_peer_key_reset pkr; + struct ovpn_peer *peer; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, + info->attrs[OVPN_A_KEYCONF], + ovpn_keyconf_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_PEER_ID)) + return -EINVAL; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_SLOT) || + NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_KEY_ID) || + NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_CIPHER_ALG) || + NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_ENCRYPT_DIR) || + NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_DECRYPT_DIR)) + return -EINVAL; + + pkr.slot = nla_get_u32(attrs[OVPN_A_KEYCONF_SLOT]); + pkr.key.key_id = nla_get_u32(attrs[OVPN_A_KEYCONF_KEY_ID]); + pkr.key.cipher_alg = nla_get_u32(attrs[OVPN_A_KEYCONF_CIPHER_ALG]); + + ret = ovpn_nl_get_key_dir(info, attrs[OVPN_A_KEYCONF_ENCRYPT_DIR], + pkr.key.cipher_alg, &pkr.key.encrypt); + if (ret < 0) + return ret; + + ret = ovpn_nl_get_key_dir(info, attrs[OVPN_A_KEYCONF_DECRYPT_DIR], + pkr.key.cipher_alg, &pkr.key.decrypt); + if (ret < 0) + return ret; + + peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "no peer with id %u to set key for", + peer_id); + return -ENOENT; + } + + ret = ovpn_crypto_state_reset(&peer->crypto, &pkr); + if (ret < 0) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot install new key for peer %u", + peer_id); + goto out; + } + + netdev_dbg(ovpn->dev, "new key installed (id=%u) for peer %u\n", + pkr.key.key_id, peer_id); +out: + ovpn_peer_put(peer); + return ret; +} + +static int ovpn_nl_send_key(struct sk_buff *skb, const struct genl_info *info, + u32 peer_id, enum ovpn_key_slot slot, + const struct ovpn_key_config *keyconf) +{ + struct nlattr *attr; + void *hdr; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &ovpn_nl_family, + 0, OVPN_CMD_KEY_GET); + if (!hdr) + return -ENOBUFS; + + attr = nla_nest_start(skb, OVPN_A_KEYCONF); + if (!attr) + goto err; + + if (nla_put_u32(skb, OVPN_A_KEYCONF_PEER_ID, peer_id)) + goto err; + + if (nla_put_u32(skb, OVPN_A_KEYCONF_SLOT, slot) || + nla_put_u32(skb, OVPN_A_KEYCONF_KEY_ID, keyconf->key_id) || + nla_put_u32(skb, OVPN_A_KEYCONF_CIPHER_ALG, keyconf->cipher_alg)) + goto err; + + nla_nest_end(skb, attr); + genlmsg_end(skb, hdr); + + return 0; +err: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +int ovpn_nl_key_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[OVPN_A_KEYCONF_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_key_config keyconf = { 0 }; + enum ovpn_key_slot slot; + struct ovpn_peer *peer; + struct sk_buff *msg; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, + info->attrs[OVPN_A_KEYCONF], + ovpn_keyconf_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_PEER_ID)) + return -EINVAL; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_SLOT)) + return -EINVAL; + + peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot find peer with id %u", peer_id); + return -ENOENT; + } + + slot = nla_get_u32(attrs[OVPN_A_KEYCONF_SLOT]); + + ret = ovpn_crypto_config_get(&peer->crypto, slot, &keyconf); + if (ret < 0) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot extract key from slot %u for peer %u", + slot, peer_id); + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + ret = ovpn_nl_send_key(msg, info, peer->id, slot, &keyconf); + if (ret < 0) { + nlmsg_free(msg); + goto err; + } + + ret = genlmsg_reply(msg, info); +err: + ovpn_peer_put(peer); + return ret; +} + +int ovpn_nl_key_swap_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; + struct ovpn_peer *peer; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, + info->attrs[OVPN_A_KEYCONF], + ovpn_keyconf_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_PEER_ID)) + return -EINVAL; + + peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "no peer with id %u to swap keys for", + peer_id); + return -ENOENT; + } + + ovpn_crypto_key_slots_swap(&peer->crypto); + ovpn_peer_put(peer); + + return 0; +} + +int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attrs[OVPN_A_KEYCONF_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + enum ovpn_key_slot slot; + struct ovpn_peer *peer; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, + info->attrs[OVPN_A_KEYCONF], + ovpn_keyconf_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_PEER_ID)) + return -EINVAL; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_SLOT)) + return -EINVAL; + + peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); + slot = nla_get_u32(attrs[OVPN_A_KEYCONF_SLOT]); + + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "no peer with id %u to delete key for", + peer_id); + return -ENOENT; + } + + ovpn_crypto_key_slot_delete(&peer->crypto, slot); + ovpn_peer_put(peer); + + return 0; +} + +/** + * ovpn_nl_peer_del_notify - notify userspace about peer being deleted + * @peer: the peer being deleted + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_nl_peer_del_notify(struct ovpn_peer *peer) +{ + struct ovpn_socket *sock; + struct sk_buff *msg; + struct nlattr *attr; + int ret = -EMSGSIZE; + void *hdr; + + netdev_info(peer->ovpn->dev, "deleting peer with id %u, reason %d\n", + peer->id, peer->delete_reason); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &ovpn_nl_family, 0, OVPN_CMD_PEER_DEL_NTF); + if (!hdr) { + ret = -ENOBUFS; + goto err_free_msg; + } + + if (nla_put_u32(msg, OVPN_A_IFINDEX, peer->ovpn->dev->ifindex)) + goto err_cancel_msg; + + attr = nla_nest_start(msg, OVPN_A_PEER); + if (!attr) + goto err_cancel_msg; + + if (nla_put_u32(msg, OVPN_A_PEER_DEL_REASON, peer->delete_reason)) + goto err_cancel_msg; + + if (nla_put_u32(msg, OVPN_A_PEER_ID, peer->id)) + goto err_cancel_msg; + + nla_nest_end(msg, attr); + + genlmsg_end(msg, hdr); + + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (!sock) { + ret = -EINVAL; + goto err_unlock; + } + genlmsg_multicast_netns(&ovpn_nl_family, sock_net(sock->sock->sk), + msg, 0, OVPN_NLGRP_PEERS, GFP_ATOMIC); + rcu_read_unlock(); + + return 0; + +err_unlock: + rcu_read_unlock(); +err_cancel_msg: + genlmsg_cancel(msg, hdr); +err_free_msg: + nlmsg_free(msg); + return ret; +} + +/** + * ovpn_nl_key_swap_notify - notify userspace peer's key must be renewed + * @peer: the peer whose key needs to be renewed + * @key_id: the ID of the key that needs to be renewed + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_nl_key_swap_notify(struct ovpn_peer *peer, u8 key_id) +{ + struct ovpn_socket *sock; + struct nlattr *k_attr; + struct sk_buff *msg; + int ret = -EMSGSIZE; + void *hdr; + + netdev_info(peer->ovpn->dev, "peer with id %u must rekey - primary key unusable.\n", + peer->id); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &ovpn_nl_family, 0, OVPN_CMD_KEY_SWAP_NTF); + if (!hdr) { + ret = -ENOBUFS; + goto err_free_msg; + } + + if (nla_put_u32(msg, OVPN_A_IFINDEX, peer->ovpn->dev->ifindex)) + goto err_cancel_msg; + + k_attr = nla_nest_start(msg, OVPN_A_KEYCONF); + if (!k_attr) + goto err_cancel_msg; + + if (nla_put_u32(msg, OVPN_A_KEYCONF_PEER_ID, peer->id)) + goto err_cancel_msg; + + if (nla_put_u16(msg, OVPN_A_KEYCONF_KEY_ID, key_id)) + goto err_cancel_msg; + + nla_nest_end(msg, k_attr); + genlmsg_end(msg, hdr); + + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (!sock) { + ret = -EINVAL; + goto err_unlock; + } + genlmsg_multicast_netns(&ovpn_nl_family, sock_net(sock->sock->sk), + msg, 0, OVPN_NLGRP_PEERS, GFP_ATOMIC); + rcu_read_unlock(); + + return 0; +err_unlock: + rcu_read_unlock(); +err_cancel_msg: + genlmsg_cancel(msg, hdr); +err_free_msg: + nlmsg_free(msg); + return ret; +} + +/** + * ovpn_nl_register - perform any needed registration in the NL subsustem + * + * Return: 0 on success, a negative error code otherwise + */ +int __init ovpn_nl_register(void) +{ + int ret = genl_register_family(&ovpn_nl_family); + + if (ret) { + pr_err("ovpn: genl_register_family failed: %d\n", ret); + return ret; + } + + return 0; +} + +/** + * ovpn_nl_unregister - undo any module wide netlink registration + */ +void ovpn_nl_unregister(void) +{ + genl_unregister_family(&ovpn_nl_family); +} diff --git a/drivers/net/ovpn/netlink.h b/drivers/net/ovpn/netlink.h new file mode 100644 index 0000000000000..8615dfc3c4720 --- /dev/null +++ b/drivers/net/ovpn/netlink.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#ifndef _NET_OVPN_NETLINK_H_ +#define _NET_OVPN_NETLINK_H_ + +int ovpn_nl_register(void); +void ovpn_nl_unregister(void); + +int ovpn_nl_peer_del_notify(struct ovpn_peer *peer); +int ovpn_nl_key_swap_notify(struct ovpn_peer *peer, u8 key_id); + +#endif /* _NET_OVPN_NETLINK_H_ */ diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h new file mode 100644 index 0000000000000..5898f6adada7f --- /dev/null +++ b/drivers/net/ovpn/ovpnpriv.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPNSTRUCT_H_ +#define _NET_OVPN_OVPNSTRUCT_H_ + +#include +#include +#include +#include + +/** + * struct ovpn_peer_collection - container of peers for MultiPeer mode + * @by_id: table of peers index by ID + * @by_vpn_addr4: table of peers indexed by VPN IPv4 address (items can be + * rehashed on the fly due to peer IP change) + * @by_vpn_addr6: table of peers indexed by VPN IPv6 address (items can be + * rehashed on the fly due to peer IP change) + * @by_transp_addr: table of peers indexed by transport address (items can be + * rehashed on the fly due to peer IP change) + */ +struct ovpn_peer_collection { + DECLARE_HASHTABLE(by_id, 12); + struct hlist_nulls_head by_vpn_addr4[1 << 12]; + struct hlist_nulls_head by_vpn_addr6[1 << 12]; + struct hlist_nulls_head by_transp_addr[1 << 12]; +}; + +/** + * struct ovpn_priv - per ovpn interface state + * @dev: the actual netdev representing the tunnel + * @mode: device operation mode (i.e. p2p, mp, ..) + * @lock: protect this object + * @peers: data structures holding multi-peer references + * @peer: in P2P mode, this is the only remote peer + * @gro_cells: pointer to the Generic Receive Offload cell + * @keepalive_work: struct used to schedule keepalive periodic job + */ +struct ovpn_priv { + struct net_device *dev; + enum ovpn_mode mode; + spinlock_t lock; /* protect writing to the ovpn_priv object */ + struct ovpn_peer_collection *peers; + struct ovpn_peer __rcu *peer; + struct gro_cells gro_cells; + struct delayed_work keepalive_work; +}; + +#endif /* _NET_OVPN_OVPNSTRUCT_H_ */ diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c new file mode 100644 index 0000000000000..a37f89fffb02e --- /dev/null +++ b/drivers/net/ovpn/peer.c @@ -0,0 +1,1365 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "bind.h" +#include "pktid.h" +#include "crypto.h" +#include "io.h" +#include "main.h" +#include "netlink.h" +#include "peer.h" +#include "socket.h" + +static void unlock_ovpn(struct ovpn_priv *ovpn, + struct llist_head *release_list) + __releases(&ovpn->lock) +{ + struct ovpn_peer *peer; + + spin_unlock_bh(&ovpn->lock); + + llist_for_each_entry(peer, release_list->first, release_entry) { + ovpn_socket_release(peer); + ovpn_peer_put(peer); + } +} + +/** + * ovpn_peer_keepalive_set - configure keepalive values for peer + * @peer: the peer to configure + * @interval: outgoing keepalive interval + * @timeout: incoming keepalive timeout + */ +void ovpn_peer_keepalive_set(struct ovpn_peer *peer, u32 interval, u32 timeout) +{ + time64_t now = ktime_get_real_seconds(); + + netdev_dbg(peer->ovpn->dev, + "scheduling keepalive for peer %u: interval=%u timeout=%u\n", + peer->id, interval, timeout); + + peer->keepalive_interval = interval; + WRITE_ONCE(peer->last_sent, now); + peer->keepalive_xmit_exp = now + interval; + + peer->keepalive_timeout = timeout; + WRITE_ONCE(peer->last_recv, now); + peer->keepalive_recv_exp = now + timeout; + + /* now that interval and timeout have been changed, kick + * off the worker so that the next delay can be recomputed + */ + mod_delayed_work(system_wq, &peer->ovpn->keepalive_work, 0); +} + +/** + * ovpn_peer_keepalive_send - periodic worker sending keepalive packets + * @work: pointer to the work member of the related peer object + * + * NOTE: the reference to peer is not dropped because it gets inherited + * by ovpn_xmit_special() + */ +static void ovpn_peer_keepalive_send(struct work_struct *work) +{ + struct ovpn_peer *peer = container_of(work, struct ovpn_peer, + keepalive_work); + + local_bh_disable(); + ovpn_xmit_special(peer, ovpn_keepalive_message, + sizeof(ovpn_keepalive_message)); + local_bh_enable(); +} + +/** + * ovpn_peer_new - allocate and initialize a new peer object + * @ovpn: the openvpn instance inside which the peer should be created + * @id: the ID assigned to this peer + * + * Return: a pointer to the new peer on success or an error code otherwise + */ +struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id) +{ + struct ovpn_peer *peer; + int ret; + + /* alloc and init peer object */ + peer = kzalloc(sizeof(*peer), GFP_KERNEL); + if (!peer) + return ERR_PTR(-ENOMEM); + + peer->id = id; + peer->ovpn = ovpn; + + peer->vpn_addrs.ipv4.s_addr = htonl(INADDR_ANY); + peer->vpn_addrs.ipv6 = in6addr_any; + + RCU_INIT_POINTER(peer->bind, NULL); + ovpn_crypto_state_init(&peer->crypto); + spin_lock_init(&peer->lock); + kref_init(&peer->refcount); + ovpn_peer_stats_init(&peer->vpn_stats); + ovpn_peer_stats_init(&peer->link_stats); + INIT_WORK(&peer->keepalive_work, ovpn_peer_keepalive_send); + + ret = dst_cache_init(&peer->dst_cache, GFP_KERNEL); + if (ret < 0) { + netdev_err(ovpn->dev, + "cannot initialize dst cache for peer %u\n", + peer->id); + kfree(peer); + return ERR_PTR(ret); + } + + netdev_hold(ovpn->dev, &peer->dev_tracker, GFP_KERNEL); + + return peer; +} + +/** + * ovpn_peer_reset_sockaddr - recreate binding for peer + * @peer: peer to recreate the binding for + * @ss: sockaddr to use as remote endpoint for the binding + * @local_ip: local IP for the binding + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_peer_reset_sockaddr(struct ovpn_peer *peer, + const struct sockaddr_storage *ss, + const void *local_ip) +{ + struct ovpn_bind *bind; + size_t ip_len; + + lockdep_assert_held(&peer->lock); + + /* create new ovpn_bind object */ + bind = ovpn_bind_from_sockaddr(ss); + if (IS_ERR(bind)) + return PTR_ERR(bind); + + if (local_ip) { + if (ss->ss_family == AF_INET) { + ip_len = sizeof(struct in_addr); + } else if (ss->ss_family == AF_INET6) { + ip_len = sizeof(struct in6_addr); + } else { + net_dbg_ratelimited("%s: invalid family %u for remote endpoint for peer %u\n", + netdev_name(peer->ovpn->dev), + ss->ss_family, peer->id); + kfree(bind); + return -EINVAL; + } + + memcpy(&bind->local, local_ip, ip_len); + } + + /* set binding */ + ovpn_bind_reset(peer, bind); + + return 0; +} + +/* variable name __tbl2 needs to be different from __tbl1 + * in the macro below to avoid confusing clang + */ +#define ovpn_get_hash_slot(_tbl, _key, _key_len) ({ \ + typeof(_tbl) *__tbl2 = &(_tbl); \ + jhash(_key, _key_len, 0) % HASH_SIZE(*__tbl2); \ +}) + +#define ovpn_get_hash_head(_tbl, _key, _key_len) ({ \ + typeof(_tbl) *__tbl1 = &(_tbl); \ + &(*__tbl1)[ovpn_get_hash_slot(*__tbl1, _key, _key_len)];\ +}) + +/** + * ovpn_peer_endpoints_update - update remote or local endpoint for peer + * @peer: peer to update the remote endpoint for + * @skb: incoming packet to retrieve the source/destination address from + */ +void ovpn_peer_endpoints_update(struct ovpn_peer *peer, struct sk_buff *skb) +{ + struct hlist_nulls_head *nhead; + struct sockaddr_storage ss; + struct sockaddr_in6 *sa6; + bool reset_cache = false; + struct sockaddr_in *sa; + struct ovpn_bind *bind; + const void *local_ip; + size_t salen = 0; + + spin_lock_bh(&peer->lock); + bind = rcu_dereference_protected(peer->bind, + lockdep_is_held(&peer->lock)); + if (unlikely(!bind)) + goto unlock; + + switch (skb->protocol) { + case htons(ETH_P_IP): + /* float check */ + if (unlikely(!ovpn_bind_skb_src_match(bind, skb))) { + /* unconditionally save local endpoint in case + * of float, as it may have changed as well + */ + local_ip = &ip_hdr(skb)->daddr; + sa = (struct sockaddr_in *)&ss; + sa->sin_family = AF_INET; + sa->sin_addr.s_addr = ip_hdr(skb)->saddr; + sa->sin_port = udp_hdr(skb)->source; + salen = sizeof(*sa); + reset_cache = true; + break; + } + + /* if no float happened, let's double check if the local endpoint + * has changed + */ + if (unlikely(bind->local.ipv4.s_addr != ip_hdr(skb)->daddr)) { + net_dbg_ratelimited("%s: learning local IPv4 for peer %d (%pI4 -> %pI4)\n", + netdev_name(peer->ovpn->dev), + peer->id, &bind->local.ipv4.s_addr, + &ip_hdr(skb)->daddr); + bind->local.ipv4.s_addr = ip_hdr(skb)->daddr; + reset_cache = true; + } + break; + case htons(ETH_P_IPV6): + /* float check */ + if (unlikely(!ovpn_bind_skb_src_match(bind, skb))) { + /* unconditionally save local endpoint in case + * of float, as it may have changed as well + */ + local_ip = &ipv6_hdr(skb)->daddr; + sa6 = (struct sockaddr_in6 *)&ss; + sa6->sin6_family = AF_INET6; + sa6->sin6_addr = ipv6_hdr(skb)->saddr; + sa6->sin6_port = udp_hdr(skb)->source; + sa6->sin6_scope_id = ipv6_iface_scope_id(&ipv6_hdr(skb)->saddr, + skb->skb_iif); + salen = sizeof(*sa6); + reset_cache = true; + break; + } + + /* if no float happened, let's double check if the local endpoint + * has changed + */ + if (unlikely(!ipv6_addr_equal(&bind->local.ipv6, + &ipv6_hdr(skb)->daddr))) { + net_dbg_ratelimited("%s: learning local IPv6 for peer %d (%pI6c -> %pI6c\n", + netdev_name(peer->ovpn->dev), + peer->id, &bind->local.ipv6, + &ipv6_hdr(skb)->daddr); + bind->local.ipv6 = ipv6_hdr(skb)->daddr; + reset_cache = true; + } + break; + default: + goto unlock; + } + + if (unlikely(reset_cache)) + dst_cache_reset(&peer->dst_cache); + + /* if the peer did not float, we can bail out now */ + if (likely(!salen)) + goto unlock; + + if (unlikely(ovpn_peer_reset_sockaddr(peer, + (struct sockaddr_storage *)&ss, + local_ip) < 0)) + goto unlock; + + net_dbg_ratelimited("%s: peer %d floated to %pIScp", + netdev_name(peer->ovpn->dev), peer->id, &ss); + + spin_unlock_bh(&peer->lock); + + /* rehashing is required only in MP mode as P2P has one peer + * only and thus there is no hashtable + */ + if (peer->ovpn->mode == OVPN_MODE_MP) { + spin_lock_bh(&peer->ovpn->lock); + spin_lock_bh(&peer->lock); + bind = rcu_dereference_protected(peer->bind, + lockdep_is_held(&peer->lock)); + if (unlikely(!bind)) { + spin_unlock_bh(&peer->lock); + spin_unlock_bh(&peer->ovpn->lock); + return; + } + + /* This function may be invoked concurrently, therefore another + * float may have happened in parallel: perform rehashing + * using the peer->bind->remote directly as key + */ + + switch (bind->remote.in4.sin_family) { + case AF_INET: + salen = sizeof(*sa); + break; + case AF_INET6: + salen = sizeof(*sa6); + break; + } + + /* remove old hashing */ + hlist_nulls_del_init_rcu(&peer->hash_entry_transp_addr); + /* re-add with new transport address */ + nhead = ovpn_get_hash_head(peer->ovpn->peers->by_transp_addr, + &bind->remote, salen); + hlist_nulls_add_head_rcu(&peer->hash_entry_transp_addr, nhead); + spin_unlock_bh(&peer->lock); + spin_unlock_bh(&peer->ovpn->lock); + } + return; +unlock: + spin_unlock_bh(&peer->lock); +} + +/** + * ovpn_peer_release_rcu - RCU callback performing last peer release steps + * @head: RCU member of the ovpn_peer + */ +static void ovpn_peer_release_rcu(struct rcu_head *head) +{ + struct ovpn_peer *peer = container_of(head, struct ovpn_peer, rcu); + + /* this call will immediately free the dst_cache, therefore we + * perform it in the RCU callback, when all contexts are done + */ + dst_cache_destroy(&peer->dst_cache); + kfree(peer); +} + +/** + * ovpn_peer_release - release peer private members + * @peer: the peer to release + */ +void ovpn_peer_release(struct ovpn_peer *peer) +{ + ovpn_crypto_state_release(&peer->crypto); + spin_lock_bh(&peer->lock); + ovpn_bind_reset(peer, NULL); + spin_unlock_bh(&peer->lock); + call_rcu(&peer->rcu, ovpn_peer_release_rcu); + netdev_put(peer->ovpn->dev, &peer->dev_tracker); +} + +/** + * ovpn_peer_release_kref - callback for kref_put + * @kref: the kref object belonging to the peer + */ +void ovpn_peer_release_kref(struct kref *kref) +{ + struct ovpn_peer *peer = container_of(kref, struct ovpn_peer, refcount); + + ovpn_peer_release(peer); +} + +/** + * ovpn_peer_skb_to_sockaddr - fill sockaddr with skb source address + * @skb: the packet to extract data from + * @ss: the sockaddr to fill + * + * Return: sockaddr length on success or -1 otherwise + */ +static int ovpn_peer_skb_to_sockaddr(struct sk_buff *skb, + struct sockaddr_storage *ss) +{ + struct sockaddr_in6 *sa6; + struct sockaddr_in *sa4; + + switch (skb->protocol) { + case htons(ETH_P_IP): + sa4 = (struct sockaddr_in *)ss; + sa4->sin_family = AF_INET; + sa4->sin_addr.s_addr = ip_hdr(skb)->saddr; + sa4->sin_port = udp_hdr(skb)->source; + return sizeof(*sa4); + case htons(ETH_P_IPV6): + sa6 = (struct sockaddr_in6 *)ss; + sa6->sin6_family = AF_INET6; + sa6->sin6_addr = ipv6_hdr(skb)->saddr; + sa6->sin6_port = udp_hdr(skb)->source; + return sizeof(*sa6); + } + + return -1; +} + +/** + * ovpn_nexthop_from_skb4 - retrieve IPv4 nexthop for outgoing skb + * @skb: the outgoing packet + * + * Return: the IPv4 of the nexthop + */ +static __be32 ovpn_nexthop_from_skb4(struct sk_buff *skb) +{ + const struct rtable *rt = skb_rtable(skb); + + if (rt && rt->rt_uses_gateway) + return rt->rt_gw4; + + return ip_hdr(skb)->daddr; +} + +/** + * ovpn_nexthop_from_skb6 - retrieve IPv6 nexthop for outgoing skb + * @skb: the outgoing packet + * + * Return: the IPv6 of the nexthop + */ +static struct in6_addr ovpn_nexthop_from_skb6(struct sk_buff *skb) +{ + const struct rt6_info *rt = skb_rt6_info(skb); + + if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) + return ipv6_hdr(skb)->daddr; + + return rt->rt6i_gateway; +} + +/** + * ovpn_peer_get_by_vpn_addr4 - retrieve peer by its VPN IPv4 address + * @ovpn: the openvpn instance to search + * @addr: VPN IPv4 to use as search key + * + * Refcounter is not increased for the returned peer. + * + * Return: the peer if found or NULL otherwise + */ +static struct ovpn_peer *ovpn_peer_get_by_vpn_addr4(struct ovpn_priv *ovpn, + __be32 addr) +{ + struct hlist_nulls_head *nhead; + struct hlist_nulls_node *ntmp; + struct ovpn_peer *tmp; + unsigned int slot; + +begin: + slot = ovpn_get_hash_slot(ovpn->peers->by_vpn_addr4, &addr, + sizeof(addr)); + nhead = &ovpn->peers->by_vpn_addr4[slot]; + + hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, hash_entry_addr4) + if (addr == tmp->vpn_addrs.ipv4.s_addr) + return tmp; + + /* item may have moved during lookup - check nulls and restart + * if that's the case + */ + if (get_nulls_value(ntmp) != slot) + goto begin; + + return NULL; +} + +/** + * ovpn_peer_get_by_vpn_addr6 - retrieve peer by its VPN IPv6 address + * @ovpn: the openvpn instance to search + * @addr: VPN IPv6 to use as search key + * + * Refcounter is not increased for the returned peer. + * + * Return: the peer if found or NULL otherwise + */ +static struct ovpn_peer *ovpn_peer_get_by_vpn_addr6(struct ovpn_priv *ovpn, + struct in6_addr *addr) +{ + struct hlist_nulls_head *nhead; + struct hlist_nulls_node *ntmp; + struct ovpn_peer *tmp; + unsigned int slot; + +begin: + slot = ovpn_get_hash_slot(ovpn->peers->by_vpn_addr6, addr, + sizeof(*addr)); + nhead = &ovpn->peers->by_vpn_addr6[slot]; + + hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, hash_entry_addr6) + if (ipv6_addr_equal(addr, &tmp->vpn_addrs.ipv6)) + return tmp; + + /* item may have moved during lookup - check nulls and restart + * if that's the case + */ + if (get_nulls_value(ntmp) != slot) + goto begin; + + return NULL; +} + +/** + * ovpn_peer_transp_match - check if sockaddr and peer binding match + * @peer: the peer to get the binding from + * @ss: the sockaddr to match + * + * Return: true if sockaddr and binding match or false otherwise + */ +static bool ovpn_peer_transp_match(const struct ovpn_peer *peer, + const struct sockaddr_storage *ss) +{ + struct ovpn_bind *bind = rcu_dereference(peer->bind); + struct sockaddr_in6 *sa6; + struct sockaddr_in *sa4; + + if (unlikely(!bind)) + return false; + + if (ss->ss_family != bind->remote.in4.sin_family) + return false; + + switch (ss->ss_family) { + case AF_INET: + sa4 = (struct sockaddr_in *)ss; + if (sa4->sin_addr.s_addr != bind->remote.in4.sin_addr.s_addr) + return false; + if (sa4->sin_port != bind->remote.in4.sin_port) + return false; + break; + case AF_INET6: + sa6 = (struct sockaddr_in6 *)ss; + if (!ipv6_addr_equal(&sa6->sin6_addr, + &bind->remote.in6.sin6_addr)) + return false; + if (sa6->sin6_port != bind->remote.in6.sin6_port) + return false; + break; + default: + return false; + } + + return true; +} + +/** + * ovpn_peer_get_by_transp_addr_p2p - get peer by transport address in a P2P + * instance + * @ovpn: the openvpn instance to search + * @ss: the transport socket address + * + * Return: the peer if found or NULL otherwise + */ +static struct ovpn_peer * +ovpn_peer_get_by_transp_addr_p2p(struct ovpn_priv *ovpn, + struct sockaddr_storage *ss) +{ + struct ovpn_peer *tmp, *peer = NULL; + + rcu_read_lock(); + tmp = rcu_dereference(ovpn->peer); + if (likely(tmp && ovpn_peer_transp_match(tmp, ss) && + ovpn_peer_hold(tmp))) + peer = tmp; + rcu_read_unlock(); + + return peer; +} + +/** + * ovpn_peer_get_by_transp_addr - retrieve peer by transport address + * @ovpn: the openvpn instance to search + * @skb: the skb to retrieve the source transport address from + * + * Return: a pointer to the peer if found or NULL otherwise + */ +struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn, + struct sk_buff *skb) +{ + struct ovpn_peer *tmp, *peer = NULL; + struct sockaddr_storage ss = { 0 }; + struct hlist_nulls_head *nhead; + struct hlist_nulls_node *ntmp; + unsigned int slot; + ssize_t sa_len; + + sa_len = ovpn_peer_skb_to_sockaddr(skb, &ss); + if (unlikely(sa_len < 0)) + return NULL; + + if (ovpn->mode == OVPN_MODE_P2P) + return ovpn_peer_get_by_transp_addr_p2p(ovpn, &ss); + + rcu_read_lock(); +begin: + slot = ovpn_get_hash_slot(ovpn->peers->by_transp_addr, &ss, sa_len); + nhead = &ovpn->peers->by_transp_addr[slot]; + + hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, + hash_entry_transp_addr) { + if (!ovpn_peer_transp_match(tmp, &ss)) + continue; + + if (!ovpn_peer_hold(tmp)) + continue; + + peer = tmp; + break; + } + + /* item may have moved during lookup - check nulls and restart + * if that's the case + */ + if (!peer && get_nulls_value(ntmp) != slot) + goto begin; + rcu_read_unlock(); + + return peer; +} + +/** + * ovpn_peer_get_by_id_p2p - get peer by ID in a P2P instance + * @ovpn: the openvpn instance to search + * @peer_id: the ID of the peer to find + * + * Return: the peer if found or NULL otherwise + */ +static struct ovpn_peer *ovpn_peer_get_by_id_p2p(struct ovpn_priv *ovpn, + u32 peer_id) +{ + struct ovpn_peer *tmp, *peer = NULL; + + rcu_read_lock(); + tmp = rcu_dereference(ovpn->peer); + if (likely(tmp && tmp->id == peer_id && ovpn_peer_hold(tmp))) + peer = tmp; + rcu_read_unlock(); + + return peer; +} + +/** + * ovpn_peer_get_by_id - retrieve peer by ID + * @ovpn: the openvpn instance to search + * @peer_id: the unique peer identifier to match + * + * Return: a pointer to the peer if found or NULL otherwise + */ +struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_priv *ovpn, u32 peer_id) +{ + struct ovpn_peer *tmp, *peer = NULL; + struct hlist_head *head; + + if (ovpn->mode == OVPN_MODE_P2P) + return ovpn_peer_get_by_id_p2p(ovpn, peer_id); + + head = ovpn_get_hash_head(ovpn->peers->by_id, &peer_id, + sizeof(peer_id)); + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp, head, hash_entry_id) { + if (tmp->id != peer_id) + continue; + + if (!ovpn_peer_hold(tmp)) + continue; + + peer = tmp; + break; + } + rcu_read_unlock(); + + return peer; +} + +static void ovpn_peer_remove(struct ovpn_peer *peer, + enum ovpn_del_peer_reason reason, + struct llist_head *release_list) +{ + lockdep_assert_held(&peer->ovpn->lock); + + switch (peer->ovpn->mode) { + case OVPN_MODE_MP: + /* prevent double remove */ + if (hlist_unhashed(&peer->hash_entry_id)) + return; + + hlist_del_init_rcu(&peer->hash_entry_id); + hlist_nulls_del_init_rcu(&peer->hash_entry_addr4); + hlist_nulls_del_init_rcu(&peer->hash_entry_addr6); + hlist_nulls_del_init_rcu(&peer->hash_entry_transp_addr); + break; + case OVPN_MODE_P2P: + /* prevent double remove */ + if (peer != rcu_access_pointer(peer->ovpn->peer)) + return; + + RCU_INIT_POINTER(peer->ovpn->peer, NULL); + /* in P2P mode the carrier is switched off when the peer is + * deleted so that third party protocols can react accordingly + */ + netif_carrier_off(peer->ovpn->dev); + break; + } + + peer->delete_reason = reason; + ovpn_nl_peer_del_notify(peer); + + /* append to provided list for later socket release and ref drop */ + llist_add(&peer->release_entry, release_list); +} + +/** + * ovpn_peer_get_by_dst - Lookup peer to send skb to + * @ovpn: the private data representing the current VPN session + * @skb: the skb to extract the destination address from + * + * This function takes a tunnel packet and looks up the peer to send it to + * after encapsulation. The skb is expected to be the in-tunnel packet, without + * any OpenVPN related header. + * + * Assume that the IP header is accessible in the skb data. + * + * Return: the peer if found or NULL otherwise. + */ +struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_priv *ovpn, + struct sk_buff *skb) +{ + struct ovpn_peer *peer = NULL; + struct in6_addr addr6; + __be32 addr4; + + /* in P2P mode, no matter the destination, packets are always sent to + * the single peer listening on the other side + */ + if (ovpn->mode == OVPN_MODE_P2P) { + rcu_read_lock(); + peer = rcu_dereference(ovpn->peer); + if (unlikely(peer && !ovpn_peer_hold(peer))) + peer = NULL; + rcu_read_unlock(); + return peer; + } + + rcu_read_lock(); + switch (skb->protocol) { + case htons(ETH_P_IP): + addr4 = ovpn_nexthop_from_skb4(skb); + peer = ovpn_peer_get_by_vpn_addr4(ovpn, addr4); + break; + case htons(ETH_P_IPV6): + addr6 = ovpn_nexthop_from_skb6(skb); + peer = ovpn_peer_get_by_vpn_addr6(ovpn, &addr6); + break; + } + + if (unlikely(peer && !ovpn_peer_hold(peer))) + peer = NULL; + rcu_read_unlock(); + + return peer; +} + +/** + * ovpn_nexthop_from_rt4 - look up the IPv4 nexthop for the given destination + * @ovpn: the private data representing the current VPN session + * @dest: the destination to be looked up + * + * Looks up in the IPv4 system routing table the IP of the nexthop to be used + * to reach the destination passed as argument. If no nexthop can be found, the + * destination itself is returned as it probably has to be used as nexthop. + * + * Return: the IP of the next hop if found or dest itself otherwise + */ +static __be32 ovpn_nexthop_from_rt4(struct ovpn_priv *ovpn, __be32 dest) +{ + struct rtable *rt; + struct flowi4 fl = { + .daddr = dest + }; + + rt = ip_route_output_flow(dev_net(ovpn->dev), &fl, NULL); + if (IS_ERR(rt)) { + net_dbg_ratelimited("%s: no route to host %pI4\n", + netdev_name(ovpn->dev), &dest); + /* if we end up here this packet is probably going to be + * thrown away later + */ + return dest; + } + + if (!rt->rt_uses_gateway) + goto out; + + dest = rt->rt_gw4; +out: + ip_rt_put(rt); + return dest; +} + +/** + * ovpn_nexthop_from_rt6 - look up the IPv6 nexthop for the given destination + * @ovpn: the private data representing the current VPN session + * @dest: the destination to be looked up + * + * Looks up in the IPv6 system routing table the IP of the nexthop to be used + * to reach the destination passed as argument. If no nexthop can be found, the + * destination itself is returned as it probably has to be used as nexthop. + * + * Return: the IP of the next hop if found or dest itself otherwise + */ +static struct in6_addr ovpn_nexthop_from_rt6(struct ovpn_priv *ovpn, + struct in6_addr dest) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct dst_entry *entry; + struct rt6_info *rt; + struct flowi6 fl = { + .daddr = dest, + }; + + entry = ipv6_stub->ipv6_dst_lookup_flow(dev_net(ovpn->dev), NULL, &fl, + NULL); + if (IS_ERR(entry)) { + net_dbg_ratelimited("%s: no route to host %pI6c\n", + netdev_name(ovpn->dev), &dest); + /* if we end up here this packet is probably going to be + * thrown away later + */ + return dest; + } + + rt = dst_rt6_info(entry); + + if (!(rt->rt6i_flags & RTF_GATEWAY)) + goto out; + + dest = rt->rt6i_gateway; +out: + dst_release((struct dst_entry *)rt); +#endif + return dest; +} + +/** + * ovpn_peer_check_by_src - check that skb source is routed via peer + * @ovpn: the openvpn instance to search + * @skb: the packet to extract source address from + * @peer: the peer to check against the source address + * + * Return: true if the peer is matching or false otherwise + */ +bool ovpn_peer_check_by_src(struct ovpn_priv *ovpn, struct sk_buff *skb, + struct ovpn_peer *peer) +{ + bool match = false; + struct in6_addr addr6; + __be32 addr4; + + if (ovpn->mode == OVPN_MODE_P2P) { + /* in P2P mode, no matter the destination, packets are always + * sent to the single peer listening on the other side + */ + return peer == rcu_access_pointer(ovpn->peer); + } + + /* This function performs a reverse path check, therefore we now + * lookup the nexthop we would use if we wanted to route a packet + * to the source IP. If the nexthop matches the sender we know the + * latter is valid and we allow the packet to come in + */ + + switch (skb->protocol) { + case htons(ETH_P_IP): + addr4 = ovpn_nexthop_from_rt4(ovpn, ip_hdr(skb)->saddr); + rcu_read_lock(); + match = (peer == ovpn_peer_get_by_vpn_addr4(ovpn, addr4)); + rcu_read_unlock(); + break; + case htons(ETH_P_IPV6): + addr6 = ovpn_nexthop_from_rt6(ovpn, ipv6_hdr(skb)->saddr); + rcu_read_lock(); + match = (peer == ovpn_peer_get_by_vpn_addr6(ovpn, &addr6)); + rcu_read_unlock(); + break; + } + + return match; +} + +void ovpn_peer_hash_vpn_ip(struct ovpn_peer *peer) +{ + struct hlist_nulls_head *nhead; + + lockdep_assert_held(&peer->ovpn->lock); + + /* rehashing makes sense only in multipeer mode */ + if (peer->ovpn->mode != OVPN_MODE_MP) + return; + + if (peer->vpn_addrs.ipv4.s_addr != htonl(INADDR_ANY)) { + /* remove potential old hashing */ + hlist_nulls_del_init_rcu(&peer->hash_entry_addr4); + + nhead = ovpn_get_hash_head(peer->ovpn->peers->by_vpn_addr4, + &peer->vpn_addrs.ipv4, + sizeof(peer->vpn_addrs.ipv4)); + hlist_nulls_add_head_rcu(&peer->hash_entry_addr4, nhead); + } + + if (!ipv6_addr_any(&peer->vpn_addrs.ipv6)) { + /* remove potential old hashing */ + hlist_nulls_del_init_rcu(&peer->hash_entry_addr6); + + nhead = ovpn_get_hash_head(peer->ovpn->peers->by_vpn_addr6, + &peer->vpn_addrs.ipv6, + sizeof(peer->vpn_addrs.ipv6)); + hlist_nulls_add_head_rcu(&peer->hash_entry_addr6, nhead); + } +} + +/** + * ovpn_peer_add_mp - add peer to related tables in a MP instance + * @ovpn: the instance to add the peer to + * @peer: the peer to add + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_peer_add_mp(struct ovpn_priv *ovpn, struct ovpn_peer *peer) +{ + struct sockaddr_storage sa = { 0 }; + struct hlist_nulls_head *nhead; + struct sockaddr_in6 *sa6; + struct sockaddr_in *sa4; + struct ovpn_bind *bind; + struct ovpn_peer *tmp; + size_t salen; + int ret = 0; + + spin_lock_bh(&ovpn->lock); + /* do not add duplicates */ + tmp = ovpn_peer_get_by_id(ovpn, peer->id); + if (tmp) { + ovpn_peer_put(tmp); + ret = -EEXIST; + goto out; + } + + bind = rcu_dereference_protected(peer->bind, true); + /* peers connected via TCP have bind == NULL */ + if (bind) { + switch (bind->remote.in4.sin_family) { + case AF_INET: + sa4 = (struct sockaddr_in *)&sa; + + sa4->sin_family = AF_INET; + sa4->sin_addr.s_addr = bind->remote.in4.sin_addr.s_addr; + sa4->sin_port = bind->remote.in4.sin_port; + salen = sizeof(*sa4); + break; + case AF_INET6: + sa6 = (struct sockaddr_in6 *)&sa; + + sa6->sin6_family = AF_INET6; + sa6->sin6_addr = bind->remote.in6.sin6_addr; + sa6->sin6_port = bind->remote.in6.sin6_port; + salen = sizeof(*sa6); + break; + default: + ret = -EPROTONOSUPPORT; + goto out; + } + + nhead = ovpn_get_hash_head(ovpn->peers->by_transp_addr, &sa, + salen); + hlist_nulls_add_head_rcu(&peer->hash_entry_transp_addr, nhead); + } + + hlist_add_head_rcu(&peer->hash_entry_id, + ovpn_get_hash_head(ovpn->peers->by_id, &peer->id, + sizeof(peer->id))); + + ovpn_peer_hash_vpn_ip(peer); +out: + spin_unlock_bh(&ovpn->lock); + return ret; +} + +/** + * ovpn_peer_add_p2p - add peer to related tables in a P2P instance + * @ovpn: the instance to add the peer to + * @peer: the peer to add + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_peer_add_p2p(struct ovpn_priv *ovpn, struct ovpn_peer *peer) +{ + LLIST_HEAD(release_list); + struct ovpn_peer *tmp; + + spin_lock_bh(&ovpn->lock); + /* in p2p mode it is possible to have a single peer only, therefore the + * old one is released and substituted by the new one + */ + tmp = rcu_dereference_protected(ovpn->peer, + lockdep_is_held(&ovpn->lock)); + if (tmp) + ovpn_peer_remove(tmp, OVPN_DEL_PEER_REASON_TEARDOWN, + &release_list); + + rcu_assign_pointer(ovpn->peer, peer); + /* in P2P mode the carrier is switched on when the peer is added */ + netif_carrier_on(ovpn->dev); + unlock_ovpn(ovpn, &release_list); + + return 0; +} + +/** + * ovpn_peer_add - add peer to the related tables + * @ovpn: the openvpn instance the peer belongs to + * @peer: the peer object to add + * + * Assume refcounter was increased by caller + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_peer_add(struct ovpn_priv *ovpn, struct ovpn_peer *peer) +{ + switch (ovpn->mode) { + case OVPN_MODE_MP: + return ovpn_peer_add_mp(ovpn, peer); + case OVPN_MODE_P2P: + return ovpn_peer_add_p2p(ovpn, peer); + } + + return -EOPNOTSUPP; +} + +/** + * ovpn_peer_del_mp - delete peer from related tables in a MP instance + * @peer: the peer to delete + * @reason: reason why the peer was deleted (sent to userspace) + * @release_list: list where delete peer should be appended + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_peer_del_mp(struct ovpn_peer *peer, + enum ovpn_del_peer_reason reason, + struct llist_head *release_list) +{ + struct ovpn_peer *tmp; + int ret = -ENOENT; + + lockdep_assert_held(&peer->ovpn->lock); + + tmp = ovpn_peer_get_by_id(peer->ovpn, peer->id); + if (tmp == peer) { + ovpn_peer_remove(peer, reason, release_list); + ret = 0; + } + + if (tmp) + ovpn_peer_put(tmp); + + return ret; +} + +/** + * ovpn_peer_del_p2p - delete peer from related tables in a P2P instance + * @peer: the peer to delete + * @reason: reason why the peer was deleted (sent to userspace) + * @release_list: list where delete peer should be appended + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_peer_del_p2p(struct ovpn_peer *peer, + enum ovpn_del_peer_reason reason, + struct llist_head *release_list) +{ + struct ovpn_peer *tmp; + + lockdep_assert_held(&peer->ovpn->lock); + + tmp = rcu_dereference_protected(peer->ovpn->peer, + lockdep_is_held(&peer->ovpn->lock)); + if (tmp != peer) + return -ENOENT; + + ovpn_peer_remove(peer, reason, release_list); + + return 0; +} + +/** + * ovpn_peer_del - delete peer from related tables + * @peer: the peer object to delete + * @reason: reason for deleting peer (will be sent to userspace) + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason) +{ + LLIST_HEAD(release_list); + int ret = -EOPNOTSUPP; + + spin_lock_bh(&peer->ovpn->lock); + switch (peer->ovpn->mode) { + case OVPN_MODE_MP: + ret = ovpn_peer_del_mp(peer, reason, &release_list); + break; + case OVPN_MODE_P2P: + ret = ovpn_peer_del_p2p(peer, reason, &release_list); + break; + default: + break; + } + unlock_ovpn(peer->ovpn, &release_list); + + return ret; +} + +/** + * ovpn_peer_release_p2p - release peer upon P2P device teardown + * @ovpn: the instance being torn down + * @sk: if not NULL, release peer only if it's using this specific socket + * @reason: the reason for releasing the peer + */ +static void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, struct sock *sk, + enum ovpn_del_peer_reason reason) +{ + struct ovpn_socket *ovpn_sock; + LLIST_HEAD(release_list); + struct ovpn_peer *peer; + + spin_lock_bh(&ovpn->lock); + peer = rcu_dereference_protected(ovpn->peer, + lockdep_is_held(&ovpn->lock)); + if (!peer) { + spin_unlock_bh(&ovpn->lock); + return; + } + + if (sk) { + ovpn_sock = rcu_access_pointer(peer->sock); + if (!ovpn_sock || ovpn_sock->sock->sk != sk) { + spin_unlock_bh(&ovpn->lock); + ovpn_peer_put(peer); + return; + } + } + + ovpn_peer_remove(peer, reason, &release_list); + unlock_ovpn(ovpn, &release_list); +} + +static void ovpn_peers_release_mp(struct ovpn_priv *ovpn, struct sock *sk, + enum ovpn_del_peer_reason reason) +{ + struct ovpn_socket *ovpn_sock; + LLIST_HEAD(release_list); + struct ovpn_peer *peer; + struct hlist_node *tmp; + int bkt; + + spin_lock_bh(&ovpn->lock); + hash_for_each_safe(ovpn->peers->by_id, bkt, tmp, peer, hash_entry_id) { + bool remove = true; + + /* if a socket was passed as argument, skip all peers except + * those using it + */ + if (sk) { + rcu_read_lock(); + ovpn_sock = rcu_dereference(peer->sock); + remove = ovpn_sock && ovpn_sock->sock->sk == sk; + rcu_read_unlock(); + } + + if (remove) + ovpn_peer_remove(peer, reason, &release_list); + } + unlock_ovpn(ovpn, &release_list); +} + +/** + * ovpn_peers_free - free all peers in the instance + * @ovpn: the instance whose peers should be released + * @sk: if not NULL, only peers using this socket are removed and the socket + * is released immediately + * @reason: the reason for releasing all peers + */ +void ovpn_peers_free(struct ovpn_priv *ovpn, struct sock *sk, + enum ovpn_del_peer_reason reason) +{ + switch (ovpn->mode) { + case OVPN_MODE_P2P: + ovpn_peer_release_p2p(ovpn, sk, reason); + break; + case OVPN_MODE_MP: + ovpn_peers_release_mp(ovpn, sk, reason); + break; + } +} + +static time64_t ovpn_peer_keepalive_work_single(struct ovpn_peer *peer, + time64_t now, + struct llist_head *release_list) +{ + time64_t last_recv, last_sent, next_run1, next_run2; + unsigned long timeout, interval; + bool expired; + + spin_lock_bh(&peer->lock); + /* we expect both timers to be configured at the same time, + * therefore bail out if either is not set + */ + if (!peer->keepalive_timeout || !peer->keepalive_interval) { + spin_unlock_bh(&peer->lock); + return 0; + } + + /* check for peer timeout */ + expired = false; + timeout = peer->keepalive_timeout; + last_recv = READ_ONCE(peer->last_recv); + if (now < last_recv + timeout) { + peer->keepalive_recv_exp = last_recv + timeout; + next_run1 = peer->keepalive_recv_exp; + } else if (peer->keepalive_recv_exp > now) { + next_run1 = peer->keepalive_recv_exp; + } else { + expired = true; + } + + if (expired) { + /* peer is dead -> kill it and move on */ + spin_unlock_bh(&peer->lock); + netdev_dbg(peer->ovpn->dev, "peer %u expired\n", + peer->id); + ovpn_peer_remove(peer, OVPN_DEL_PEER_REASON_EXPIRED, + release_list); + return 0; + } + + /* check for peer keepalive */ + expired = false; + interval = peer->keepalive_interval; + last_sent = READ_ONCE(peer->last_sent); + if (now < last_sent + interval) { + peer->keepalive_xmit_exp = last_sent + interval; + next_run2 = peer->keepalive_xmit_exp; + } else if (peer->keepalive_xmit_exp > now) { + next_run2 = peer->keepalive_xmit_exp; + } else { + expired = true; + next_run2 = now + interval; + } + spin_unlock_bh(&peer->lock); + + if (expired) { + /* a keepalive packet is required */ + netdev_dbg(peer->ovpn->dev, + "sending keepalive to peer %u\n", + peer->id); + if (schedule_work(&peer->keepalive_work)) + ovpn_peer_hold(peer); + } + + if (next_run1 < next_run2) + return next_run1; + + return next_run2; +} + +static time64_t ovpn_peer_keepalive_work_mp(struct ovpn_priv *ovpn, + time64_t now, + struct llist_head *release_list) +{ + time64_t tmp_next_run, next_run = 0; + struct hlist_node *tmp; + struct ovpn_peer *peer; + int bkt; + + lockdep_assert_held(&ovpn->lock); + + hash_for_each_safe(ovpn->peers->by_id, bkt, tmp, peer, hash_entry_id) { + tmp_next_run = ovpn_peer_keepalive_work_single(peer, now, + release_list); + if (!tmp_next_run) + continue; + + /* the next worker run will be scheduled based on the shortest + * required interval across all peers + */ + if (!next_run || tmp_next_run < next_run) + next_run = tmp_next_run; + } + + return next_run; +} + +static time64_t ovpn_peer_keepalive_work_p2p(struct ovpn_priv *ovpn, + time64_t now, + struct llist_head *release_list) +{ + struct ovpn_peer *peer; + time64_t next_run = 0; + + lockdep_assert_held(&ovpn->lock); + + peer = rcu_dereference_protected(ovpn->peer, + lockdep_is_held(&ovpn->lock)); + if (peer) + next_run = ovpn_peer_keepalive_work_single(peer, now, + release_list); + + return next_run; +} + +/** + * ovpn_peer_keepalive_work - run keepalive logic on each known peer + * @work: pointer to the work member of the related ovpn object + * + * Each peer has two timers (if configured): + * 1. peer timeout: when no data is received for a certain interval, + * the peer is considered dead and it gets killed. + * 2. peer keepalive: when no data is sent to a certain peer for a + * certain interval, a special 'keepalive' packet is explicitly sent. + * + * This function iterates across the whole peer collection while + * checking the timers described above. + */ +void ovpn_peer_keepalive_work(struct work_struct *work) +{ + struct ovpn_priv *ovpn = container_of(work, struct ovpn_priv, + keepalive_work.work); + time64_t next_run = 0, now = ktime_get_real_seconds(); + LLIST_HEAD(release_list); + + spin_lock_bh(&ovpn->lock); + switch (ovpn->mode) { + case OVPN_MODE_MP: + next_run = ovpn_peer_keepalive_work_mp(ovpn, now, + &release_list); + break; + case OVPN_MODE_P2P: + next_run = ovpn_peer_keepalive_work_p2p(ovpn, now, + &release_list); + break; + } + + /* prevent rearming if the interface is being destroyed */ + if (next_run > 0 && + READ_ONCE(ovpn->dev->reg_state) == NETREG_REGISTERED) { + netdev_dbg(ovpn->dev, + "scheduling keepalive work: now=%llu next_run=%llu delta=%llu\n", + next_run, now, next_run - now); + schedule_delayed_work(&ovpn->keepalive_work, + (next_run - now) * HZ); + } + unlock_ovpn(ovpn, &release_list); +} diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h new file mode 100644 index 0000000000000..a1423f2b09e06 --- /dev/null +++ b/drivers/net/ovpn/peer.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPNPEER_H_ +#define _NET_OVPN_OVPNPEER_H_ + +#include +#include + +#include "crypto.h" +#include "socket.h" +#include "stats.h" + +/** + * struct ovpn_peer - the main remote peer object + * @ovpn: main openvpn instance this peer belongs to + * @dev_tracker: reference tracker for associated dev + * @id: unique identifier + * @vpn_addrs: IP addresses assigned over the tunnel + * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel + * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel + * @hash_entry_id: entry in the peer ID hashtable + * @hash_entry_addr4: entry in the peer IPv4 hashtable + * @hash_entry_addr6: entry in the peer IPv6 hashtable + * @hash_entry_transp_addr: entry in the peer transport address hashtable + * @sock: the socket being used to talk to this peer + * @tcp: keeps track of TCP specific state + * @tcp.strp: stream parser context (TCP only) + * @tcp.user_queue: received packets that have to go to userspace (TCP only) + * @tcp.out_queue: packets on hold while socket is taken by user (TCP only) + * @tcp.tx_in_progress: true if TX is already ongoing (TCP only) + * @tcp.out_msg.skb: packet scheduled for sending (TCP only) + * @tcp.out_msg.offset: offset where next send should start (TCP only) + * @tcp.out_msg.len: remaining data to send within packet (TCP only) + * @tcp.sk_cb.sk_data_ready: pointer to original cb (TCP only) + * @tcp.sk_cb.sk_write_space: pointer to original cb (TCP only) + * @tcp.sk_cb.prot: pointer to original prot object (TCP only) + * @tcp.sk_cb.ops: pointer to the original prot_ops object (TCP only) + * @crypto: the crypto configuration (ciphers, keys, etc..) + * @dst_cache: cache for dst_entry used to send to peer + * @bind: remote peer binding + * @keepalive_interval: seconds after which a new keepalive should be sent + * @keepalive_xmit_exp: future timestamp when next keepalive should be sent + * @last_sent: timestamp of the last successfully sent packet + * @keepalive_timeout: seconds after which an inactive peer is considered dead + * @keepalive_recv_exp: future timestamp when the peer should expire + * @last_recv: timestamp of the last authenticated received packet + * @vpn_stats: per-peer in-VPN TX/RX stats + * @link_stats: per-peer link/transport TX/RX stats + * @delete_reason: why peer was deleted (i.e. timeout, transport error, ..) + * @lock: protects binding to peer (bind) and keepalive* fields + * @refcount: reference counter + * @rcu: used to free peer in an RCU safe way + * @release_entry: entry for the socket release list + * @keepalive_work: used to schedule keepalive sending + */ +struct ovpn_peer { + struct ovpn_priv *ovpn; + netdevice_tracker dev_tracker; + u32 id; + struct { + struct in_addr ipv4; + struct in6_addr ipv6; + } vpn_addrs; + struct hlist_node hash_entry_id; + struct hlist_nulls_node hash_entry_addr4; + struct hlist_nulls_node hash_entry_addr6; + struct hlist_nulls_node hash_entry_transp_addr; + struct ovpn_socket __rcu *sock; + + struct { + struct strparser strp; + struct sk_buff_head user_queue; + struct sk_buff_head out_queue; + bool tx_in_progress; + + struct { + struct sk_buff *skb; + int offset; + int len; + } out_msg; + + struct { + void (*sk_data_ready)(struct sock *sk); + void (*sk_write_space)(struct sock *sk); + struct proto *prot; + const struct proto_ops *ops; + } sk_cb; + + struct work_struct defer_del_work; + } tcp; + struct ovpn_crypto_state crypto; + struct dst_cache dst_cache; + struct ovpn_bind __rcu *bind; + unsigned long keepalive_interval; + unsigned long keepalive_xmit_exp; + time64_t last_sent; + unsigned long keepalive_timeout; + unsigned long keepalive_recv_exp; + time64_t last_recv; + struct ovpn_peer_stats vpn_stats; + struct ovpn_peer_stats link_stats; + enum ovpn_del_peer_reason delete_reason; + spinlock_t lock; /* protects bind and keepalive* */ + struct kref refcount; + struct rcu_head rcu; + struct llist_node release_entry; + struct work_struct keepalive_work; +}; + +/** + * ovpn_peer_hold - increase reference counter + * @peer: the peer whose counter should be increased + * + * Return: true if the counter was increased or false if it was zero already + */ +static inline bool ovpn_peer_hold(struct ovpn_peer *peer) +{ + return kref_get_unless_zero(&peer->refcount); +} + +void ovpn_peer_release(struct ovpn_peer *peer); +void ovpn_peer_release_kref(struct kref *kref); + +/** + * ovpn_peer_put - decrease reference counter + * @peer: the peer whose counter should be decreased + */ +static inline void ovpn_peer_put(struct ovpn_peer *peer) +{ + kref_put(&peer->refcount, ovpn_peer_release_kref); +} + +struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id); +int ovpn_peer_add(struct ovpn_priv *ovpn, struct ovpn_peer *peer); +int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason); +void ovpn_peers_free(struct ovpn_priv *ovpn, struct sock *sock, + enum ovpn_del_peer_reason reason); + +struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn, + struct sk_buff *skb); +struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_priv *ovpn, u32 peer_id); +struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_priv *ovpn, + struct sk_buff *skb); +void ovpn_peer_hash_vpn_ip(struct ovpn_peer *peer); +bool ovpn_peer_check_by_src(struct ovpn_priv *ovpn, struct sk_buff *skb, + struct ovpn_peer *peer); + +void ovpn_peer_keepalive_set(struct ovpn_peer *peer, u32 interval, u32 timeout); +void ovpn_peer_keepalive_work(struct work_struct *work); + +void ovpn_peer_endpoints_update(struct ovpn_peer *peer, struct sk_buff *skb); +int ovpn_peer_reset_sockaddr(struct ovpn_peer *peer, + const struct sockaddr_storage *ss, + const void *local_ip); + +#endif /* _NET_OVPN_OVPNPEER_H_ */ diff --git a/drivers/net/ovpn/pktid.c b/drivers/net/ovpn/pktid.c new file mode 100644 index 0000000000000..2f29049897e39 --- /dev/null +++ b/drivers/net/ovpn/pktid.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + * James Yonan + */ + +#include +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "pktid.h" + +void ovpn_pktid_xmit_init(struct ovpn_pktid_xmit *pid) +{ + atomic_set(&pid->seq_num, 1); +} + +void ovpn_pktid_recv_init(struct ovpn_pktid_recv *pr) +{ + memset(pr, 0, sizeof(*pr)); + spin_lock_init(&pr->lock); +} + +/* Packet replay detection. + * Allows ID backtrack of up to REPLAY_WINDOW_SIZE - 1. + */ +int ovpn_pktid_recv(struct ovpn_pktid_recv *pr, u32 pkt_id, u32 pkt_time) +{ + const unsigned long now = jiffies; + int ret; + + /* ID must not be zero */ + if (unlikely(pkt_id == 0)) + return -EINVAL; + + spin_lock_bh(&pr->lock); + + /* expire backtracks at or below pr->id after PKTID_RECV_EXPIRE time */ + if (unlikely(time_after_eq(now, pr->expire))) + pr->id_floor = pr->id; + + /* time changed? */ + if (unlikely(pkt_time != pr->time)) { + if (pkt_time > pr->time) { + /* time moved forward, accept */ + pr->base = 0; + pr->extent = 0; + pr->id = 0; + pr->time = pkt_time; + pr->id_floor = 0; + } else { + /* time moved backward, reject */ + ret = -ETIME; + goto out; + } + } + + if (likely(pkt_id == pr->id + 1)) { + /* well-formed ID sequence (incremented by 1) */ + pr->base = REPLAY_INDEX(pr->base, -1); + pr->history[pr->base / 8] |= (1 << (pr->base % 8)); + if (pr->extent < REPLAY_WINDOW_SIZE) + ++pr->extent; + pr->id = pkt_id; + } else if (pkt_id > pr->id) { + /* ID jumped forward by more than one */ + const unsigned int delta = pkt_id - pr->id; + + if (delta < REPLAY_WINDOW_SIZE) { + unsigned int i; + + pr->base = REPLAY_INDEX(pr->base, -delta); + pr->history[pr->base / 8] |= (1 << (pr->base % 8)); + pr->extent += delta; + if (pr->extent > REPLAY_WINDOW_SIZE) + pr->extent = REPLAY_WINDOW_SIZE; + for (i = 1; i < delta; ++i) { + unsigned int newb = REPLAY_INDEX(pr->base, i); + + pr->history[newb / 8] &= ~BIT(newb % 8); + } + } else { + pr->base = 0; + pr->extent = REPLAY_WINDOW_SIZE; + memset(pr->history, 0, sizeof(pr->history)); + pr->history[0] = 1; + } + pr->id = pkt_id; + } else { + /* ID backtrack */ + const unsigned int delta = pr->id - pkt_id; + + if (delta > pr->max_backtrack) + pr->max_backtrack = delta; + if (delta < pr->extent) { + if (pkt_id > pr->id_floor) { + const unsigned int ri = REPLAY_INDEX(pr->base, + delta); + u8 *p = &pr->history[ri / 8]; + const u8 mask = (1 << (ri % 8)); + + if (*p & mask) { + ret = -EINVAL; + goto out; + } + *p |= mask; + } else { + ret = -EINVAL; + goto out; + } + } else { + ret = -EINVAL; + goto out; + } + } + + pr->expire = now + PKTID_RECV_EXPIRE; + ret = 0; +out: + spin_unlock_bh(&pr->lock); + return ret; +} diff --git a/drivers/net/ovpn/pktid.h b/drivers/net/ovpn/pktid.h new file mode 100644 index 0000000000000..0262d026d15e2 --- /dev/null +++ b/drivers/net/ovpn/pktid.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + * James Yonan + */ + +#ifndef _NET_OVPN_OVPNPKTID_H_ +#define _NET_OVPN_OVPNPKTID_H_ + +#include "proto.h" + +/* If no packets received for this length of time, set a backtrack floor + * at highest received packet ID thus far. + */ +#define PKTID_RECV_EXPIRE (30 * HZ) + +/* Packet-ID state for transmitter */ +struct ovpn_pktid_xmit { + atomic_t seq_num; +}; + +/* replay window sizing in bytes = 2^REPLAY_WINDOW_ORDER */ +#define REPLAY_WINDOW_ORDER 8 + +#define REPLAY_WINDOW_BYTES BIT(REPLAY_WINDOW_ORDER) +#define REPLAY_WINDOW_SIZE (REPLAY_WINDOW_BYTES * 8) +#define REPLAY_INDEX(base, i) (((base) + (i)) & (REPLAY_WINDOW_SIZE - 1)) + +/* Packet-ID state for receiver. + * Other than lock member, can be zeroed to initialize. + */ +struct ovpn_pktid_recv { + /* "sliding window" bitmask of recent packet IDs received */ + u8 history[REPLAY_WINDOW_BYTES]; + /* bit position of deque base in history */ + unsigned int base; + /* extent (in bits) of deque in history */ + unsigned int extent; + /* expiration of history in jiffies */ + unsigned long expire; + /* highest sequence number received */ + u32 id; + /* highest time stamp received */ + u32 time; + /* we will only accept backtrack IDs > id_floor */ + u32 id_floor; + unsigned int max_backtrack; + /* protects entire pktd ID state */ + spinlock_t lock; +}; + +/* Get the next packet ID for xmit */ +static inline int ovpn_pktid_xmit_next(struct ovpn_pktid_xmit *pid, u32 *pktid) +{ + const u32 seq_num = atomic_fetch_add_unless(&pid->seq_num, 1, 0); + /* when the 32bit space is over, we return an error because the packet + * ID is used to create the cipher IV and we do not want to reuse the + * same value more than once + */ + if (unlikely(!seq_num)) + return -ERANGE; + + *pktid = seq_num; + + return 0; +} + +/* Write 12-byte AEAD IV to dest */ +static inline void ovpn_pktid_aead_write(const u32 pktid, + const u8 nt[], + unsigned char *dest) +{ + *(__force __be32 *)(dest) = htonl(pktid); + BUILD_BUG_ON(4 + OVPN_NONCE_TAIL_SIZE != OVPN_NONCE_SIZE); + memcpy(dest + 4, nt, OVPN_NONCE_TAIL_SIZE); +} + +void ovpn_pktid_xmit_init(struct ovpn_pktid_xmit *pid); +void ovpn_pktid_recv_init(struct ovpn_pktid_recv *pr); + +int ovpn_pktid_recv(struct ovpn_pktid_recv *pr, u32 pkt_id, u32 pkt_time); + +#endif /* _NET_OVPN_OVPNPKTID_H_ */ diff --git a/drivers/net/ovpn/proto.h b/drivers/net/ovpn/proto.h new file mode 100644 index 0000000000000..b7d285b4d9c1d --- /dev/null +++ b/drivers/net/ovpn/proto.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + * James Yonan + */ + +#ifndef _NET_OVPN_PROTO_H_ +#define _NET_OVPN_PROTO_H_ + +#include "main.h" + +#include +#include + +/* When the OpenVPN protocol is ran in AEAD mode, use + * the OpenVPN packet ID as the AEAD nonce: + * + * 00000005 521c3b01 4308c041 + * [seq # ] [ nonce_tail ] + * [ 12-byte full IV ] -> OVPN_NONCE_SIZE + * [4-bytes -> OVPN_NONCE_WIRE_SIZE + * on wire] + */ + +/* nonce size (96bits) as required by AEAD ciphers */ +#define OVPN_NONCE_SIZE 12 +/* last 8 bytes of AEAD nonce: provided by userspace and usually derived + * from key material generated during TLS handshake + */ +#define OVPN_NONCE_TAIL_SIZE 8 + +/* OpenVPN nonce size reduced by 8-byte nonce tail -- this is the + * size of the AEAD Associated Data (AD) sent over the wire + * and is normally the head of the IV + */ +#define OVPN_NONCE_WIRE_SIZE (OVPN_NONCE_SIZE - OVPN_NONCE_TAIL_SIZE) + +#define OVPN_OPCODE_SIZE 4 /* DATA_V2 opcode size */ +#define OVPN_OPCODE_KEYID_MASK 0x07000000 +#define OVPN_OPCODE_PKTTYPE_MASK 0xF8000000 +#define OVPN_OPCODE_PEERID_MASK 0x00FFFFFF + +/* packet opcodes of interest to us */ +#define OVPN_DATA_V1 6 /* data channel v1 packet */ +#define OVPN_DATA_V2 9 /* data channel v2 packet */ + +#define OVPN_PEER_ID_UNDEF 0x00FFFFFF + +/** + * ovpn_opcode_from_skb - extract OP code from skb at specified offset + * @skb: the packet to extract the OP code from + * @offset: the offset in the data buffer where the OP code is located + * + * Note: this function assumes that the skb head was pulled enough + * to access the first 4 bytes. + * + * Return: the OP code + */ +static inline u8 ovpn_opcode_from_skb(const struct sk_buff *skb, u16 offset) +{ + u32 opcode = be32_to_cpu(*(__be32 *)(skb->data + offset)); + + return FIELD_GET(OVPN_OPCODE_PKTTYPE_MASK, opcode); +} + +/** + * ovpn_peer_id_from_skb - extract peer ID from skb at specified offset + * @skb: the packet to extract the OP code from + * @offset: the offset in the data buffer where the OP code is located + * + * Note: this function assumes that the skb head was pulled enough + * to access the first 4 bytes. + * + * Return: the peer ID + */ +static inline u32 ovpn_peer_id_from_skb(const struct sk_buff *skb, u16 offset) +{ + u32 opcode = be32_to_cpu(*(__be32 *)(skb->data + offset)); + + return FIELD_GET(OVPN_OPCODE_PEERID_MASK, opcode); +} + +/** + * ovpn_key_id_from_skb - extract key ID from the skb head + * @skb: the packet to extract the key ID code from + * + * Note: this function assumes that the skb head was pulled enough + * to access the first 4 bytes. + * + * Return: the key ID + */ +static inline u8 ovpn_key_id_from_skb(const struct sk_buff *skb) +{ + u32 opcode = be32_to_cpu(*(__be32 *)skb->data); + + return FIELD_GET(OVPN_OPCODE_KEYID_MASK, opcode); +} + +/** + * ovpn_opcode_compose - combine OP code, key ID and peer ID to wire format + * @opcode: the OP code + * @key_id: the key ID + * @peer_id: the peer ID + * + * Return: a 4 bytes integer obtained combining all input values following the + * OpenVPN wire format. This integer can then be written to the packet header. + */ +static inline u32 ovpn_opcode_compose(u8 opcode, u8 key_id, u32 peer_id) +{ + return FIELD_PREP(OVPN_OPCODE_PKTTYPE_MASK, opcode) | + FIELD_PREP(OVPN_OPCODE_KEYID_MASK, key_id) | + FIELD_PREP(OVPN_OPCODE_PEERID_MASK, peer_id); +} + +#endif /* _NET_OVPN_OVPNPROTO_H_ */ diff --git a/drivers/net/ovpn/skb.h b/drivers/net/ovpn/skb.h new file mode 100644 index 0000000000000..64430880f1dae --- /dev/null +++ b/drivers/net/ovpn/skb.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + * James Yonan + */ + +#ifndef _NET_OVPN_SKB_H_ +#define _NET_OVPN_SKB_H_ + +#include +#include +#include +#include +#include +#include +#include + +struct ovpn_cb { + struct ovpn_peer *peer; + struct ovpn_crypto_key_slot *ks; + struct aead_request *req; + struct scatterlist *sg; + u8 *iv; + unsigned int payload_offset; + bool nosignal; +}; + +static inline struct ovpn_cb *ovpn_skb_cb(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ovpn_cb) > sizeof(skb->cb)); + return (struct ovpn_cb *)skb->cb; +} + +/* Return IP protocol version from skb header. + * Return 0 if protocol is not IPv4/IPv6 or cannot be read. + */ +static inline __be16 ovpn_ip_check_protocol(struct sk_buff *skb) +{ + __be16 proto = 0; + + /* skb could be non-linear, + * make sure IP header is in non-fragmented part + */ + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + return 0; + + if (ip_hdr(skb)->version == 4) { + proto = htons(ETH_P_IP); + } else if (ip_hdr(skb)->version == 6) { + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + proto = htons(ETH_P_IPV6); + } + + return proto; +} + +#endif /* _NET_OVPN_SKB_H_ */ diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c new file mode 100644 index 0000000000000..a83cbab725910 --- /dev/null +++ b/drivers/net/ovpn/socket.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "io.h" +#include "peer.h" +#include "socket.h" +#include "tcp.h" +#include "udp.h" + +static void ovpn_socket_release_kref(struct kref *kref) +{ + struct ovpn_socket *sock = container_of(kref, struct ovpn_socket, + refcount); + + if (sock->sock->sk->sk_protocol == IPPROTO_UDP) + ovpn_udp_socket_detach(sock); + else if (sock->sock->sk->sk_protocol == IPPROTO_TCP) + ovpn_tcp_socket_detach(sock); +} + +/** + * ovpn_socket_put - decrease reference counter + * @peer: peer whose socket reference counter should be decreased + * @sock: the RCU protected peer socket + * + * This function is only used internally. Users willing to release + * references to the ovpn_socket should use ovpn_socket_release() + * + * Return: true if the socket was released, false otherwise + */ +static bool ovpn_socket_put(struct ovpn_peer *peer, struct ovpn_socket *sock) +{ + return kref_put(&sock->refcount, ovpn_socket_release_kref); +} + +/** + * ovpn_socket_release - release resources owned by socket user + * @peer: peer whose socket should be released + * + * This function should be invoked when the peer is being removed + * and wants to drop its link to the socket. + * + * In case of UDP, the detach routine will drop a reference to the + * ovpn netdev, pointed by the ovpn_socket. + * + * In case of TCP, releasing the socket will cause dropping + * the refcounter for the peer it is linked to, thus allowing the peer + * disappear as well. + * + * This function is expected to be invoked exactly once per peer + * + * NOTE: this function may sleep + */ +void ovpn_socket_release(struct ovpn_peer *peer) +{ + struct ovpn_socket *sock; + bool released; + + might_sleep(); + + sock = rcu_replace_pointer(peer->sock, NULL, true); + /* release may be invoked after socket was detached */ + if (!sock) + return; + + /* sanity check: we should not end up here if the socket + * was already closed + */ + if (!sock->sock->sk) { + DEBUG_NET_WARN_ON_ONCE(1); + return; + } + + /* Drop the reference while holding the sock lock to avoid + * concurrent ovpn_socket_new call to mess up with a partially + * detached socket. + * + * Holding the lock ensures that a socket with refcnt 0 is fully + * detached before it can be picked by a concurrent reader. + */ + lock_sock(sock->sock->sk); + released = ovpn_socket_put(peer, sock); + release_sock(sock->sock->sk); + + /* align all readers with sk_user_data being NULL */ + synchronize_rcu(); + + /* following cleanup should happen with lock released */ + if (released) { + if (sock->sock->sk->sk_protocol == IPPROTO_UDP) { + netdev_put(sock->ovpn->dev, &sock->dev_tracker); + } else if (sock->sock->sk->sk_protocol == IPPROTO_TCP) { + /* wait for TCP jobs to terminate */ + ovpn_tcp_socket_wait_finish(sock); + ovpn_peer_put(sock->peer); + } + /* we can call plain kfree() because we already waited one RCU + * period due to synchronize_rcu() + */ + kfree(sock); + } +} + +static bool ovpn_socket_hold(struct ovpn_socket *sock) +{ + return kref_get_unless_zero(&sock->refcount); +} + +static int ovpn_socket_attach(struct ovpn_socket *sock, struct ovpn_peer *peer) +{ + if (sock->sock->sk->sk_protocol == IPPROTO_UDP) + return ovpn_udp_socket_attach(sock, peer->ovpn); + else if (sock->sock->sk->sk_protocol == IPPROTO_TCP) + return ovpn_tcp_socket_attach(sock, peer); + + return -EOPNOTSUPP; +} + +/** + * ovpn_socket_new - create a new socket and initialize it + * @sock: the kernel socket to embed + * @peer: the peer reachable via this socket + * + * Return: an openvpn socket on success or a negative error code otherwise + */ +struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer) +{ + struct ovpn_socket *ovpn_sock; + int ret; + + lock_sock(sock->sk); + + /* a TCP socket can only be owned by a single peer, therefore there + * can't be any other user + */ + if (sock->sk->sk_protocol == IPPROTO_TCP && sock->sk->sk_user_data) { + ovpn_sock = ERR_PTR(-EBUSY); + goto sock_release; + } + + /* a UDP socket can be shared across multiple peers, but we must make + * sure it is not owned by something else + */ + if (sock->sk->sk_protocol == IPPROTO_UDP) { + u8 type = READ_ONCE(udp_sk(sock->sk)->encap_type); + + /* socket owned by other encapsulation module */ + if (type && type != UDP_ENCAP_OVPNINUDP) { + ovpn_sock = ERR_PTR(-EBUSY); + goto sock_release; + } + + rcu_read_lock(); + ovpn_sock = rcu_dereference_sk_user_data(sock->sk); + if (ovpn_sock) { + /* socket owned by another ovpn instance, we can't use it */ + if (ovpn_sock->ovpn != peer->ovpn) { + ovpn_sock = ERR_PTR(-EBUSY); + rcu_read_unlock(); + goto sock_release; + } + + /* this socket is already owned by this instance, + * therefore we can increase the refcounter and + * use it as expected + */ + if (WARN_ON(!ovpn_socket_hold(ovpn_sock))) { + /* this should never happen because setting + * the refcnt to 0 and detaching the socket + * is expected to be atomic + */ + ovpn_sock = ERR_PTR(-EAGAIN); + rcu_read_unlock(); + goto sock_release; + } + + rcu_read_unlock(); + goto sock_release; + } + rcu_read_unlock(); + } + + /* socket is not owned: attach to this ovpn instance */ + + ovpn_sock = kzalloc(sizeof(*ovpn_sock), GFP_KERNEL); + if (!ovpn_sock) { + ovpn_sock = ERR_PTR(-ENOMEM); + goto sock_release; + } + + ovpn_sock->sock = sock; + kref_init(&ovpn_sock->refcount); + + ret = ovpn_socket_attach(ovpn_sock, peer); + if (ret < 0) { + kfree(ovpn_sock); + ovpn_sock = ERR_PTR(ret); + goto sock_release; + } + + /* TCP sockets are per-peer, therefore they are linked to their unique + * peer + */ + if (sock->sk->sk_protocol == IPPROTO_TCP) { + INIT_WORK(&ovpn_sock->tcp_tx_work, ovpn_tcp_tx_work); + ovpn_sock->peer = peer; + ovpn_peer_hold(peer); + } else if (sock->sk->sk_protocol == IPPROTO_UDP) { + /* in UDP we only link the ovpn instance since the socket is + * shared among multiple peers + */ + ovpn_sock->ovpn = peer->ovpn; + netdev_hold(peer->ovpn->dev, &ovpn_sock->dev_tracker, + GFP_KERNEL); + } + + rcu_assign_sk_user_data(sock->sk, ovpn_sock); +sock_release: + release_sock(sock->sk); + return ovpn_sock; +} diff --git a/drivers/net/ovpn/socket.h b/drivers/net/ovpn/socket.h new file mode 100644 index 0000000000000..00d856b1a5d88 --- /dev/null +++ b/drivers/net/ovpn/socket.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_SOCK_H_ +#define _NET_OVPN_SOCK_H_ + +#include +#include +#include + +struct ovpn_priv; +struct ovpn_peer; + +/** + * struct ovpn_socket - a kernel socket referenced in the ovpn code + * @ovpn: ovpn instance owning this socket (UDP only) + * @dev_tracker: reference tracker for associated dev (UDP only) + * @peer: unique peer transmitting over this socket (TCP only) + * @sock: the low level sock object + * @refcount: amount of contexts currently referencing this object + * @work: member used to schedule release routine (it may block) + * @tcp_tx_work: work for deferring outgoing packet processing (TCP only) + */ +struct ovpn_socket { + union { + struct { + struct ovpn_priv *ovpn; + netdevice_tracker dev_tracker; + }; + struct ovpn_peer *peer; + }; + + struct socket *sock; + struct kref refcount; + struct work_struct work; + struct work_struct tcp_tx_work; +}; + +struct ovpn_socket *ovpn_socket_new(struct socket *sock, + struct ovpn_peer *peer); +void ovpn_socket_release(struct ovpn_peer *peer); + +#endif /* _NET_OVPN_SOCK_H_ */ diff --git a/drivers/net/ovpn/stats.c b/drivers/net/ovpn/stats.c new file mode 100644 index 0000000000000..d637143473bb9 --- /dev/null +++ b/drivers/net/ovpn/stats.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include + +#include "stats.h" + +void ovpn_peer_stats_init(struct ovpn_peer_stats *ps) +{ + atomic64_set(&ps->rx.bytes, 0); + atomic64_set(&ps->rx.packets, 0); + + atomic64_set(&ps->tx.bytes, 0); + atomic64_set(&ps->tx.packets, 0); +} diff --git a/drivers/net/ovpn/stats.h b/drivers/net/ovpn/stats.h new file mode 100644 index 0000000000000..53433d8b6c331 --- /dev/null +++ b/drivers/net/ovpn/stats.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + * Lev Stipakov + */ + +#ifndef _NET_OVPN_OVPNSTATS_H_ +#define _NET_OVPN_OVPNSTATS_H_ + +/* one stat */ +struct ovpn_peer_stat { + atomic64_t bytes; + atomic64_t packets; +}; + +/* rx and tx stats combined */ +struct ovpn_peer_stats { + struct ovpn_peer_stat rx; + struct ovpn_peer_stat tx; +}; + +void ovpn_peer_stats_init(struct ovpn_peer_stats *ps); + +static inline void ovpn_peer_stats_increment(struct ovpn_peer_stat *stat, + const unsigned int n) +{ + atomic64_add(n, &stat->bytes); + atomic64_inc(&stat->packets); +} + +static inline void ovpn_peer_stats_increment_rx(struct ovpn_peer_stats *stats, + const unsigned int n) +{ + ovpn_peer_stats_increment(&stats->rx, n); +} + +static inline void ovpn_peer_stats_increment_tx(struct ovpn_peer_stats *stats, + const unsigned int n) +{ + ovpn_peer_stats_increment(&stats->tx, n); +} + +#endif /* _NET_OVPN_OVPNSTATS_H_ */ diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c new file mode 100644 index 0000000000000..7c42d84987ad3 --- /dev/null +++ b/drivers/net/ovpn/tcp.c @@ -0,0 +1,598 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "io.h" +#include "peer.h" +#include "proto.h" +#include "skb.h" +#include "tcp.h" + +#define OVPN_TCP_DEPTH_NESTING 2 +#if OVPN_TCP_DEPTH_NESTING == SINGLE_DEPTH_NESTING +#error "OVPN TCP requires its own lockdep subclass" +#endif + +static struct proto ovpn_tcp_prot __ro_after_init; +static struct proto_ops ovpn_tcp_ops __ro_after_init; +static struct proto ovpn_tcp6_prot __ro_after_init; +static struct proto_ops ovpn_tcp6_ops __ro_after_init; + +static int ovpn_tcp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + __be16 blen; + u16 len; + int err; + + /* when packets are written to the TCP stream, they are prepended with + * two bytes indicating the actual packet size. + * Parse accordingly and return the actual size (including the size + * header) + */ + + if (skb->len < rxm->offset + 2) + return 0; + + err = skb_copy_bits(skb, rxm->offset, &blen, sizeof(blen)); + if (err < 0) + return err; + + len = be16_to_cpu(blen); + if (len < 2) + return -EINVAL; + + return len + 2; +} + +/* queue skb for sending to userspace via recvmsg on the socket */ +static void ovpn_tcp_to_userspace(struct ovpn_peer *peer, struct sock *sk, + struct sk_buff *skb) +{ + skb_set_owner_r(skb, sk); + memset(skb->cb, 0, sizeof(skb->cb)); + skb_queue_tail(&peer->tcp.user_queue, skb); + peer->tcp.sk_cb.sk_data_ready(sk); +} + +static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb) +{ + struct ovpn_peer *peer = container_of(strp, struct ovpn_peer, tcp.strp); + struct strp_msg *msg = strp_msg(skb); + size_t pkt_len = msg->full_len - 2; + size_t off = msg->offset + 2; + u8 opcode; + + /* ensure skb->data points to the beginning of the openvpn packet */ + if (!pskb_pull(skb, off)) { + net_warn_ratelimited("%s: packet too small for peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + goto err; + } + + /* strparser does not trim the skb for us, therefore we do it now */ + if (pskb_trim(skb, pkt_len) != 0) { + net_warn_ratelimited("%s: trimming skb failed for peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + goto err; + } + + /* we need the first 4 bytes of data to be accessible + * to extract the opcode and the key ID later on + */ + if (!pskb_may_pull(skb, OVPN_OPCODE_SIZE)) { + net_warn_ratelimited("%s: packet too small to fetch opcode for peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + goto err; + } + + /* DATA_V2 packets are handled in kernel, the rest goes to user space */ + opcode = ovpn_opcode_from_skb(skb, 0); + if (unlikely(opcode != OVPN_DATA_V2)) { + if (opcode == OVPN_DATA_V1) { + net_warn_ratelimited("%s: DATA_V1 detected on the TCP stream\n", + netdev_name(peer->ovpn->dev)); + goto err; + } + + /* The packet size header must be there when sending the packet + * to userspace, therefore we put it back + */ + skb_push(skb, 2); + ovpn_tcp_to_userspace(peer, strp->sk, skb); + return; + } + + /* hold reference to peer as required by ovpn_recv(). + * + * NOTE: in this context we should already be holding a reference to + * this peer, therefore ovpn_peer_hold() is not expected to fail + */ + if (WARN_ON(!ovpn_peer_hold(peer))) + goto err; + + ovpn_recv(peer, skb); + return; +err: + dev_dstats_rx_dropped(peer->ovpn->dev); + kfree_skb(skb); + ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR); +} + +static int ovpn_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int flags, int *addr_len) +{ + int err = 0, off, copied = 0, ret; + struct ovpn_socket *sock; + struct ovpn_peer *peer; + struct sk_buff *skb; + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (unlikely(!sock || !sock->peer || !ovpn_peer_hold(sock->peer))) { + rcu_read_unlock(); + return -EBADF; + } + peer = sock->peer; + rcu_read_unlock(); + + skb = __skb_recv_datagram(sk, &peer->tcp.user_queue, flags, &off, &err); + if (!skb) { + if (err == -EAGAIN && sk->sk_shutdown & RCV_SHUTDOWN) { + ret = 0; + goto out; + } + ret = err; + goto out; + } + + copied = len; + if (copied > skb->len) + copied = skb->len; + else if (copied < skb->len) + msg->msg_flags |= MSG_TRUNC; + + err = skb_copy_datagram_msg(skb, 0, msg, copied); + if (unlikely(err)) { + kfree_skb(skb); + ret = err; + goto out; + } + + if (flags & MSG_TRUNC) + copied = skb->len; + kfree_skb(skb); + ret = copied; +out: + ovpn_peer_put(peer); + return ret; +} + +void ovpn_tcp_socket_detach(struct ovpn_socket *ovpn_sock) +{ + struct ovpn_peer *peer = ovpn_sock->peer; + struct socket *sock = ovpn_sock->sock; + + strp_stop(&peer->tcp.strp); + skb_queue_purge(&peer->tcp.user_queue); + + /* restore CBs that were saved in ovpn_sock_set_tcp_cb() */ + sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready; + sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space; + sock->sk->sk_prot = peer->tcp.sk_cb.prot; + sock->sk->sk_socket->ops = peer->tcp.sk_cb.ops; + + rcu_assign_sk_user_data(sock->sk, NULL); +} + +void ovpn_tcp_socket_wait_finish(struct ovpn_socket *sock) +{ + struct ovpn_peer *peer = sock->peer; + + /* NOTE: we don't wait for peer->tcp.defer_del_work to finish: + * either the worker is not running or this function + * was invoked by that worker. + */ + + cancel_work_sync(&sock->tcp_tx_work); + strp_done(&peer->tcp.strp); + + skb_queue_purge(&peer->tcp.out_queue); + kfree_skb(peer->tcp.out_msg.skb); + peer->tcp.out_msg.skb = NULL; +} + +static void ovpn_tcp_send_sock(struct ovpn_peer *peer, struct sock *sk) +{ + struct sk_buff *skb = peer->tcp.out_msg.skb; + int ret, flags; + + if (!skb) + return; + + if (peer->tcp.tx_in_progress) + return; + + peer->tcp.tx_in_progress = true; + + do { + flags = ovpn_skb_cb(skb)->nosignal ? MSG_NOSIGNAL : 0; + ret = skb_send_sock_locked_with_flags(sk, skb, + peer->tcp.out_msg.offset, + peer->tcp.out_msg.len, + flags); + if (unlikely(ret < 0)) { + if (ret == -EAGAIN) + goto out; + + net_warn_ratelimited("%s: TCP error to peer %u: %d\n", + netdev_name(peer->ovpn->dev), + peer->id, ret); + + /* in case of TCP error we can't recover the VPN + * stream therefore we abort the connection + */ + ovpn_peer_hold(peer); + schedule_work(&peer->tcp.defer_del_work); + + /* we bail out immediately and keep tx_in_progress set + * to true. This way we prevent more TX attempts + * which would lead to more invocations of + * schedule_work() + */ + return; + } + + peer->tcp.out_msg.len -= ret; + peer->tcp.out_msg.offset += ret; + } while (peer->tcp.out_msg.len > 0); + + if (!peer->tcp.out_msg.len) { + preempt_disable(); + dev_dstats_tx_add(peer->ovpn->dev, skb->len); + preempt_enable(); + } + + kfree_skb(peer->tcp.out_msg.skb); + peer->tcp.out_msg.skb = NULL; + peer->tcp.out_msg.len = 0; + peer->tcp.out_msg.offset = 0; + +out: + peer->tcp.tx_in_progress = false; +} + +void ovpn_tcp_tx_work(struct work_struct *work) +{ + struct ovpn_socket *sock; + + sock = container_of(work, struct ovpn_socket, tcp_tx_work); + + lock_sock(sock->sock->sk); + if (sock->peer) + ovpn_tcp_send_sock(sock->peer, sock->sock->sk); + release_sock(sock->sock->sk); +} + +static void ovpn_tcp_send_sock_skb(struct ovpn_peer *peer, struct sock *sk, + struct sk_buff *skb) +{ + if (peer->tcp.out_msg.skb) + ovpn_tcp_send_sock(peer, sk); + + if (peer->tcp.out_msg.skb) { + dev_dstats_tx_dropped(peer->ovpn->dev); + kfree_skb(skb); + return; + } + + peer->tcp.out_msg.skb = skb; + peer->tcp.out_msg.len = skb->len; + peer->tcp.out_msg.offset = 0; + ovpn_tcp_send_sock(peer, sk); +} + +void ovpn_tcp_send_skb(struct ovpn_peer *peer, struct socket *sock, + struct sk_buff *skb) +{ + u16 len = skb->len; + + *(__be16 *)__skb_push(skb, sizeof(u16)) = htons(len); + + spin_lock_nested(&sock->sk->sk_lock.slock, OVPN_TCP_DEPTH_NESTING); + if (sock_owned_by_user(sock->sk)) { + if (skb_queue_len(&peer->tcp.out_queue) >= + READ_ONCE(net_hotdata.max_backlog)) { + dev_dstats_tx_dropped(peer->ovpn->dev); + kfree_skb(skb); + goto unlock; + } + __skb_queue_tail(&peer->tcp.out_queue, skb); + } else { + ovpn_tcp_send_sock_skb(peer, sock->sk, skb); + } +unlock: + spin_unlock(&sock->sk->sk_lock.slock); +} + +static void ovpn_tcp_release(struct sock *sk) +{ + struct sk_buff_head queue; + struct ovpn_socket *sock; + struct ovpn_peer *peer; + struct sk_buff *skb; + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (!sock) { + rcu_read_unlock(); + return; + } + + peer = sock->peer; + + /* during initialization this function is called before + * assigning sock->peer + */ + if (unlikely(!peer || !ovpn_peer_hold(peer))) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + __skb_queue_head_init(&queue); + skb_queue_splice_init(&peer->tcp.out_queue, &queue); + + while ((skb = __skb_dequeue(&queue))) + ovpn_tcp_send_sock_skb(peer, sk, skb); + + peer->tcp.sk_cb.prot->release_cb(sk); + ovpn_peer_put(peer); +} + +static int ovpn_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct ovpn_socket *sock; + int ret, linear = PAGE_SIZE; + struct ovpn_peer *peer; + struct sk_buff *skb; + + lock_sock(sk); + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (unlikely(!sock || !sock->peer || !ovpn_peer_hold(sock->peer))) { + rcu_read_unlock(); + release_sock(sk); + return -EIO; + } + rcu_read_unlock(); + peer = sock->peer; + + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL)) { + ret = -EOPNOTSUPP; + goto peer_free; + } + + if (peer->tcp.out_msg.skb) { + ret = -EAGAIN; + goto peer_free; + } + + if (size < linear) + linear = size; + + skb = sock_alloc_send_pskb(sk, linear, size - linear, + msg->msg_flags & MSG_DONTWAIT, &ret, 0); + if (!skb) { + net_err_ratelimited("%s: skb alloc failed: %d\n", + netdev_name(peer->ovpn->dev), ret); + goto peer_free; + } + + skb_put(skb, linear); + skb->len = size; + skb->data_len = size - linear; + + ret = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); + if (ret) { + kfree_skb(skb); + net_err_ratelimited("%s: skb copy from iter failed: %d\n", + netdev_name(peer->ovpn->dev), ret); + goto peer_free; + } + + ovpn_skb_cb(skb)->nosignal = msg->msg_flags & MSG_NOSIGNAL; + ovpn_tcp_send_sock_skb(peer, sk, skb); + ret = size; +peer_free: + release_sock(sk); + ovpn_peer_put(peer); + return ret; +} + +static int ovpn_tcp_disconnect(struct sock *sk, int flags) +{ + return -EBUSY; +} + +static void ovpn_tcp_data_ready(struct sock *sk) +{ + struct ovpn_socket *sock; + + trace_sk_data_ready(sk); + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (likely(sock && sock->peer)) + strp_data_ready(&sock->peer->tcp.strp); + rcu_read_unlock(); +} + +static void ovpn_tcp_write_space(struct sock *sk) +{ + struct ovpn_socket *sock; + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (likely(sock && sock->peer)) { + schedule_work(&sock->tcp_tx_work); + sock->peer->tcp.sk_cb.sk_write_space(sk); + } + rcu_read_unlock(); +} + +static void ovpn_tcp_build_protos(struct proto *new_prot, + struct proto_ops *new_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops); + +static void ovpn_tcp_peer_del_work(struct work_struct *work) +{ + struct ovpn_peer *peer = container_of(work, struct ovpn_peer, + tcp.defer_del_work); + + ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR); + ovpn_peer_put(peer); +} + +/* Set TCP encapsulation callbacks */ +int ovpn_tcp_socket_attach(struct ovpn_socket *ovpn_sock, + struct ovpn_peer *peer) +{ + struct socket *sock = ovpn_sock->sock; + struct strp_callbacks cb = { + .rcv_msg = ovpn_tcp_rcv, + .parse_msg = ovpn_tcp_parse, + }; + int ret; + + /* make sure no pre-existing encapsulation handler exists */ + if (sock->sk->sk_user_data) + return -EBUSY; + + /* only a fully connected socket is expected. Connection should be + * handled in userspace + */ + if (sock->sk->sk_state != TCP_ESTABLISHED) { + net_err_ratelimited("%s: provided TCP socket is not in ESTABLISHED state: %d\n", + netdev_name(peer->ovpn->dev), + sock->sk->sk_state); + return -EINVAL; + } + + ret = strp_init(&peer->tcp.strp, sock->sk, &cb); + if (ret < 0) { + DEBUG_NET_WARN_ON_ONCE(1); + return ret; + } + + INIT_WORK(&peer->tcp.defer_del_work, ovpn_tcp_peer_del_work); + + __sk_dst_reset(sock->sk); + skb_queue_head_init(&peer->tcp.user_queue); + skb_queue_head_init(&peer->tcp.out_queue); + + /* save current CBs so that they can be restored upon socket release */ + peer->tcp.sk_cb.sk_data_ready = sock->sk->sk_data_ready; + peer->tcp.sk_cb.sk_write_space = sock->sk->sk_write_space; + peer->tcp.sk_cb.prot = sock->sk->sk_prot; + peer->tcp.sk_cb.ops = sock->sk->sk_socket->ops; + + /* assign our static CBs and prot/ops */ + sock->sk->sk_data_ready = ovpn_tcp_data_ready; + sock->sk->sk_write_space = ovpn_tcp_write_space; + + if (sock->sk->sk_family == AF_INET) { + sock->sk->sk_prot = &ovpn_tcp_prot; + sock->sk->sk_socket->ops = &ovpn_tcp_ops; + } else { + sock->sk->sk_prot = &ovpn_tcp6_prot; + sock->sk->sk_socket->ops = &ovpn_tcp6_ops; + } + + /* avoid using task_frag */ + sock->sk->sk_allocation = GFP_ATOMIC; + sock->sk->sk_use_task_frag = false; + + /* enqueue the RX worker */ + strp_check_rcv(&peer->tcp.strp); + + return 0; +} + +static void ovpn_tcp_close(struct sock *sk, long timeout) +{ + struct ovpn_socket *sock; + struct ovpn_peer *peer; + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (!sock || !sock->peer || !ovpn_peer_hold(sock->peer)) { + rcu_read_unlock(); + return; + } + peer = sock->peer; + rcu_read_unlock(); + + ovpn_peer_del(sock->peer, OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT); + peer->tcp.sk_cb.prot->close(sk, timeout); + ovpn_peer_put(peer); +} + +static __poll_t ovpn_tcp_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + __poll_t mask = datagram_poll(file, sock, wait); + struct ovpn_socket *ovpn_sock; + + rcu_read_lock(); + ovpn_sock = rcu_dereference_sk_user_data(sock->sk); + if (ovpn_sock && ovpn_sock->peer && + !skb_queue_empty(&ovpn_sock->peer->tcp.user_queue)) + mask |= EPOLLIN | EPOLLRDNORM; + rcu_read_unlock(); + + return mask; +} + +static void ovpn_tcp_build_protos(struct proto *new_prot, + struct proto_ops *new_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops) +{ + memcpy(new_prot, orig_prot, sizeof(*new_prot)); + memcpy(new_ops, orig_ops, sizeof(*new_ops)); + new_prot->recvmsg = ovpn_tcp_recvmsg; + new_prot->sendmsg = ovpn_tcp_sendmsg; + new_prot->disconnect = ovpn_tcp_disconnect; + new_prot->close = ovpn_tcp_close; + new_prot->release_cb = ovpn_tcp_release; + new_ops->poll = ovpn_tcp_poll; +} + +/* Initialize TCP static objects */ +void __init ovpn_tcp_init(void) +{ + ovpn_tcp_build_protos(&ovpn_tcp_prot, &ovpn_tcp_ops, &tcp_prot, + &inet_stream_ops); + +#if IS_ENABLED(CONFIG_IPV6) + ovpn_tcp_build_protos(&ovpn_tcp6_prot, &ovpn_tcp6_ops, &tcpv6_prot, + &inet6_stream_ops); +#endif +} diff --git a/drivers/net/ovpn/tcp.h b/drivers/net/ovpn/tcp.h new file mode 100644 index 0000000000000..10aefa834cf35 --- /dev/null +++ b/drivers/net/ovpn/tcp.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#ifndef _NET_OVPN_TCP_H_ +#define _NET_OVPN_TCP_H_ + +#include +#include +#include + +#include "peer.h" +#include "skb.h" +#include "socket.h" + +void __init ovpn_tcp_init(void); + +int ovpn_tcp_socket_attach(struct ovpn_socket *ovpn_sock, + struct ovpn_peer *peer); +void ovpn_tcp_socket_detach(struct ovpn_socket *ovpn_sock); +void ovpn_tcp_socket_wait_finish(struct ovpn_socket *sock); + +/* Prepare skb and enqueue it for sending to peer. + * + * Preparation consist in prepending the skb payload with its size. + * Required by the OpenVPN protocol in order to extract packets from + * the TCP stream on the receiver side. + */ +void ovpn_tcp_send_skb(struct ovpn_peer *peer, struct socket *sock, struct sk_buff *skb); +void ovpn_tcp_tx_work(struct work_struct *work); + +#endif /* _NET_OVPN_TCP_H_ */ diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c new file mode 100644 index 0000000000000..c9e189056f336 --- /dev/null +++ b/drivers/net/ovpn/udp.c @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "bind.h" +#include "io.h" +#include "peer.h" +#include "proto.h" +#include "socket.h" +#include "udp.h" + +/* Retrieve the corresponding ovpn object from a UDP socket + * rcu_read_lock must be held on entry + */ +static struct ovpn_socket *ovpn_socket_from_udp_sock(struct sock *sk) +{ + struct ovpn_socket *ovpn_sock; + + if (unlikely(READ_ONCE(udp_sk(sk)->encap_type) != UDP_ENCAP_OVPNINUDP)) + return NULL; + + ovpn_sock = rcu_dereference_sk_user_data(sk); + if (unlikely(!ovpn_sock)) + return NULL; + + /* make sure that sk matches our stored transport socket */ + if (unlikely(!ovpn_sock->sock || sk != ovpn_sock->sock->sk)) + return NULL; + + return ovpn_sock; +} + +/** + * ovpn_udp_encap_recv - Start processing a received UDP packet. + * @sk: socket over which the packet was received + * @skb: the received packet + * + * If the first byte of the payload is: + * - DATA_V2 the packet is accepted for further processing, + * - DATA_V1 the packet is dropped as not supported, + * - anything else the packet is forwarded to the UDP stack for + * delivery to user space. + * + * Return: + * 0 if skb was consumed or dropped + * >0 if skb should be passed up to userspace as UDP (packet not consumed) + * <0 if skb should be resubmitted as proto -N (packet not consumed) + */ +static int ovpn_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct ovpn_socket *ovpn_sock; + struct ovpn_priv *ovpn; + struct ovpn_peer *peer; + u32 peer_id; + u8 opcode; + + ovpn_sock = ovpn_socket_from_udp_sock(sk); + if (unlikely(!ovpn_sock)) { + net_err_ratelimited("ovpn: %s invoked on non ovpn socket\n", + __func__); + goto drop_noovpn; + } + + ovpn = ovpn_sock->ovpn; + if (unlikely(!ovpn)) { + net_err_ratelimited("ovpn: cannot obtain ovpn object from UDP socket\n"); + goto drop_noovpn; + } + + /* Make sure the first 4 bytes of the skb data buffer after the UDP + * header are accessible. + * They are required to fetch the OP code, the key ID and the peer ID. + */ + if (unlikely(!pskb_may_pull(skb, sizeof(struct udphdr) + + OVPN_OPCODE_SIZE))) { + net_dbg_ratelimited("%s: packet too small from UDP socket\n", + netdev_name(ovpn->dev)); + goto drop; + } + + opcode = ovpn_opcode_from_skb(skb, sizeof(struct udphdr)); + if (unlikely(opcode != OVPN_DATA_V2)) { + /* DATA_V1 is not supported */ + if (opcode == OVPN_DATA_V1) + goto drop; + + /* unknown or control packet: let it bubble up to userspace */ + return 1; + } + + peer_id = ovpn_peer_id_from_skb(skb, sizeof(struct udphdr)); + /* some OpenVPN server implementations send data packets with the + * peer-id set to UNDEF. In this case we skip the peer lookup by peer-id + * and we try with the transport address + */ + if (peer_id == OVPN_PEER_ID_UNDEF) + peer = ovpn_peer_get_by_transp_addr(ovpn, skb); + else + peer = ovpn_peer_get_by_id(ovpn, peer_id); + + if (unlikely(!peer)) + goto drop; + + /* pop off outer UDP header */ + __skb_pull(skb, sizeof(struct udphdr)); + ovpn_recv(peer, skb); + return 0; + +drop: + dev_dstats_rx_dropped(ovpn->dev); +drop_noovpn: + kfree_skb(skb); + return 0; +} + +/** + * ovpn_udp4_output - send IPv4 packet over udp socket + * @peer: the destination peer + * @bind: the binding related to the destination peer + * @cache: dst cache + * @sk: the socket to send the packet over + * @skb: the packet to send + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_udp4_output(struct ovpn_peer *peer, struct ovpn_bind *bind, + struct dst_cache *cache, struct sock *sk, + struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi4 fl = { + .saddr = bind->local.ipv4.s_addr, + .daddr = bind->remote.in4.sin_addr.s_addr, + .fl4_sport = inet_sk(sk)->inet_sport, + .fl4_dport = bind->remote.in4.sin_port, + .flowi4_proto = sk->sk_protocol, + .flowi4_mark = sk->sk_mark, + }; + int ret; + + local_bh_disable(); + rt = dst_cache_get_ip4(cache, &fl.saddr); + if (rt) + goto transmit; + + if (unlikely(!inet_confirm_addr(sock_net(sk), NULL, 0, fl.saddr, + RT_SCOPE_HOST))) { + /* we may end up here when the cached address is not usable + * anymore. In this case we reset address/cache and perform a + * new look up + */ + fl.saddr = 0; + spin_lock_bh(&peer->lock); + bind->local.ipv4.s_addr = 0; + spin_unlock_bh(&peer->lock); + dst_cache_reset(cache); + } + + rt = ip_route_output_flow(sock_net(sk), &fl, sk); + if (IS_ERR(rt) && PTR_ERR(rt) == -EINVAL) { + fl.saddr = 0; + spin_lock_bh(&peer->lock); + bind->local.ipv4.s_addr = 0; + spin_unlock_bh(&peer->lock); + dst_cache_reset(cache); + + rt = ip_route_output_flow(sock_net(sk), &fl, sk); + } + + if (IS_ERR(rt)) { + ret = PTR_ERR(rt); + net_dbg_ratelimited("%s: no route to host %pISpc: %d\n", + netdev_name(peer->ovpn->dev), + &bind->remote.in4, + ret); + goto err; + } + dst_cache_set_ip4(cache, &rt->dst, fl.saddr); + +transmit: + udp_tunnel_xmit_skb(rt, sk, skb, fl.saddr, fl.daddr, 0, + ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, + fl.fl4_dport, false, sk->sk_no_check_tx); + ret = 0; +err: + local_bh_enable(); + return ret; +} + +#if IS_ENABLED(CONFIG_IPV6) +/** + * ovpn_udp6_output - send IPv6 packet over udp socket + * @peer: the destination peer + * @bind: the binding related to the destination peer + * @cache: dst cache + * @sk: the socket to send the packet over + * @skb: the packet to send + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_udp6_output(struct ovpn_peer *peer, struct ovpn_bind *bind, + struct dst_cache *cache, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *dst; + int ret; + + struct flowi6 fl = { + .saddr = bind->local.ipv6, + .daddr = bind->remote.in6.sin6_addr, + .fl6_sport = inet_sk(sk)->inet_sport, + .fl6_dport = bind->remote.in6.sin6_port, + .flowi6_proto = sk->sk_protocol, + .flowi6_mark = sk->sk_mark, + .flowi6_oif = bind->remote.in6.sin6_scope_id, + }; + + local_bh_disable(); + dst = dst_cache_get_ip6(cache, &fl.saddr); + if (dst) + goto transmit; + + if (unlikely(!ipv6_chk_addr(sock_net(sk), &fl.saddr, NULL, 0))) { + /* we may end up here when the cached address is not usable + * anymore. In this case we reset address/cache and perform a + * new look up + */ + fl.saddr = in6addr_any; + spin_lock_bh(&peer->lock); + bind->local.ipv6 = in6addr_any; + spin_unlock_bh(&peer->lock); + dst_cache_reset(cache); + } + + dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sk), sk, &fl, NULL); + if (IS_ERR(dst)) { + ret = PTR_ERR(dst); + net_dbg_ratelimited("%s: no route to host %pISpc: %d\n", + netdev_name(peer->ovpn->dev), + &bind->remote.in6, ret); + goto err; + } + dst_cache_set_ip6(cache, dst, &fl.saddr); + +transmit: + udp_tunnel6_xmit_skb(dst, sk, skb, skb->dev, &fl.saddr, &fl.daddr, 0, + ip6_dst_hoplimit(dst), 0, fl.fl6_sport, + fl.fl6_dport, udp_get_no_check6_tx(sk)); + ret = 0; +err: + local_bh_enable(); + return ret; +} +#endif + +/** + * ovpn_udp_output - transmit skb using udp-tunnel + * @peer: the destination peer + * @cache: dst cache + * @sk: the socket to send the packet over + * @skb: the packet to send + * + * rcu_read_lock should be held on entry. + * On return, the skb is consumed. + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_udp_output(struct ovpn_peer *peer, struct dst_cache *cache, + struct sock *sk, struct sk_buff *skb) +{ + struct ovpn_bind *bind; + int ret; + + /* set sk to null if skb is already orphaned */ + if (!skb->destructor) + skb->sk = NULL; + + rcu_read_lock(); + bind = rcu_dereference(peer->bind); + if (unlikely(!bind)) { + net_warn_ratelimited("%s: no bind for remote peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + ret = -ENODEV; + goto out; + } + + switch (bind->remote.in4.sin_family) { + case AF_INET: + ret = ovpn_udp4_output(peer, bind, cache, sk, skb); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ret = ovpn_udp6_output(peer, bind, cache, sk, skb); + break; +#endif + default: + ret = -EAFNOSUPPORT; + break; + } + +out: + rcu_read_unlock(); + return ret; +} + +/** + * ovpn_udp_send_skb - prepare skb and send it over via UDP + * @peer: the destination peer + * @sock: the RCU protected peer socket + * @skb: the packet to send + */ +void ovpn_udp_send_skb(struct ovpn_peer *peer, struct socket *sock, + struct sk_buff *skb) +{ + int ret = -1; + + skb->dev = peer->ovpn->dev; + /* no checksum performed at this layer */ + skb->ip_summed = CHECKSUM_NONE; + + /* get socket info */ + if (unlikely(!sock)) { + net_warn_ratelimited("%s: no sock for remote peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + goto out; + } + + /* crypto layer -> transport (UDP) */ + ret = ovpn_udp_output(peer, &peer->dst_cache, sock->sk, skb); +out: + if (unlikely(ret < 0)) { + kfree_skb(skb); + return; + } +} + +static void ovpn_udp_encap_destroy(struct sock *sk) +{ + struct ovpn_socket *sock; + struct ovpn_priv *ovpn; + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (!sock || !sock->ovpn) { + rcu_read_unlock(); + return; + } + ovpn = sock->ovpn; + rcu_read_unlock(); + + ovpn_peers_free(ovpn, sk, OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT); +} + +/** + * ovpn_udp_socket_attach - set udp-tunnel CBs on socket and link it to ovpn + * @ovpn_sock: socket to configure + * @ovpn: the openvp instance to link + * + * After invoking this function, the sock will be controlled by ovpn so that + * any incoming packet may be processed by ovpn first. + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock, + struct ovpn_priv *ovpn) +{ + struct udp_tunnel_sock_cfg cfg = { + .encap_type = UDP_ENCAP_OVPNINUDP, + .encap_rcv = ovpn_udp_encap_recv, + .encap_destroy = ovpn_udp_encap_destroy, + }; + struct socket *sock = ovpn_sock->sock; + struct ovpn_socket *old_data; + int ret; + + /* make sure no pre-existing encapsulation handler exists */ + rcu_read_lock(); + old_data = rcu_dereference_sk_user_data(sock->sk); + if (!old_data) { + /* socket is currently unused - we can take it */ + rcu_read_unlock(); + setup_udp_tunnel_sock(sock_net(sock->sk), sock, &cfg); + return 0; + } + + /* socket is in use. We need to understand if it's owned by this ovpn + * instance or by something else. + * In the former case, we can increase the refcounter and happily + * use it, because the same UDP socket is expected to be shared among + * different peers. + * + * Unlikely TCP, a single UDP socket can be used to talk to many remote + * hosts and therefore openvpn instantiates one only for all its peers + */ + if ((READ_ONCE(udp_sk(sock->sk)->encap_type) == UDP_ENCAP_OVPNINUDP) && + old_data->ovpn == ovpn) { + netdev_dbg(ovpn->dev, + "provided socket already owned by this interface\n"); + ret = -EALREADY; + } else { + netdev_dbg(ovpn->dev, + "provided socket already taken by other user\n"); + ret = -EBUSY; + } + rcu_read_unlock(); + + return ret; +} + +/** + * ovpn_udp_socket_detach - clean udp-tunnel status for this socket + * @ovpn_sock: the socket to clean + */ +void ovpn_udp_socket_detach(struct ovpn_socket *ovpn_sock) +{ + struct udp_tunnel_sock_cfg cfg = { }; + + setup_udp_tunnel_sock(sock_net(ovpn_sock->sock->sk), ovpn_sock->sock, + &cfg); +} diff --git a/drivers/net/ovpn/udp.h b/drivers/net/ovpn/udp.h new file mode 100644 index 0000000000000..9994eb6e04283 --- /dev/null +++ b/drivers/net/ovpn/udp.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#ifndef _NET_OVPN_UDP_H_ +#define _NET_OVPN_UDP_H_ + +#include + +struct ovpn_peer; +struct ovpn_priv; +struct socket; + +int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock, + struct ovpn_priv *ovpn); +void ovpn_udp_socket_detach(struct ovpn_socket *ovpn_sock); + +void ovpn_udp_send_skb(struct ovpn_peer *peer, struct socket *sock, + struct sk_buff *skb); + +#endif /* _NET_OVPN_UDP_H_ */ diff --git a/drivers/net/pfcp.c b/drivers/net/pfcp.c index f873a92d24459..28e6bc4a1f14c 100644 --- a/drivers/net/pfcp.c +++ b/drivers/net/pfcp.c @@ -245,30 +245,21 @@ static int __net_init pfcp_net_init(struct net *net) return 0; } -static void __net_exit pfcp_net_exit(struct net *net) +static void __net_exit pfcp_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { struct pfcp_net *pn = net_generic(net, pfcp_net_id); struct pfcp_dev *pfcp, *pfcp_next; - struct net_device *dev; - LIST_HEAD(list); - - rtnl_lock(); - for_each_netdev(net, dev) - if (dev->rtnl_link_ops == &pfcp_link_ops) - pfcp_dellink(dev, &list); list_for_each_entry_safe(pfcp, pfcp_next, &pn->pfcp_dev_list, list) - pfcp_dellink(pfcp->dev, &list); - - unregister_netdevice_many(&list); - rtnl_unlock(); + pfcp_dellink(pfcp->dev, dev_to_kill); } static struct pernet_operations pfcp_net_ops = { - .init = pfcp_net_init, - .exit = pfcp_net_exit, - .id = &pfcp_net_id, - .size = sizeof(struct pfcp_net), + .init = pfcp_net_init, + .exit_rtnl = pfcp_net_exit_rtnl, + .id = &pfcp_net_id, + .size = sizeof(struct pfcp_net), }; static int __init pfcp_init(void) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index d29f9f7fd2e11..0b8cc325e2bf8 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -5,7 +5,6 @@ config PHYLINK tristate - depends on NETDEVICES select PHYLIB select SWPHY help @@ -15,7 +14,6 @@ config PHYLINK menuconfig PHYLIB tristate "PHY Device support and infrastructure" - depends on NETDEVICES select MDIO_DEVICE select MDIO_DEVRES help diff --git a/drivers/net/phy/air_en8811h.c b/drivers/net/phy/air_en8811h.c index e9fd24cb72709..57fbd8df94388 100644 --- a/drivers/net/phy/air_en8811h.c +++ b/drivers/net/phy/air_en8811h.c @@ -11,6 +11,7 @@ * Copyright (C) 2023 Airoha Technology Corp. */ +#include #include #include #include @@ -115,6 +116,11 @@ #define EN8811H_GPIO_OUTPUT 0xcf8b8 #define EN8811H_GPIO_OUTPUT_345 (BIT(3) | BIT(4) | BIT(5)) +#define EN8811H_HWTRAP1 0xcf914 +#define EN8811H_HWTRAP1_CKO BIT(12) +#define EN8811H_CLK_CGM 0xcf958 +#define EN8811H_CLK_CGM_CKO BIT(26) + #define EN8811H_FW_CTRL_1 0x0f0018 #define EN8811H_FW_CTRL_1_START 0x0 #define EN8811H_FW_CTRL_1_FINISH 0x1 @@ -142,10 +148,15 @@ struct led { unsigned long state; }; +#define clk_hw_to_en8811h_priv(_hw) \ + container_of(_hw, struct en8811h_priv, hw) + struct en8811h_priv { - u32 firmware_version; - bool mcu_needs_restart; - struct led led[EN8811H_LED_COUNT]; + u32 firmware_version; + bool mcu_needs_restart; + struct led led[EN8811H_LED_COUNT]; + struct clk_hw hw; + struct phy_device *phydev; }; enum { @@ -806,6 +817,86 @@ static int en8811h_led_hw_is_supported(struct phy_device *phydev, u8 index, return 0; }; +static unsigned long en8811h_clk_recalc_rate(struct clk_hw *hw, + unsigned long parent) +{ + struct en8811h_priv *priv = clk_hw_to_en8811h_priv(hw); + struct phy_device *phydev = priv->phydev; + u32 pbus_value; + int ret; + + ret = air_buckpbus_reg_read(phydev, EN8811H_HWTRAP1, &pbus_value); + if (ret < 0) + return ret; + + return (pbus_value & EN8811H_HWTRAP1_CKO) ? 50000000 : 25000000; +} + +static int en8811h_clk_enable(struct clk_hw *hw) +{ + struct en8811h_priv *priv = clk_hw_to_en8811h_priv(hw); + struct phy_device *phydev = priv->phydev; + + return air_buckpbus_reg_modify(phydev, EN8811H_CLK_CGM, + EN8811H_CLK_CGM_CKO, + EN8811H_CLK_CGM_CKO); +} + +static void en8811h_clk_disable(struct clk_hw *hw) +{ + struct en8811h_priv *priv = clk_hw_to_en8811h_priv(hw); + struct phy_device *phydev = priv->phydev; + + air_buckpbus_reg_modify(phydev, EN8811H_CLK_CGM, + EN8811H_CLK_CGM_CKO, 0); +} + +static int en8811h_clk_is_enabled(struct clk_hw *hw) +{ + struct en8811h_priv *priv = clk_hw_to_en8811h_priv(hw); + struct phy_device *phydev = priv->phydev; + u32 pbus_value; + int ret; + + ret = air_buckpbus_reg_read(phydev, EN8811H_CLK_CGM, &pbus_value); + if (ret < 0) + return ret; + + return (pbus_value & EN8811H_CLK_CGM_CKO); +} + +static const struct clk_ops en8811h_clk_ops = { + .recalc_rate = en8811h_clk_recalc_rate, + .enable = en8811h_clk_enable, + .disable = en8811h_clk_disable, + .is_enabled = en8811h_clk_is_enabled, +}; + +static int en8811h_clk_provider_setup(struct device *dev, struct clk_hw *hw) +{ + struct clk_init_data init; + int ret; + + if (!IS_ENABLED(CONFIG_COMMON_CLK)) + return 0; + + init.name = devm_kasprintf(dev, GFP_KERNEL, "%s-cko", + fwnode_get_name(dev_fwnode(dev))); + if (!init.name) + return -ENOMEM; + + init.ops = &en8811h_clk_ops; + init.flags = 0; + init.num_parents = 0; + hw->init = &init; + + ret = devm_clk_hw_register(dev, hw); + if (ret) + return ret; + + return devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get, hw); +} + static int en8811h_probe(struct phy_device *phydev) { struct en8811h_priv *priv; @@ -838,6 +929,12 @@ static int en8811h_probe(struct phy_device *phydev) return ret; } + priv->phydev = phydev; + /* Co-Clock Output */ + ret = en8811h_clk_provider_setup(&phydev->mdio.dev, &priv->hw); + if (ret) + return ret; + /* Configure led gpio pins as output */ ret = air_buckpbus_reg_modify(phydev, EN8811H_GPIO_OUTPUT, EN8811H_GPIO_OUTPUT_345, diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index 85e231451093f..daab555721df8 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -478,13 +478,6 @@ static int ptp_dp83640_enable(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests to enable time stamping on both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -513,9 +506,6 @@ static int ptp_dp83640_enable(struct ptp_clock_info *ptp, return 0; case PTP_CLK_REQ_PEROUT: - /* Reject requests with unsupported flags */ - if (rq->perout.flags) - return -EOPNOTSUPP; if (rq->perout.index >= N_PER_OUT) return -EINVAL; return periodic_output(clock, rq, on, rq->perout.index); @@ -1002,6 +992,9 @@ static void dp83640_clock_init(struct dp83640_clock *clock, struct mii_bus *bus) clock->caps.n_per_out = N_PER_OUT; clock->caps.n_pins = DP83640_N_PINS; clock->caps.pps = 0; + clock->caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; clock->caps.adjfine = ptp_dp83640_adjfine; clock->caps.adjtime = ptp_dp83640_adjtime; clock->caps.gettime64 = ptp_dp83640_gettime; diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index 14f3615496384..490c9f4e5d4e4 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -33,6 +33,7 @@ #define MII_DP83822_MLEDCR 0x25 #define MII_DP83822_LDCTRL 0x403 #define MII_DP83822_LEDCFG1 0x460 +#define MII_DP83822_IOCTRL 0x461 #define MII_DP83822_IOCTRL1 0x462 #define MII_DP83822_IOCTRL2 0x463 #define MII_DP83822_GENCFG 0x465 @@ -118,6 +119,9 @@ #define DP83822_LEDCFG1_LED1_CTRL GENMASK(11, 8) #define DP83822_LEDCFG1_LED3_CTRL GENMASK(7, 4) +/* IOCTRL bits */ +#define DP83822_IOCTRL_MAC_IMPEDANCE_CTRL GENMASK(4, 1) + /* IOCTRL1 bits */ #define DP83822_IOCTRL1_GPIO3_CTRL GENMASK(10, 8) #define DP83822_IOCTRL1_GPIO3_CTRL_LED3 BIT(0) @@ -202,6 +206,7 @@ struct dp83822_private { u32 gpio2_clk_out; bool led_pin_enable[DP83822_MAX_LED_PINS]; int tx_amplitude_100base_tx_index; + int mac_termination_index; }; static int dp83822_config_wol(struct phy_device *phydev, @@ -533,6 +538,12 @@ static int dp83822_config_init(struct phy_device *phydev) FIELD_PREP(DP83822_100BASE_TX_LINE_DRIVER_SWING, dp83822->tx_amplitude_100base_tx_index)); + if (dp83822->mac_termination_index >= 0) + phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_IOCTRL, + DP83822_IOCTRL_MAC_IMPEDANCE_CTRL, + FIELD_PREP(DP83822_IOCTRL_MAC_IMPEDANCE_CTRL, + dp83822->mac_termination_index)); + err = dp83822_config_init_leds(phydev); if (err) return err; @@ -736,6 +747,10 @@ static const u32 tx_amplitude_100base_tx_gain[] = { 93, 95, 97, 98, 100, 102, 103, 105, }; +static const u32 mac_termination[] = { + 99, 91, 84, 78, 73, 69, 65, 61, 58, 55, 53, 50, 48, 46, 44, 43, +}; + static int dp83822_of_init_leds(struct phy_device *phydev) { struct device_node *node = phydev->mdio.dev.of_node; @@ -852,6 +867,23 @@ static int dp83822_of_init(struct phy_device *phydev) } } + ret = phy_get_mac_termination(phydev, dev, &val); + if (!ret) { + for (i = 0; i < ARRAY_SIZE(mac_termination); i++) { + if (mac_termination[i] == val) { + dp83822->mac_termination_index = i; + break; + } + } + + if (dp83822->mac_termination_index < 0) { + phydev_err(phydev, + "Invalid value for mac-termination-ohms property (%u)\n", + val); + return -EINVAL; + } + } + return dp83822_of_init_leds(phydev); } @@ -931,6 +963,7 @@ static int dp8382x_probe(struct phy_device *phydev) return -ENOMEM; dp83822->tx_amplitude_100base_tx_index = -1; + dp83822->mac_termination_index = -1; phydev->priv = dp83822; return 0; diff --git a/drivers/net/phy/mediatek/Kconfig b/drivers/net/phy/mediatek/Kconfig index 2a8ac5aed0f89..4308002bb82c4 100644 --- a/drivers/net/phy/mediatek/Kconfig +++ b/drivers/net/phy/mediatek/Kconfig @@ -15,8 +15,9 @@ config MEDIATEK_GE_PHY config MEDIATEK_GE_SOC_PHY tristate "MediaTek SoC Ethernet PHYs" - depends on (ARM64 && ARCH_MEDIATEK) || COMPILE_TEST - depends on NVMEM_MTK_EFUSE + depends on ARM64 || COMPILE_TEST + depends on ARCH_AIROHA || (ARCH_MEDIATEK && NVMEM_MTK_EFUSE) || \ + COMPILE_TEST select MTK_NET_PHYLIB help Supports MediaTek SoC built-in Gigabit Ethernet PHYs. diff --git a/drivers/net/phy/mediatek/mtk-ge-soc.c b/drivers/net/phy/mediatek/mtk-ge-soc.c index 175cf5239bba8..cd09684780a49 100644 --- a/drivers/net/phy/mediatek/mtk-ge-soc.c +++ b/drivers/net/phy/mediatek/mtk-ge-soc.c @@ -11,8 +11,11 @@ #include "../phylib.h" #include "mtk.h" +#define MTK_PHY_MAX_LEDS 2 + #define MTK_GPHY_ID_MT7981 0x03a29461 #define MTK_GPHY_ID_MT7988 0x03a29481 +#define MTK_GPHY_ID_AN7581 0x03a294c1 #define MTK_EXT_PAGE_ACCESS 0x1f #define MTK_PHY_PAGE_STANDARD 0x0000 @@ -1406,6 +1409,52 @@ static int mt7981_phy_probe(struct phy_device *phydev) return mt798x_phy_calibration(phydev); } +static int an7581_phy_probe(struct phy_device *phydev) +{ + struct mtk_socphy_priv *priv; + struct pinctrl *pinctrl; + + /* Toggle pinctrl to enable PHY LED */ + pinctrl = devm_pinctrl_get_select(&phydev->mdio.dev, "gbe-led"); + if (IS_ERR(pinctrl)) + dev_err(&phydev->mdio.bus->dev, + "Failed to setup PHY LED pinctrl\n"); + + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; + + return 0; +} + +static int an7581_phy_led_polarity_set(struct phy_device *phydev, int index, + unsigned long modes) +{ + u16 val = 0; + u32 mode; + + if (index >= MTK_PHY_MAX_LEDS) + return -EINVAL; + + for_each_set_bit(mode, &modes, __PHY_LED_MODES_NUM) { + switch (mode) { + case PHY_LED_ACTIVE_LOW: + val = MTK_PHY_LED_ON_POLARITY; + break; + case PHY_LED_ACTIVE_HIGH: + break; + default: + return -EINVAL; + } + } + + return phy_modify_mmd(phydev, MDIO_MMD_VEND2, index ? + MTK_PHY_LED1_ON_CTRL : MTK_PHY_LED0_ON_CTRL, + MTK_PHY_LED_ON_POLARITY, val); +} + static struct phy_driver mtk_socphy_driver[] = { { PHY_ID_MATCH_EXACT(MTK_GPHY_ID_MT7981), @@ -1441,6 +1490,17 @@ static struct phy_driver mtk_socphy_driver[] = { .led_hw_control_set = mt798x_phy_led_hw_control_set, .led_hw_control_get = mt798x_phy_led_hw_control_get, }, + { + PHY_ID_MATCH_EXACT(MTK_GPHY_ID_AN7581), + .name = "Airoha AN7581 PHY", + .probe = an7581_phy_probe, + .led_blink_set = mt798x_phy_led_blink_set, + .led_brightness_set = mt798x_phy_led_brightness_set, + .led_hw_is_supported = mt798x_phy_led_hw_is_supported, + .led_hw_control_set = mt798x_phy_led_hw_control_set, + .led_hw_control_get = mt798x_phy_led_hw_control_get, + .led_polarity_set = an7581_phy_led_polarity_set, + }, }; module_phy_driver(mtk_socphy_driver); @@ -1448,6 +1508,7 @@ module_phy_driver(mtk_socphy_driver); static const struct mdio_device_id __maybe_unused mtk_socphy_tbl[] = { { PHY_ID_MATCH_EXACT(MTK_GPHY_ID_MT7981) }, { PHY_ID_MATCH_EXACT(MTK_GPHY_ID_MT7988) }, + { PHY_ID_MATCH_EXACT(MTK_GPHY_ID_AN7581) }, { } }; diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 24882d30f6858..71fb4410c31b1 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -3236,10 +3236,6 @@ static int lan8814_ptp_perout(struct ptp_clock_info *ptpci, int pulse_width; int pin, event; - /* Reject requests with unsupported flags */ - if (rq->perout.flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - mutex_lock(&shared->shared_lock); event = rq->perout.index; pin = ptp_find_pin(shared->ptp_clock, PTP_PF_PEROUT, event); @@ -3406,11 +3402,6 @@ static int lan8814_ptp_extts(struct ptp_clock_info *ptpci, struct phy_device *phydev = shared->phydev; int pin; - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_EXTTS_EDGES | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - pin = ptp_find_pin(shared->ptp_clock, PTP_PF_EXTTS, rq->extts.index); if (pin == -1 || pin != LAN8814_PTP_EXTTS_NUM) @@ -3917,6 +3908,10 @@ static int lan8814_ptp_probe_once(struct phy_device *phydev) shared->ptp_clock_info.n_ext_ts = LAN8814_PTP_EXTTS_NUM; shared->ptp_clock_info.n_pins = LAN8814_PTP_GPIO_NUM; shared->ptp_clock_info.pps = 0; + shared->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + shared->ptp_clock_info.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; shared->ptp_clock_info.pin_config = shared->pin_config; shared->ptp_clock_info.n_per_out = LAN8814_PTP_PEROUT_NUM; shared->ptp_clock_info.adjfine = lan8814_ptpci_adjfine; @@ -5068,9 +5063,6 @@ static int lan8841_ptp_perout(struct ptp_clock_info *ptp, int pin; int ret; - if (rq->perout.flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - pin = ptp_find_pin(ptp_priv->ptp_clock, PTP_PF_PEROUT, rq->perout.index); if (pin == -1 || pin >= LAN8841_PTP_GPIO_NUM) return -EINVAL; @@ -5314,6 +5306,7 @@ static struct ptp_clock_info lan8841_ptp_clock_info = { .n_per_out = LAN8841_PTP_GPIO_NUM, .n_ext_ts = LAN8841_PTP_GPIO_NUM, .n_pins = LAN8841_PTP_GPIO_NUM, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE, }; #define LAN8841_OPERATION_MODE_STRAP_LOW_REGISTER 3 diff --git a/drivers/net/phy/microchip_rds_ptp.c b/drivers/net/phy/microchip_rds_ptp.c index 3e6bf10cdeed9..e6514ce04c29f 100644 --- a/drivers/net/phy/microchip_rds_ptp.c +++ b/drivers/net/phy/microchip_rds_ptp.c @@ -224,10 +224,6 @@ static int mchp_rds_ptp_perout(struct ptp_clock_info *ptpci, struct phy_device *phydev = clock->phydev; int ret, event_pin, pulsewidth; - /* Reject requests with unsupported flags */ - if (perout->flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - event_pin = ptp_find_pin(clock->ptp_clock, PTP_PF_PEROUT, perout->index); if (event_pin != clock->event_pin) @@ -1259,6 +1255,7 @@ struct mchp_rds_ptp_clock *mchp_rds_ptp_probe(struct phy_device *phydev, u8 mmd, clock->caps.pps = 0; clock->caps.n_pins = MCHP_RDS_PTP_N_PIN; clock->caps.n_per_out = MCHP_RDS_PTP_N_PEROUT; + clock->caps.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; clock->caps.pin_config = clock->pin_config; clock->caps.adjfine = mchp_rds_ptp_ltc_adjfine; clock->caps.adjtime = mchp_rds_ptp_ltc_adjtime; diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 250a018d55468..f11dd32494c30 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -763,9 +763,6 @@ static int nxp_c45_perout_enable(struct nxp_c45_phy *priv, struct phy_device *phydev = priv->phydev; int pin; - if (perout->flags & ~PTP_PEROUT_PHASE) - return -EOPNOTSUPP; - pin = ptp_find_pin(priv->ptp_clock, PTP_PF_PEROUT, perout->index); if (pin < 0) return pin; @@ -861,12 +858,6 @@ static int nxp_c45_extts_enable(struct nxp_c45_phy *priv, const struct nxp_c45_phy_data *data = nxp_c45_get_data(priv->phydev); int pin; - if (extts->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Sampling on both edges is not supported */ if ((extts->flags & PTP_RISING_EDGE) && (extts->flags & PTP_FALLING_EDGE) && @@ -962,6 +953,10 @@ static int nxp_c45_init_ptp_clock(struct nxp_c45_phy *priv) .n_pins = ARRAY_SIZE(nxp_c45_ptp_pins), .n_ext_ts = 1, .n_per_out = 1, + .supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS, + .supported_perout_flags = PTP_PEROUT_PHASE, }; priv->ptp_clock = ptp_clock_register(&priv->caps, diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index cc1bfd22fb812..f85c172c446c5 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2975,6 +2975,21 @@ int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev, } EXPORT_SYMBOL_GPL(phy_get_tx_amplitude_gain); +/** + * phy_get_mac_termination - stores MAC termination in @val + * @phydev: phy_device struct + * @dev: pointer to the devices device struct + * @val: MAC termination + * + * Returns: 0 on success, < 0 on failure + */ +int phy_get_mac_termination(struct phy_device *phydev, struct device *dev, + u32 *val) +{ + return phy_get_u32_property(dev, "mac-termination-ohms", val); +} +EXPORT_SYMBOL_GPL(phy_get_mac_termination); + static int phy_led_set_brightness(struct led_classdev *led_cdev, enum led_brightness value) { @@ -3235,18 +3250,6 @@ struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) } EXPORT_SYMBOL(fwnode_phy_find_device); -/** - * device_phy_find_device - For the given device, get the phy_device - * @dev: Pointer to the given device - * - * Refer return conditions of fwnode_phy_find_device(). - */ -struct phy_device *device_phy_find_device(struct device *dev) -{ - return fwnode_phy_find_device(dev_fwnode(dev)); -} -EXPORT_SYMBOL_GPL(device_phy_find_device); - /** * fwnode_get_phy_node - Get the phy_node using the named reference. * @fwnode: Pointer to fwnode from which phy_node has to be obtained. diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 53463767cc43e..def84e87e05b2 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1131,6 +1131,8 @@ static const struct file_operations ppp_device_fops = { .llseek = noop_llseek, }; +static void ppp_nl_dellink(struct net_device *dev, struct list_head *head); + static __net_init int ppp_init_net(struct net *net) { struct ppp_net *pn = net_generic(net, ppp_net_id); @@ -1146,28 +1148,20 @@ static __net_init int ppp_init_net(struct net *net) return 0; } -static __net_exit void ppp_exit_net(struct net *net) +static __net_exit void ppp_exit_rtnl_net(struct net *net, + struct list_head *dev_to_kill) { struct ppp_net *pn = net_generic(net, ppp_net_id); - struct net_device *dev; - struct net_device *aux; struct ppp *ppp; - LIST_HEAD(list); int id; - rtnl_lock(); - for_each_netdev_safe(net, dev, aux) { - if (dev->netdev_ops == &ppp_netdev_ops) - unregister_netdevice_queue(dev, &list); - } - idr_for_each_entry(&pn->units_idr, ppp, id) - /* Skip devices already unregistered by previous loop */ - if (!net_eq(dev_net(ppp->dev), net)) - unregister_netdevice_queue(ppp->dev, &list); + ppp_nl_dellink(ppp->dev, dev_to_kill); +} - unregister_netdevice_many(&list); - rtnl_unlock(); +static __net_exit void ppp_exit_net(struct net *net) +{ + struct ppp_net *pn = net_generic(net, ppp_net_id); mutex_destroy(&pn->all_ppp_mutex); idr_destroy(&pn->units_idr); @@ -1177,6 +1171,7 @@ static __net_exit void ppp_exit_net(struct net *net) static struct pernet_operations ppp_net_ops = { .init = ppp_init_net, + .exit_rtnl = ppp_exit_rtnl_net, .exit = ppp_exit_net, .id = &ppp_net_id, .size = sizeof(struct ppp_net), diff --git a/drivers/net/usb/asix.h b/drivers/net/usb/asix.h index 74162190bccc1..8531b804021aa 100644 --- a/drivers/net/usb/asix.h +++ b/drivers/net/usb/asix.h @@ -224,7 +224,6 @@ int asix_write_rx_ctl(struct usbnet *dev, u16 mode, int in_pm); u16 asix_read_medium_status(struct usbnet *dev, int in_pm); int asix_write_medium_mode(struct usbnet *dev, u16 mode, int in_pm); -void asix_adjust_link(struct net_device *netdev); int asix_write_gpio(struct usbnet *dev, u16 value, int sleep, int in_pm); diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index 72ffc89b477ad..7fd763917ae2c 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -414,28 +414,6 @@ int asix_write_medium_mode(struct usbnet *dev, u16 mode, int in_pm) return ret; } -/* set MAC link settings according to information from phylib */ -void asix_adjust_link(struct net_device *netdev) -{ - struct phy_device *phydev = netdev->phydev; - struct usbnet *dev = netdev_priv(netdev); - u16 mode = 0; - - if (phydev->link) { - mode = AX88772_MEDIUM_DEFAULT; - - if (phydev->duplex == DUPLEX_HALF) - mode &= ~AX_MEDIUM_FD; - - if (phydev->speed != SPEED_100) - mode &= ~AX_MEDIUM_PS; - } - - asix_write_medium_mode(dev, mode, 0); - phy_print_status(phydev); - usbnet_link_change(dev, phydev->link, 0); -} - int asix_write_gpio(struct usbnet *dev, u16 value, int sleep, int in_pm) { int ret; diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index da24941a6e444..9b0318fb50b55 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -752,7 +752,6 @@ static void ax88772_mac_link_down(struct phylink_config *config, struct usbnet *dev = netdev_priv(to_net_dev(config->dev)); asix_write_medium_mode(dev, 0, 0); - usbnet_link_change(dev, false, false); } static void ax88772_mac_link_up(struct phylink_config *config, @@ -783,7 +782,6 @@ static void ax88772_mac_link_up(struct phylink_config *config, m |= AX_MEDIUM_RFC; asix_write_medium_mode(dev, m, 0); - usbnet_link_change(dev, true, false); } static const struct phylink_mac_ops ax88772_phylink_mac_ops = { @@ -1350,10 +1348,9 @@ static const struct driver_info ax88772_info = { .description = "ASIX AX88772 USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, - .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, }; @@ -1362,11 +1359,9 @@ static const struct driver_info ax88772b_info = { .description = "ASIX AX88772B USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, - .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | - FLAG_MULTI_PACKET, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, @@ -1376,11 +1371,9 @@ static const struct driver_info lxausb_t1l_info = { .description = "Linux Automation GmbH USB 10Base-T1L", .bind = ax88772_bind, .unbind = ax88772_unbind, - .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | - FLAG_MULTI_PACKET, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, @@ -1412,10 +1405,8 @@ static const struct driver_info hg20f9_info = { .description = "HG20F9 USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, - .status = asix_status, .reset = ax88772_reset, - .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | - FLAG_MULTI_PACKET, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 9ccc3f09f71b8..a56d7239b1273 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -63,8 +64,12 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan); static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); -/* salt for hash table */ -static u32 vxlan_salt __read_mostly; +static const struct rhashtable_params vxlan_fdb_rht_params = { + .head_offset = offsetof(struct vxlan_fdb, rhnode), + .key_offset = offsetof(struct vxlan_fdb, key), + .key_len = sizeof(struct vxlan_fdb_key), + .automatic_shrinking = true, +}; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { @@ -186,7 +191,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, } else if (nh) { ndm->ndm_family = nh_family; } - send_eth = !is_zero_ether_addr(fdb->eth_addr); + send_eth = !is_zero_ether_addr(fdb->key.eth_addr); } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; @@ -201,7 +206,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, peernet2id(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; - if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) + if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.eth_addr)) goto nla_put_failure; if (nh) { if (nla_put_u32(skb, NDA_NH_ID, nh_id)) @@ -223,9 +228,9 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, goto nla_put_failure; } - if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && + if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->key.vni && nla_put_u32(skb, NDA_SRC_VNI, - be32_to_cpu(fdb->vni))) + be32_to_cpu(fdb->key.vni))) goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - READ_ONCE(fdb->used)); @@ -293,8 +298,8 @@ static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan, fdb_info->remote_port = rd->remote_port; fdb_info->remote_vni = rd->remote_vni; fdb_info->remote_ifindex = rd->remote_ifindex; - memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN); - fdb_info->vni = fdb->vni; + memcpy(fdb_info->eth_addr, fdb->key.eth_addr, ETH_ALEN); + fdb_info->vni = fdb->key.vni; fdb_info->offloaded = rd->offloaded; fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER; } @@ -366,67 +371,42 @@ static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) }; struct vxlan_rdst remote = { }; - memcpy(f.eth_addr, eth_addr, ETH_ALEN); + memcpy(f.key.eth_addr, eth_addr, ETH_ALEN); vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } -/* Hash Ethernet address */ -static u32 eth_hash(const unsigned char *addr) -{ - u64 value = get_unaligned((u64 *)addr); - - /* only want 6 bytes */ -#ifdef __BIG_ENDIAN - value >>= 16; -#else - value <<= 16; -#endif - return hash_64(value, FDB_HASH_BITS); -} - -u32 eth_vni_hash(const unsigned char *addr, __be32 vni) +/* Look up Ethernet address in forwarding table */ +static struct vxlan_fdb *vxlan_find_mac_rcu(struct vxlan_dev *vxlan, + const u8 *mac, __be32 vni) { - /* use 1 byte of OUI and 3 bytes of NIC */ - u32 key = get_unaligned((u32 *)(addr + 2)); - - return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); -} + struct vxlan_fdb_key key; -u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) -{ - if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) - return eth_vni_hash(mac, vni); + memset(&key, 0, sizeof(key)); + memcpy(key.eth_addr, mac, sizeof(key.eth_addr)); + if (!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) + key.vni = vxlan->default_dst.remote_vni; else - return eth_hash(mac); -} + key.vni = vni; -/* Hash chain to use given mac address */ -static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) -{ - return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)]; + return rhashtable_lookup(&vxlan->fdb_hash_tbl, &key, + vxlan_fdb_rht_params); } -/* Look up Ethernet address in forwarding table */ -static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) +static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, + const u8 *mac, __be32 vni) { - struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); struct vxlan_fdb *f; - hlist_for_each_entry_rcu(f, head, hlist) { - if (ether_addr_equal(mac, f->eth_addr)) { - if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { - if (vni == f->vni) - return f; - } else { - return f; - } - } + f = vxlan_find_mac_rcu(vxlan, mac, vni); + if (f) { + unsigned long now = jiffies; + + if (READ_ONCE(f->used) != now) + WRITE_ONCE(f->used, now); } - return NULL; + return f; } static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, @@ -434,13 +414,11 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, { struct vxlan_fdb *f; - f = __vxlan_find_mac(vxlan, mac, vni); - if (f) { - unsigned long now = jiffies; + lockdep_assert_held_once(&vxlan->hash_lock); - if (READ_ONCE(f->used) != now) - WRITE_ONCE(f->used, now); - } + rcu_read_lock(); + f = vxlan_find_mac_rcu(vxlan, mac, vni); + rcu_read_unlock(); return f; } @@ -480,7 +458,7 @@ int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, rcu_read_lock(); - f = __vxlan_find_mac(vxlan, eth_addr, vni); + f = vxlan_find_mac_rcu(vxlan, eth_addr, vni); if (!f) { rc = -ENOENT; goto out; @@ -517,32 +495,28 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; - unsigned int h; int rc = 0; if (!netif_is_vxlan(dev)) return -EINVAL; vxlan = netdev_priv(dev); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_bh(&vxlan->hash_lock[h]); - hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { - if (f->vni == vni) { - list_for_each_entry(rdst, &f->remotes, list) { - rc = vxlan_fdb_notify_one(nb, vxlan, - f, rdst, - extack); - if (rc) - goto unlock; - } + spin_lock_bh(&vxlan->hash_lock); + hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { + if (f->key.vni == vni) { + list_for_each_entry(rdst, &f->remotes, list) { + rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst, + extack); + if (rc) + goto unlock; } } - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); return 0; unlock: - spin_unlock_bh(&vxlan->hash_lock[h]); + spin_unlock_bh(&vxlan->hash_lock); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_replay); @@ -552,20 +526,19 @@ void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; - unsigned int h; if (!netif_is_vxlan(dev)) return; vxlan = netdev_priv(dev); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_bh(&vxlan->hash_lock[h]); - hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) - if (f->vni == vni) - list_for_each_entry(rdst, &f->remotes, list) - rdst->offloaded = false; - spin_unlock_bh(&vxlan->hash_lock[h]); + spin_lock_bh(&vxlan->hash_lock); + hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { + if (f->key.vni == vni) { + list_for_each_entry(rdst, &f->remotes, list) + rdst->offloaded = false; + } } + spin_unlock_bh(&vxlan->hash_lock); } EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload); @@ -610,10 +583,10 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, if (rd == NULL) return -ENOMEM; - if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { - kfree(rd); - return -ENOMEM; - } + /* The driver can work correctly without a dst cache, so do not treat + * dst cache initialization errors as fatal. + */ + dst_cache_init(&rd->dst_cache, GFP_ATOMIC | __GFP_NOWARN); rd->remote_ip = *ip; rd->remote_port = port; @@ -803,27 +776,20 @@ static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return NULL; + memset(&f->key, 0, sizeof(f->key)); f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; - f->vni = src_vni; + f->key.vni = src_vni; f->nh = NULL; RCU_INIT_POINTER(f->vdev, vxlan); INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); - memcpy(f->eth_addr, mac, ETH_ALEN); + memcpy(f->key.eth_addr, mac, ETH_ALEN); return f; } -static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, - __be32 src_vni, struct vxlan_fdb *f) -{ - ++vxlan->addrcnt; - hlist_add_head_rcu(&f->hlist, - vxlan_fdb_head(vxlan, mac, src_vni)); -} - static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, u32 nhid, struct netlink_ext_ack *extack) { @@ -913,10 +879,27 @@ int vxlan_fdb_create(struct vxlan_dev *vxlan, if (rc < 0) goto errout; + rc = rhashtable_lookup_insert_fast(&vxlan->fdb_hash_tbl, &f->rhnode, + vxlan_fdb_rht_params); + if (rc) + goto destroy_remote; + + ++vxlan->addrcnt; + hlist_add_head_rcu(&f->fdb_node, &vxlan->fdb_list); + *fdb = f; return 0; +destroy_remote: + if (rcu_access_pointer(f->nh)) { + list_del_rcu(&f->nh_list); + nexthop_put(rtnl_dereference(f->nh)); + } else { + list_del(&rd->list); + dst_cache_destroy(&rd->dst_cache); + kfree(rd); + } errout: kfree(f); return rc; @@ -953,7 +936,7 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, { struct vxlan_rdst *rd; - netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); + netdev_dbg(vxlan->dev, "delete %pM\n", f->key.eth_addr); --vxlan->addrcnt; if (do_notify) { @@ -966,7 +949,9 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, swdev_notify, NULL); } - hlist_del_rcu(&f->hlist); + hlist_del_init_rcu(&f->fdb_node); + rhashtable_remove_fast(&vxlan->fdb_hash_tbl, &f->rhnode, + vxlan_fdb_rht_params); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); } @@ -1024,8 +1009,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, if ((flags & NLM_F_REPLACE)) { /* Only change unicasts */ - if (!(is_multicast_ether_addr(f->eth_addr) || - is_zero_ether_addr(f->eth_addr))) { + if (!(is_multicast_ether_addr(f->key.eth_addr) || + is_zero_ether_addr(f->key.eth_addr))) { if (nhid) { rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); if (rc < 0) @@ -1041,8 +1026,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, } } if ((flags & NLM_F_APPEND) && - (is_multicast_ether_addr(f->eth_addr) || - is_zero_ether_addr(f->eth_addr))) { + (is_multicast_ether_addr(f->key.eth_addr) || + is_zero_ether_addr(f->key.eth_addr))) { rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) @@ -1101,7 +1086,6 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, if (rc < 0) return rc; - vxlan_fdb_insert(vxlan, mac, src_vni, f); rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, swdev_notify, extack); if (rc) @@ -1125,7 +1109,7 @@ int vxlan_fdb_update(struct vxlan_dev *vxlan, { struct vxlan_fdb *f; - f = __vxlan_find_mac(vxlan, mac, src_vni); + f = vxlan_find_mac(vxlan, mac, src_vni); if (f) { if (flags & NLM_F_EXCL) { netdev_dbg(vxlan->dev, @@ -1253,7 +1237,6 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], __be16 port; __be32 src_vni, vni; u32 ifindex, nhid; - u32 hash_index; int err; if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { @@ -1273,13 +1256,12 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) return -EAFNOSUPPORT; - hash_index = fdb_head_index(vxlan, addr, src_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, nhid, true, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (!err) *notified = true; @@ -1296,7 +1278,7 @@ int __vxlan_fdb_delete(struct vxlan_dev *vxlan, struct vxlan_fdb *f; int err = -ENOENT; - f = __vxlan_find_mac(vxlan, addr, src_vni); + f = vxlan_find_mac(vxlan, addr, src_vni); if (!f) return err; @@ -1330,7 +1312,6 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], union vxlan_addr ip; __be32 src_vni, vni; u32 ifindex, nhid; - u32 hash_index; __be16 port; int err; @@ -1339,11 +1320,10 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], if (err) return err; - hash_index = fdb_head_index(vxlan, addr, src_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (!err) *notified = true; @@ -1358,52 +1338,46 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, { struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct vxlan_dev *vxlan = netdev_priv(dev); - unsigned int h; + struct vxlan_fdb *f; int err = 0; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct vxlan_fdb *f; - - rcu_read_lock(); - hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { - struct vxlan_rdst *rd; - - if (rcu_access_pointer(f->nh)) { - if (*idx < ctx->fdb_idx) - goto skip_nh; - err = vxlan_fdb_info(skb, vxlan, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, NULL); - if (err < 0) { - rcu_read_unlock(); - goto out; - } -skip_nh: - *idx += 1; - continue; + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { + struct vxlan_rdst *rd; + + if (rcu_access_pointer(f->nh)) { + if (*idx < ctx->fdb_idx) + goto skip_nh; + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, NULL); + if (err < 0) { + rcu_read_unlock(); + goto out; } +skip_nh: + *idx += 1; + continue; + } - list_for_each_entry_rcu(rd, &f->remotes, list) { - if (*idx < ctx->fdb_idx) - goto skip; - - err = vxlan_fdb_info(skb, vxlan, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, rd); - if (err < 0) { - rcu_read_unlock(); - goto out; - } -skip: - *idx += 1; + list_for_each_entry_rcu(rd, &f->remotes, list) { + if (*idx < ctx->fdb_idx) + goto skip; + + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, rd); + if (err < 0) { + rcu_read_unlock(); + goto out; } +skip: + *idx += 1; } - rcu_read_unlock(); } + rcu_read_unlock(); out: return err; } @@ -1427,7 +1401,7 @@ static int vxlan_fdb_get(struct sk_buff *skb, rcu_read_lock(); - f = __vxlan_find_mac(vxlan, addr, vni); + f = vxlan_find_mac_rcu(vxlan, addr, vni); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; @@ -1463,7 +1437,7 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, ifindex = src_ifindex; #endif - f = __vxlan_find_mac(vxlan, src_mac, vni); + f = vxlan_find_mac_rcu(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); unsigned long now = jiffies; @@ -1491,10 +1465,8 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, rdst->remote_ip = *src_ip; vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { - u32 hash_index = fdb_head_index(vxlan, src_mac, vni); - /* learned new entry */ - spin_lock(&vxlan->hash_lock[hash_index]); + spin_lock(&vxlan->hash_lock); /* close off race between vxlan_flush and incoming packets */ if (netif_running(dev)) @@ -1505,7 +1477,7 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, vni, vxlan->default_dst.remote_vni, ifindex, NTF_SELF, 0, true, NULL); - spin_unlock(&vxlan->hash_lock[hash_index]); + spin_unlock(&vxlan->hash_lock); } return SKB_NOT_DROPPED_YET; @@ -1916,12 +1888,15 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; } - f = vxlan_find_mac(vxlan, n->ha, vni); + rcu_read_lock(); + f = vxlan_find_mac_tx(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); + rcu_read_unlock(); goto out; } + rcu_read_unlock(); reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, n->ha, sha); @@ -2080,7 +2055,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; } - f = vxlan_find_mac(vxlan, n->ha, vni); + f = vxlan_find_mac_tx(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); @@ -2648,14 +2623,10 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); - rcu_read_lock(); nh = rcu_dereference(f->nh); - if (!nh) { - rcu_read_unlock(); + if (!nh) goto drop; - } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); - rcu_read_unlock(); if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); @@ -2782,7 +2753,8 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } eth = eth_hdr(skb); - f = vxlan_find_mac(vxlan, eth->h_dest, vni); + rcu_read_lock(); + f = vxlan_find_mac_tx(vxlan, eth->h_dest, vni); did_rsc = false; if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) && @@ -2790,11 +2762,11 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) ntohs(eth->h_proto) == ETH_P_IPV6)) { did_rsc = route_shortcircuit(dev, skb); if (did_rsc) - f = vxlan_find_mac(vxlan, eth->h_dest, vni); + f = vxlan_find_mac_tx(vxlan, eth->h_dest, vni); } if (f == NULL) { - f = vxlan_find_mac(vxlan, all_zeros_mac, vni); + f = vxlan_find_mac_tx(vxlan, all_zeros_mac, vni); if (f == NULL) { if ((vxlan->cfg.flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) @@ -2804,7 +2776,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); - return NETDEV_TX_OK; + goto out; } } @@ -2829,6 +2801,8 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); } +out: + rcu_read_unlock(); return NETDEV_TX_OK; } @@ -2837,38 +2811,36 @@ static void vxlan_cleanup(struct timer_list *t) { struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; - unsigned int h; + struct vxlan_fdb *f; if (!netif_running(vxlan->dev)) return; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct hlist_node *p, *n; - - spin_lock(&vxlan->hash_lock[h]); - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { - struct vxlan_fdb *f - = container_of(p, struct vxlan_fdb, hlist); - unsigned long timeout; + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { + unsigned long timeout; - if (f->state & (NUD_PERMANENT | NUD_NOARP)) - continue; + if (f->state & (NUD_PERMANENT | NUD_NOARP)) + continue; - if (f->flags & NTF_EXT_LEARNED) - continue; + if (f->flags & NTF_EXT_LEARNED) + continue; - timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; - if (time_before_eq(timeout, jiffies)) { - netdev_dbg(vxlan->dev, - "garbage collect %pM\n", - f->eth_addr); + timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; + if (time_before_eq(timeout, jiffies)) { + spin_lock(&vxlan->hash_lock); + if (!hlist_unhashed(&f->fdb_node)) { + netdev_dbg(vxlan->dev, "garbage collect %pM\n", + f->key.eth_addr); f->state = NUD_STALE; vxlan_fdb_destroy(vxlan, f, true, true); - } else if (time_before(timeout, next_timer)) - next_timer = timeout; + } + spin_unlock(&vxlan->hash_lock); + } else if (time_before(timeout, next_timer)) { + next_timer = timeout; } - spin_unlock(&vxlan->hash_lock[h]); } + rcu_read_unlock(); mod_timer(&vxlan->age_timer, next_timer); } @@ -2903,10 +2875,14 @@ static int vxlan_init(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; + err = rhashtable_init(&vxlan->fdb_hash_tbl, &vxlan_fdb_rht_params); + if (err) + return err; + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { err = vxlan_vnigroup_init(vxlan); if (err) - return err; + goto err_rhashtable_destroy; } err = gro_cells_init(&vxlan->gro_cells, dev); @@ -2925,21 +2901,11 @@ static int vxlan_init(struct net_device *dev) err_vnigroup_uninit: if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); +err_rhashtable_destroy: + rhashtable_destroy(&vxlan->fdb_hash_tbl); return err; } -static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) -{ - struct vxlan_fdb *f; - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); - f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); - if (f) - vxlan_fdb_destroy(vxlan, f, true, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); -} - static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -2951,7 +2917,7 @@ static void vxlan_uninit(struct net_device *dev) gro_cells_destroy(&vxlan->gro_cells); - vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); + rhashtable_destroy(&vxlan->fdb_hash_tbl); } /* Start ageing timer and join group when device is brought up */ @@ -2992,7 +2958,8 @@ struct vxlan_fdb_flush_desc { static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan) { - return is_zero_ether_addr(f->eth_addr) && f->vni == vxlan->cfg.vni; + return is_zero_ether_addr(f->key.eth_addr) && + f->key.vni == vxlan->cfg.vni; } static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid) @@ -3015,7 +2982,7 @@ static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f, if (desc->ignore_default_entry && vxlan_fdb_is_default_entry(f, vxlan)) return false; - if (desc->src_vni && f->vni != desc->src_vni) + if (desc->src_vni && f->key.vni != desc->src_vni) return false; if (desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid)) @@ -3071,33 +3038,32 @@ static void vxlan_flush(struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); - unsigned int h; - - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct hlist_node *p, *n; + struct vxlan_fdb *f; - spin_lock_bh(&vxlan->hash_lock[h]); - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { - struct vxlan_fdb *f - = container_of(p, struct vxlan_fdb, hlist); + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { + if (!vxlan_fdb_flush_matches(f, vxlan, desc)) + continue; - if (!vxlan_fdb_flush_matches(f, vxlan, desc)) - continue; + spin_lock_bh(&vxlan->hash_lock); + if (hlist_unhashed(&f->fdb_node)) + goto unlock; - if (match_remotes) { - bool destroy_fdb = false; + if (match_remotes) { + bool destroy_fdb = false; - vxlan_fdb_flush_match_remotes(f, vxlan, desc, - &destroy_fdb); + vxlan_fdb_flush_match_remotes(f, vxlan, desc, + &destroy_fdb); - if (!destroy_fdb) - continue; - } - - vxlan_fdb_destroy(vxlan, f, true, true); + if (!destroy_fdb) + goto unlock; } - spin_unlock_bh(&vxlan->hash_lock[h]); + + vxlan_fdb_destroy(vxlan, f, true, true); +unlock: + spin_unlock_bh(&vxlan->hash_lock); } + rcu_read_unlock(); } static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = { @@ -3185,7 +3151,7 @@ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = { - /* Default entry is deleted at vxlan_uninit. */ + /* Default entry is deleted at vxlan_dellink. */ .ignore_default_entry = true, .state = 0, .state_mask = NUD_PERMANENT | NUD_NOARP, @@ -3348,7 +3314,6 @@ static void vxlan_offload_rx_ports(struct net_device *dev, bool push) static void vxlan_setup(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - unsigned int h; eth_hw_addr_random(dev); ether_setup(dev); @@ -3375,15 +3340,13 @@ static void vxlan_setup(struct net_device *dev) dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; INIT_LIST_HEAD(&vxlan->next); + spin_lock_init(&vxlan->hash_lock); timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); vxlan->dev = dev; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_init(&vxlan->hash_lock[h]); - INIT_HLIST_HEAD(&vxlan->fdb_head[h]); - } + INIT_HLIST_HEAD(&vxlan->fdb_list); } static void vxlan_ether_setup(struct net_device *dev) @@ -3960,8 +3923,6 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *remote_dev = NULL; - struct vxlan_fdb *f = NULL; - bool unregister = false; struct vxlan_rdst *dst; int err; @@ -3972,72 +3933,54 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dev->ethtool_ops = &vxlan_ethtool_ops; - /* create an fdb entry for a valid default destination */ - if (!vxlan_addr_any(&dst->remote_ip)) { - err = vxlan_fdb_create(vxlan, all_zeros_mac, - &dst->remote_ip, - NUD_REACHABLE | NUD_PERMANENT, - vxlan->cfg.dst_port, - dst->remote_vni, - dst->remote_vni, - dst->remote_ifindex, - NTF_SELF, 0, &f, extack); - if (err) - return err; - } - err = register_netdevice(dev); if (err) - goto errout; - unregister = true; + return err; if (dst->remote_ifindex) { remote_dev = __dev_get_by_index(net, dst->remote_ifindex); if (!remote_dev) { err = -ENODEV; - goto errout; + goto unregister; } err = netdev_upper_dev_link(remote_dev, dev, extack); if (err) - goto errout; + goto unregister; + + dst->remote_dev = remote_dev; } err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto unlink; - if (f) { - vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f); - - /* notify default fdb entry */ - err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), - RTM_NEWNEIGH, true, extack); - if (err) { - vxlan_fdb_destroy(vxlan, f, false, false); - if (remote_dev) - netdev_upper_dev_unlink(remote_dev, dev); - goto unregister; - } + /* create an fdb entry for a valid default destination */ + if (!vxlan_addr_any(&dst->remote_ip)) { + spin_lock_bh(&vxlan->hash_lock); + err = vxlan_fdb_update(vxlan, all_zeros_mac, + &dst->remote_ip, + NUD_REACHABLE | NUD_PERMANENT, + NLM_F_EXCL | NLM_F_CREATE, + vxlan->cfg.dst_port, + dst->remote_vni, + dst->remote_vni, + dst->remote_ifindex, + NTF_SELF, 0, true, extack); + spin_unlock_bh(&vxlan->hash_lock); + if (err) + goto unlink; } list_add(&vxlan->next, &vn->vxlan_list); - if (remote_dev) - dst->remote_dev = remote_dev; + return 0; + unlink: if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); -errout: - /* unregister_netdevice() destroys the default FDB entry with deletion - * notification. But the addition notification was not sent yet, so - * destroy the entry by hand here. - */ - if (f) - __vxlan_fdb_free(f); unregister: - if (unregister) - unregister_netdevice(dev); + unregister_netdevice(dev); return err; } @@ -4454,9 +4397,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], /* handle default dst entry */ if (rem_ip_changed) { - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (!vxlan_addr_any(&conf.remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, &conf.remote_ip, @@ -4467,7 +4408,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], conf.remote_ifindex, NTF_SELF, 0, true, extack); if (err) { - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; @@ -4481,7 +4422,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], dst->remote_vni, dst->remote_ifindex, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); /* If vni filtering device, also update fdb entries of * all vnis that were using default remote ip @@ -4518,10 +4459,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_fdb_flush_desc desc = { - /* Default entry is deleted at vxlan_uninit. */ - .ignore_default_entry = true, - }; + struct vxlan_fdb_flush_desc desc = {}; vxlan_flush(vxlan, &desc); @@ -4784,13 +4722,10 @@ vxlan_fdb_offloaded_set(struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst; struct vxlan_fdb *f; - u32 hash_index; - - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); - f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); + f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) goto out; @@ -4804,7 +4739,7 @@ vxlan_fdb_offloaded_set(struct net_device *dev, rdst->offloaded = fdb_info->offloaded; out: - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); } static int @@ -4813,13 +4748,11 @@ vxlan_fdb_external_learn_add(struct net_device *dev, { struct vxlan_dev *vxlan = netdev_priv(dev); struct netlink_ext_ack *extack; - u32 hash_index; int err; - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); extack = switchdev_notifier_info_to_extack(&fdb_info->info); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip, NUD_REACHABLE, NLM_F_CREATE | NLM_F_REPLACE, @@ -4829,7 +4762,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, 0, false, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } @@ -4840,13 +4773,11 @@ vxlan_fdb_external_learn_del(struct net_device *dev, { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; - u32 hash_index; int err = 0; - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); - f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); + f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) err = -ENOENT; else if (f->flags & NTF_EXT_LEARNED) @@ -4858,7 +4789,7 @@ vxlan_fdb_external_learn_del(struct net_device *dev, fdb_info->remote_ifindex, false); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } @@ -4907,18 +4838,15 @@ static void vxlan_fdb_nh_flush(struct nexthop *nh) { struct vxlan_fdb *fdb; struct vxlan_dev *vxlan; - u32 hash_index; rcu_read_lock(); list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) { vxlan = rcu_dereference(fdb->vdev); WARN_ON(!vxlan); - hash_index = fdb_head_index(vxlan, fdb->eth_addr, - vxlan->default_dst.remote_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); - if (!hlist_unhashed(&fdb->hlist)) + spin_lock_bh(&vxlan->hash_lock); + if (!hlist_unhashed(&fdb->fdb_node)) vxlan_fdb_destroy(vxlan, fdb, false, false); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); } rcu_read_unlock(); } @@ -4966,19 +4894,15 @@ static void __net_exit vxlan_destroy_tunnels(struct vxlan_net *vn, vxlan_dellink(vxlan->dev, dev_to_kill); } -static void __net_exit vxlan_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit vxlan_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_net *vn = net_generic(net, vxlan_net_id); - __unregister_nexthop_notifier(net, &vn->nexthop_notifier_block); + ASSERT_RTNL_NET(net); - vxlan_destroy_tunnels(vn, dev_to_kill); - } + __unregister_nexthop_notifier(net, &vn->nexthop_notifier_block); + vxlan_destroy_tunnels(vn, dev_to_kill); } static void __net_exit vxlan_exit_net(struct net *net) @@ -4992,7 +4916,7 @@ static void __net_exit vxlan_exit_net(struct net *net) static struct pernet_operations vxlan_net_ops = { .init = vxlan_init_net, - .exit_batch_rtnl = vxlan_exit_batch_rtnl, + .exit_rtnl = vxlan_exit_rtnl, .exit = vxlan_exit_net, .id = &vxlan_net_id, .size = sizeof(struct vxlan_net), @@ -5002,8 +4926,6 @@ static int __init vxlan_init_module(void) { int rc; - get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); - rc = register_pernet_subsys(&vxlan_net_ops); if (rc) goto out1; diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h index 76a351a997d51..d328aed9feefd 100644 --- a/drivers/net/vxlan/vxlan_private.h +++ b/drivers/net/vxlan/vxlan_private.h @@ -24,18 +24,23 @@ struct vxlan_net { struct notifier_block nexthop_notifier_block; }; +struct vxlan_fdb_key { + u8 eth_addr[ETH_ALEN]; + __be32 vni; +}; + /* Forwarding table entry */ struct vxlan_fdb { - struct hlist_node hlist; /* linked list of entries */ + struct rhash_head rhnode; struct rcu_head rcu; unsigned long updated; /* jiffies */ unsigned long used; struct list_head remotes; - u8 eth_addr[ETH_ALEN]; + struct vxlan_fdb_key key; u16 state; /* see ndm_state */ - __be32 vni; u16 flags; /* see ndm_flags and below */ struct list_head nh_list; + struct hlist_node fdb_node; struct nexthop __rcu *nh; struct vxlan_dev __rcu *vdev; }; diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index 6e6e9f05509ab..336cd0e203094 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -411,13 +411,12 @@ static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb struct tunnel_msg *tmsg; struct net_device *dev; - if (cb->nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct tunnel_msg))) { + tmsg = nlmsg_payload(cb->nlh, sizeof(*tmsg)); + if (!tmsg) { NL_SET_ERR_MSG(cb->extack, "Invalid msg length"); return -EINVAL; } - tmsg = nlmsg_data(cb->nlh); - if (tmsg->flags & ~TUNNEL_MSG_VALID_USER_FLAGS) { NL_SET_ERR_MSG(cb->extack, "Invalid tunnelmsg flags in ancillary header"); return -EINVAL; @@ -483,11 +482,9 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, struct netlink_ext_ack *extack) { struct vxlan_rdst *dst = &vxlan->default_dst; - u32 hash_index; int err = 0; - hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (remote_ip && !vxlan_addr_any(remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, remote_ip, @@ -499,7 +496,7 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, dst->remote_ifindex, NTF_SELF, 0, true, extack); if (err) { - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } } @@ -512,7 +509,7 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, dst->remote_ifindex, true); } - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 07bf7f9aae019..204278eb215e5 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -44,7 +44,7 @@ config PTP_1588_CLOCK_DTE depends on PTP_1588_CLOCK depends on NET && HAS_IOMEM depends on ARCH_BCM_MOBILE || (ARCH_BCM_IPROC && !(ARCH_BCM_NSP || ARCH_BCM_5301X)) || COMPILE_TEST - default y + default y if ARCH_BCM_MOBILE || ARCH_BCM_IPROC help This driver adds support for using the Digital timing engine (DTE) in the Broadcom SoC's as a PTP clock. @@ -59,7 +59,7 @@ config PTP_1588_CLOCK_QORIQ tristate "Freescale QorIQ 1588 timer as PTP clock" depends on GIANFAR || FSL_DPAA_ETH || FSL_DPAA2_ETH || FSL_ENETC || FSL_ENETC_VF || COMPILE_TEST depends on PTP_1588_CLOCK - default y + default y if GIANFAR || FSL_DPAA_ETH || FSL_DPAA2_ETH || FSL_ENETC || FSL_ENETC_VF help This driver adds support for using the Freescale QorIQ 1588 timer as a PTP clock. This clock is only useful if your PTP diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 4380e6ddb8495..4bf421765d033 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -162,6 +162,7 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); + unsigned int i, pin_index, supported_extts_flags; struct ptp_sys_offset_extended *extoff = NULL; struct ptp_sys_offset_precise precise_offset; struct system_device_crosststamp xtstamp; @@ -172,7 +173,6 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, struct ptp_clock_request req; struct ptp_clock_caps caps; struct ptp_clock_time *pct; - unsigned int i, pin_index; struct ptp_pin_desc pd; struct timespec64 ts; int enable, err = 0; @@ -240,6 +240,18 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, err = -EINVAL; break; } + supported_extts_flags = ptp->info->supported_extts_flags; + /* The PTP_ENABLE_FEATURE flag is always supported. */ + supported_extts_flags |= PTP_ENABLE_FEATURE; + /* If the driver does not support strictly checking flags, the + * PTP_RISING_EDGE and PTP_FALLING_EDGE flags are merely + * hints which are not enforced. + */ + if (!(supported_extts_flags & PTP_STRICT_FLAGS)) + supported_extts_flags |= PTP_EXTTS_EDGES; + /* Reject unsupported flags */ + if (req.extts.flags & ~supported_extts_flags) + return -EOPNOTSUPP; req.type = PTP_CLK_REQ_EXTTS; enable = req.extts.flags & PTP_ENABLE_FEATURE ? 1 : 0; if (mutex_lock_interruptible(&ptp->pincfg_mux)) @@ -312,6 +324,8 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, err = -EINVAL; break; } + if (req.perout.flags & ~ptp->info->supported_perout_flags) + return -EOPNOTSUPP; req.type = PTP_CLK_REQ_PEROUT; enable = req.perout.period.sec || req.perout.period.nsec; if (mutex_lock_interruptible(&ptp->pincfg_mux)) diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c index fbb3fa8fc60b2..b8d4df8c6da2c 100644 --- a/drivers/ptp/ptp_clockmatrix.c +++ b/drivers/ptp/ptp_clockmatrix.c @@ -283,18 +283,6 @@ static int idtcm_extts_enable(struct idtcm_channel *channel, idtcm = channel->idtcm; old_mask = idtcm->extts_mask; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - - /* Reject requests to enable time stamping on falling edge */ - if ((rq->extts.flags & PTP_ENABLE_FEATURE) && - (rq->extts.flags & PTP_FALLING_EDGE)) - return -EOPNOTSUPP; - if (index >= MAX_TOD) return -EINVAL; @@ -2043,6 +2031,7 @@ static const struct ptp_clock_info idtcm_caps = { .n_per_out = 12, .n_ext_ts = MAX_TOD, .n_pins = MAX_REF_CLK, + .supported_extts_flags = PTP_RISING_EDGE | PTP_STRICT_FLAGS, .adjphase = &idtcm_adjphase, .getmaxphase = &idtcm_getmaxphase, .adjfine = &idtcm_adjfine, @@ -2060,6 +2049,7 @@ static const struct ptp_clock_info idtcm_caps_deprecated = { .n_per_out = 12, .n_ext_ts = MAX_TOD, .n_pins = MAX_REF_CLK, + .supported_extts_flags = PTP_RISING_EDGE | PTP_STRICT_FLAGS, .adjphase = &idtcm_adjphase, .getmaxphase = &idtcm_getmaxphase, .adjfine = &idtcm_adjfine, diff --git a/drivers/ptp/ptp_fc3.c b/drivers/ptp/ptp_fc3.c index cfced36c70bcb..70002500170ea 100644 --- a/drivers/ptp/ptp_fc3.c +++ b/drivers/ptp/ptp_fc3.c @@ -592,6 +592,7 @@ static const struct ptp_clock_info idtfc3_caps = { .max_adj = MAX_FFO_PPB, .n_per_out = 1, .n_ext_ts = 1, + .supported_extts_flags = PTP_STRICT_FLAGS | PTP_EXT_OFFSET, .adjphase = &idtfc3_adjphase, .adjfine = &idtfc3_adjfine, .adjtime = &idtfc3_adjtime, diff --git a/drivers/ptp/ptp_idt82p33.c b/drivers/ptp/ptp_idt82p33.c index b2fd94d4f8631..f01c50dfa44e8 100644 --- a/drivers/ptp/ptp_idt82p33.c +++ b/drivers/ptp/ptp_idt82p33.c @@ -246,18 +246,6 @@ static int idt82p33_extts_enable(struct idt82p33_channel *channel, idt82p33 = channel->idt82p33; old_mask = idt82p33->extts_mask; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - - /* Reject requests to enable time stamping on falling edge */ - if ((rq->extts.flags & PTP_ENABLE_FEATURE) && - (rq->extts.flags & PTP_FALLING_EDGE)) - return -EOPNOTSUPP; - if (index >= MAX_PHC_PLL) return -EINVAL; @@ -1187,6 +1175,9 @@ static void idt82p33_caps_init(u32 index, struct ptp_clock_info *caps, caps->pin_config = pin_cfg; + caps->supported_extts_flags = PTP_RISING_EDGE | + PTP_STRICT_FLAGS; + for (i = 0; i < max_pins; ++i) { ppd = &pin_cfg[i]; diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 60ed70a39d2cc..b7f15f303ea2d 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -611,7 +611,7 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) ism->dev.parent = &pdev->dev; ism->dev.release = ism_dev_release; device_initialize(&ism->dev); - dev_set_name(&ism->dev, dev_name(&pdev->dev)); + dev_set_name(&ism->dev, "%s", dev_name(&pdev->dev)); ret = device_add(&ism->dev); if (ret) goto err_dev; diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index fc8ba9142f2f0..682bd8ec2c105 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -5,6 +5,7 @@ config AFS_FS select AF_RXRPC select DNS_RESOLVER select NETFS_SUPPORT + select CRYPTO_KRB5 help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 5efd7e13b3043..b49b8fe682f39 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -8,6 +8,7 @@ kafs-y := \ addr_prefs.o \ callback.o \ cell.o \ + cm_security.o \ cmservice.o \ dir.o \ dir_edit.o \ diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c new file mode 100644 index 0000000000000..edcbd249d2024 --- /dev/null +++ b/fs/afs/cm_security.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Cache manager security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include "internal.h" +#include "afs_cm.h" +#include "afs_fs.h" +#include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include + +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32))) + +#ifdef CONFIG_RXGK +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server); +#endif + +/* + * Respond to an RxGK challenge, adding appdata. + */ +static int afs_respond_to_challenge(struct sk_buff *challenge) +{ +#ifdef CONFIG_RXGK + struct krb5_buffer appdata = {}; + struct afs_server *server; +#endif + struct rxrpc_peer *peer; + unsigned long peer_data; + u16 service_id; + u8 security_index; + + rxrpc_kernel_query_challenge(challenge, &peer, &peer_data, + &service_id, &security_index); + + _enter("%u,%u", service_id, security_index); + + switch (service_id) { + /* We don't send CM_SERVICE RPCs, so don't expect a challenge + * therefrom. + */ + case FS_SERVICE: + case VL_SERVICE: + case YFS_FS_SERVICE: + case YFS_VL_SERVICE: + break; + default: + pr_warn("Can't respond to unknown challenge %u:%u", + service_id, security_index); + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } + + switch (security_index) { +#ifdef CONFIG_RXKAD + case RXRPC_SECURITY_RXKAD: + return rxkad_kernel_respond_to_challenge(challenge); +#endif + +#ifdef CONFIG_RXGK + case RXRPC_SECURITY_RXGK: + return rxgk_kernel_respond_to_challenge(challenge, &appdata); + + case RXRPC_SECURITY_YFS_RXGK: + switch (service_id) { + case FS_SERVICE: + case YFS_FS_SERVICE: + server = (struct afs_server *)peer_data; + if (!server->cm_rxgk_appdata.data) { + mutex_lock(&server->cm_token_lock); + if (!server->cm_rxgk_appdata.data) + afs_create_yfs_cm_token(challenge, server); + mutex_unlock(&server->cm_token_lock); + } + if (server->cm_rxgk_appdata.data) + appdata = server->cm_rxgk_appdata; + break; + } + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +#endif + + default: + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } +} + +/* + * Process the OOB message queue, processing challenge packets. + */ +void afs_process_oob_queue(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, rx_oob_work); + struct sk_buff *oob; + enum rxrpc_oob_type type; + + while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) { + switch (type) { + case RXRPC_OOB_CHALLENGE: + afs_respond_to_challenge(oob); + break; + } + rxrpc_kernel_free_oob(oob); + } +} + +#ifdef CONFIG_RXGK +/* + * Create a securities keyring for the cache manager and attach a key to it for + * the RxGK tokens we want to use to secure the callback connection back from + * the fileserver. + */ +int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + const struct krb5_enctype *krb5; + struct key *ring; + key_ref_t key; + char K0[32], *desc; + int ret; + + ring = keyring_alloc("kafs", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_SEARCH | KEY_POS_WRITE | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(ring)) + return PTR_ERR(ring); + + ret = rxrpc_sock_set_security_keyring(socket->sk, ring); + if (ret < 0) + goto out; + + ret = -ENOPKG; + krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (!krb5) + goto out; + + if (WARN_ON_ONCE(krb5->key_len > sizeof(K0))) + goto out; + + ret = -ENOMEM; + desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u", + YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype); + if (!desc) + goto out; + + wait_for_random_bytes(); + get_random_bytes(K0, krb5->key_len); + + key = key_create(make_key_ref(ring, true), + "rxrpc_s", desc, + K0, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + kfree(desc); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto out; + } + + net->fs_cm_token_key = key_ref_to_ptr(key); + ret = 0; +out: + key_put(ring); + return ret; +} + +/* + * Create an YFS RxGK GSS token to use as a ticket to the specified fileserver. + */ +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server) +{ + const struct krb5_enctype *conn_krb5, *token_krb5; + const struct krb5_buffer *token_key; + struct crypto_aead *aead; + struct scatterlist sg; + struct afs_net *net = server->cell->net; + const struct key *key = net->fs_cm_token_key; + size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset; + __be32 caps[1] = { + [0] = htonl(AFS_CAP_ERROR_TRANSLATION), + }; + __be32 *xdr; + void *appdata, *K0, *encbase; + u32 enctype; + int ret; + + if (!key) + return -ENOKEY; + + /* Assume that the fileserver is happy to use the same encoding type as + * we were told to use by the token obtained by the user. + */ + enctype = rxgk_kernel_query_challenge(challenge); + + conn_krb5 = crypto_krb5_find_enctype(enctype); + if (!conn_krb5) + return -ENOPKG; + token_krb5 = key->payload.data[0]; + token_key = (const struct krb5_buffer *)&key->payload.data[2]; + + /* struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + */ + keysize = 4 + xdr_len_object(conn_krb5->key_len); + + /* struct RXGK_AuthName { + * afs_int32 kind; + * opaque data; + * opaque display; + * }; + */ + uuidsize = sizeof(server->uuid); + authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0); + + /* struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ + toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize); + + offset = 0; + encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset); + + /* struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + */ + contsize = 4 + 4 + xdr_len_object(encsize); + + /* struct YFSAppData { + * opr_uuid initiatorUuid; + * opr_uuid acceptorUuid; + * Capabilities caps; + * afs_int32 enctype; + * opaque callbackKey<>; + * opaque callbackToken<>; + * }; + */ + adatasize = 16 + 16 + + xdr_len_object(sizeof(caps)) + + 4 + + xdr_len_object(conn_krb5->key_len) + + xdr_len_object(contsize); + + ret = -ENOMEM; + appdata = kzalloc(adatasize, GFP_KERNEL); + if (!appdata) + goto out; + xdr = appdata; + + memcpy(xdr, &net->uuid, 16); /* appdata.initiatorUuid */ + xdr += 16 / 4; + memcpy(xdr, &server->uuid, 16); /* appdata.acceptorUuid */ + xdr += 16 / 4; + *xdr++ = htonl(ARRAY_SIZE(caps)); /* appdata.caps.len */ + memcpy(xdr, &caps, sizeof(caps)); /* appdata.caps */ + xdr += ARRAY_SIZE(caps); + *xdr++ = htonl(conn_krb5->etype); /* appdata.enctype */ + + *xdr++ = htonl(conn_krb5->key_len); /* appdata.callbackKey.len */ + K0 = xdr; + get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(contsize); /* appdata.callbackToken.len */ + *xdr++ = htonl(1); /* cont.kvno */ + *xdr++ = htonl(token_krb5->etype); /* cont.enctype */ + *xdr++ = htonl(encsize); /* cont.encrypted_token.len */ + + encbase = xdr; + xdr += offset / 4; + *xdr++ = htonl(conn_krb5->etype); /* token.K0.enctype */ + *xdr++ = htonl(conn_krb5->key_len); /* token.K0.key.len */ + memcpy(xdr, K0, conn_krb5->key_len); /* token.K0.key.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(RXRPC_SECURITY_ENCRYPT); /* token.level */ + *xdr++ = htonl(0); /* token.starttime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(0); /* token.lifetime */ + *xdr++ = htonl(0); /* token.bytelife */ + *xdr++ = htonl(0); /* token.expirationtime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(1); /* token.identities.count */ + *xdr++ = htonl(0); /* token.identities[0].kind */ + *xdr++ = htonl(uuidsize); /* token.identities[0].data.len */ + memcpy(xdr, &server->uuid, uuidsize); + xdr += xdr_round_up(uuidsize) / 4; + *xdr++ = htonl(0); /* token.identities[0].display.len */ + + xdr = encbase + xdr_round_up(encsize); + + if ((unsigned long)xdr - (unsigned long)appdata != adatasize) + pr_err("Appdata size incorrect %lx != %zx\n", + (unsigned long)xdr - (unsigned long)appdata, adatasize); + + aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN, + GFP_KERNEL); + if (IS_ERR(aead)) { + ret = PTR_ERR(aead); + goto out_token; + } + + sg_init_one(&sg, encbase, encsize); + ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false); + if (ret < 0) + goto out_aead; + + server->cm_rxgk_appdata.len = adatasize; + server->cm_rxgk_appdata.data = appdata; + appdata = NULL; + +out_aead: + crypto_free_aead(aead); +out_token: + kfree(appdata); +out: + return ret; +} +#endif /* CONFIG_RXGK */ diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 440b0e731093b..1124ea4000cb1 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -176,8 +177,10 @@ struct afs_call { bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ bool responded; /* Got a response from the call (may be abort) */ + u8 security_ix; /* Security class */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ + u32 enctype; /* Security encoding type */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ union { /* place to extract temporary data */ @@ -281,6 +284,7 @@ struct afs_net { struct socket *socket; struct afs_call *spare_incoming_call; struct work_struct charge_preallocation_work; + struct work_struct rx_oob_work; struct mutex socket_mutex; atomic_t nr_outstanding_calls; atomic_t nr_superblocks; @@ -305,6 +309,7 @@ struct afs_net { struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ + struct key *fs_cm_token_key; /* Key for creating CM tokens */ struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; @@ -540,6 +545,8 @@ struct afs_server { struct list_head volumes; /* RCU list of afs_server_entry objects */ struct work_struct destroyer; /* Work item to try and destroy a server */ struct timer_list timer; /* Management timer */ + struct mutex cm_token_lock; /* Lock governing creation of appdata */ + struct krb5_buffer cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ @@ -1058,6 +1065,19 @@ extern void __net_exit afs_cell_purge(struct afs_net *); */ extern bool afs_cm_incoming_call(struct afs_call *); +/* + * cm_security.c + */ +void afs_process_oob_queue(struct work_struct *work); +#ifdef CONFIG_RXGK +int afs_create_token_key(struct afs_net *net, struct socket *socket); +#else +static inline int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + return 0; +} +#endif + /* * dir.c */ diff --git a/fs/afs/main.c b/fs/afs/main.c index c845c5daaeba0..02475d415d885 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -73,6 +73,7 @@ static int __net_init afs_net_init(struct net *net_ns) generate_random_uuid((unsigned char *)&net->uuid); INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); + INIT_WORK(&net->rx_oob_work, afs_process_oob_queue); mutex_init(&net->socket_mutex); net->cells = RB_ROOT; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index b8180bf2281f7..8f2b3a1776908 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "internal.h" #include "afs_fs.h" #include "protocol_uae.h" @@ -103,6 +104,32 @@ int afs_abort_to_error(u32 abort_code) case RXKADDATALEN: return -EKEYREJECTED; case RXKADILLEGALLEVEL: return -EKEYREJECTED; + case RXGK_INCONSISTENCY: return -EPROTO; + case RXGK_PACKETSHORT: return -EPROTO; + case RXGK_BADCHALLENGE: return -EPROTO; + case RXGK_SEALEDINCON: return -EKEYREJECTED; + case RXGK_NOTAUTH: return -EKEYREJECTED; + case RXGK_EXPIRED: return -EKEYEXPIRED; + case RXGK_BADLEVEL: return -EKEYREJECTED; + case RXGK_BADKEYNO: return -EKEYREJECTED; + case RXGK_NOTRXGK: return -EKEYREJECTED; + case RXGK_UNSUPPORTED: return -EKEYREJECTED; + case RXGK_GSSERROR: return -EKEYREJECTED; +#ifdef RXGK_BADETYPE + case RXGK_BADETYPE: return -ENOPKG; +#endif +#ifdef RXGK_BADTOKEN + case RXGK_BADTOKEN: return -EKEYREJECTED; +#endif +#ifdef RXGK_BADETYPE + case RXGK_DATALEN: return -EPROTO; +#endif +#ifdef RXGK_BADQOP + case RXGK_BADQOP: return -EKEYREJECTED; +#endif + + case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; + case RXGEN_OPCODE: return -ENOTSUPP; default: return -EREMOTEIO; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d5e480a33859b..c1cadf8fb346a 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -24,8 +24,17 @@ static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob); static int afs_deliver_cm_op_id(struct afs_call *); +static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { + .notify_new_call = afs_rx_new_call, + .discard_new_call = afs_rx_discard_new_call, + .user_attach_call = afs_rx_attach, + .notify_oob = afs_rx_notify_oob, +}; + /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", @@ -49,6 +58,7 @@ int afs_open_socket(struct afs_net *net) goto error_1; socket->sk->sk_allocation = GFP_NOFS; + socket->sk->sk_user_data = net; /* bind the callback manager's address to make this a server socket */ memset(&srx, 0, sizeof(srx)); @@ -64,6 +74,14 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) goto error_2; + ret = rxrpc_sock_set_manage_response(socket->sk, true); + if (ret < 0) + goto error_2; + + ret = afs_create_token_key(net, socket); + if (ret < 0) + pr_err("Couldn't create RxGK CM key: %d\n", ret); + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; @@ -84,8 +102,7 @@ int afs_open_socket(struct afs_net *net) * it sends back to us. */ - rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, - afs_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -125,7 +142,9 @@ void afs_close_socket(struct afs_net *net) kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); + net->socket->sk->sk_user_data = NULL; sock_release(net->socket); + key_put(net->fs_cm_token_key); _debug("dework"); _leave(""); @@ -738,7 +757,6 @@ void afs_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, - afs_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -800,10 +818,14 @@ static int afs_deliver_cm_op_id(struct afs_call *call) if (!afs_cm_incoming_call(call)) return -ENOTSUPP; + call->security_ix = rxrpc_kernel_query_call_security(call->rxcall, + &call->service_id, + &call->enctype); + trace_afs_cb_call(call); call->work.func = call->type->work; - /* pass responsibility for the remainer of this message off to the + /* pass responsibility for the remainder of this message off to the * cache manager op */ return call->type->deliver(call); } @@ -952,3 +974,13 @@ noinline int afs_protocol_error(struct afs_call *call, call->unmarshalling_error = true; return -EBADMSG; } + +/* + * Wake up OOB notification processing. + */ +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob) +{ + struct afs_net *net = sk->sk_user_data; + + schedule_work(&net->rx_oob_work); +} diff --git a/fs/afs/server.c b/fs/afs/server.c index 8755f2703815e..a97562f831eb5 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -131,6 +131,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t * timer_setup(&server->timer, afs_server_timer, 0); INIT_LIST_HEAD(&server->volumes); init_waitqueue_head(&server->probe_wq); + mutex_init(&server->cm_token_lock); INIT_LIST_HEAD(&server->probe_link); INIT_HLIST_NODE(&server->proc_link); spin_lock_init(&server->probe_lock); @@ -396,6 +397,7 @@ static void afs_server_rcu(struct rcu_head *rcu) afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), afs_estate_trace_put_server); afs_put_cell(server->cell, afs_cell_trace_put_server); + kfree(server->cm_rxgk_appdata.data); kfree(server); } diff --git a/include/crypto/krb5.h b/include/crypto/krb5.h index 62d998e62f47d..71dd38f59be1d 100644 --- a/include/crypto/krb5.h +++ b/include/crypto/krb5.h @@ -63,6 +63,11 @@ struct scatterlist; #define KEY_USAGE_SEED_ENCRYPTION (0xAA) #define KEY_USAGE_SEED_INTEGRITY (0x55) +/* + * Standard Kerberos error codes. + */ +#define KRB5_PROG_KEYTYPE_NOSUPP -1765328233 + /* * Mode of operation. */ diff --git a/include/keys/rxrpc-type.h b/include/keys/rxrpc-type.h index 333c0f49a9cd2..0ddbe197a261b 100644 --- a/include/keys/rxrpc-type.h +++ b/include/keys/rxrpc-type.h @@ -9,6 +9,7 @@ #define _KEYS_RXRPC_TYPE_H #include +#include /* * key type for AF_RXRPC keys @@ -31,6 +32,21 @@ struct rxkad_key { u8 ticket[]; /* the encrypted ticket */ }; +/* + * RxRPC key for YFS-RxGK (type-6 security) + */ +struct rxgk_key { + s64 begintime; /* Time at which the ticket starts */ + s64 endtime; /* Time at which the ticket ends */ + u64 lifetime; /* Maximum lifespan of a connection (seconds) */ + u64 bytelife; /* Maximum number of bytes on a connection */ + unsigned int enctype; /* Encoding type */ + s8 level; /* Negotiated security RXRPC_SECURITY_PLAIN/AUTH/ENCRYPT */ + struct krb5_buffer key; /* Master key, K0 */ + struct krb5_buffer ticket; /* Ticket to be passed to server */ + u8 _key[]; /* Key storage */ +}; + /* * list of tokens attached to an rxrpc key */ @@ -40,6 +56,7 @@ struct rxrpc_key_token { struct rxrpc_key_token *next; /* the next token in the list */ union { struct rxkad_key *kad; + struct rxgk_key *rxgk; }; }; diff --git a/include/linux/btf.h b/include/linux/btf.h index ebc0c0c9b9446..b2983706292f7 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -522,6 +522,7 @@ bool btf_param_match_suffix(const struct btf *btf, const char *suffix); int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, u32 arg_no); +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off); struct bpf_verifier_log; diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 325af611909f9..0b61b8b996d4e 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -2,79 +2,8 @@ #ifndef _LINUX_DCCP_H #define _LINUX_DCCP_H - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include #include -enum dccp_state { - DCCP_OPEN = TCP_ESTABLISHED, - DCCP_REQUESTING = TCP_SYN_SENT, - DCCP_LISTEN = TCP_LISTEN, - DCCP_RESPOND = TCP_SYN_RECV, - /* - * States involved in closing a DCCP connection: - * 1) ACTIVE_CLOSEREQ is entered by a server sending a CloseReq. - * - * 2) CLOSING can have three different meanings (RFC 4340, 8.3): - * a. Client has performed active-close, has sent a Close to the server - * from state OPEN or PARTOPEN, and is waiting for the final Reset - * (in this case, SOCK_DONE == 1). - * b. Client is asked to perform passive-close, by receiving a CloseReq - * in (PART)OPEN state. It sends a Close and waits for final Reset - * (in this case, SOCK_DONE == 0). - * c. Server performs an active-close as in (a), keeps TIMEWAIT state. - * - * 3) The following intermediate states are employed to give passively - * closing nodes a chance to process their unread data: - * - PASSIVE_CLOSE (from OPEN => CLOSED) and - * - PASSIVE_CLOSEREQ (from (PART)OPEN to CLOSING; case (b) above). - */ - DCCP_ACTIVE_CLOSEREQ = TCP_FIN_WAIT1, - DCCP_PASSIVE_CLOSE = TCP_CLOSE_WAIT, /* any node receiving a Close */ - DCCP_CLOSING = TCP_CLOSING, - DCCP_TIME_WAIT = TCP_TIME_WAIT, - DCCP_CLOSED = TCP_CLOSE, - DCCP_NEW_SYN_RECV = TCP_NEW_SYN_RECV, - DCCP_PARTOPEN = TCP_MAX_STATES, - DCCP_PASSIVE_CLOSEREQ, /* clients receiving CloseReq */ - DCCP_MAX_STATES -}; - -enum { - DCCPF_OPEN = TCPF_ESTABLISHED, - DCCPF_REQUESTING = TCPF_SYN_SENT, - DCCPF_LISTEN = TCPF_LISTEN, - DCCPF_RESPOND = TCPF_SYN_RECV, - DCCPF_ACTIVE_CLOSEREQ = TCPF_FIN_WAIT1, - DCCPF_CLOSING = TCPF_CLOSING, - DCCPF_TIME_WAIT = TCPF_TIME_WAIT, - DCCPF_CLOSED = TCPF_CLOSE, - DCCPF_NEW_SYN_RECV = TCPF_NEW_SYN_RECV, - DCCPF_PARTOPEN = (1 << DCCP_PARTOPEN), -}; - -static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) -{ - return (struct dccp_hdr *)skb_transport_header(skb); -} - -static inline struct dccp_hdr *dccp_zeroed_hdr(struct sk_buff *skb, int headlen) -{ - skb_push(skb, headlen); - skb_reset_transport_header(skb); - return memset(skb_transport_header(skb), 0, headlen); -} - static inline struct dccp_hdr_ext *dccp_hdrx(const struct dccp_hdr *dh) { return (struct dccp_hdr_ext *)((unsigned char *)dh + sizeof(*dh)); @@ -85,12 +14,6 @@ static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh) return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); } -static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - return __dccp_basic_hdr_len(dh); -} - static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) { __u64 seq_nr = ntohs(dh->dccph_seq); @@ -103,222 +26,10 @@ static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) return seq_nr; } -static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) -{ - return (struct dccp_hdr_request *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) -{ - return (struct dccp_hdr_ack_bits *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) -{ - const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); - return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low); -} - -static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) -{ - return (struct dccp_hdr_response *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) -{ - return (struct dccp_hdr_reset *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh) { return __dccp_basic_hdr_len(dh) + dccp_packet_hdr_len(dh->dccph_type); } -static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) -{ - return __dccp_hdr_len(dccp_hdr(skb)); -} - -/** - * struct dccp_request_sock - represent DCCP-specific connection request - * @dreq_inet_rsk: structure inherited from - * @dreq_iss: initial sequence number, sent on the first Response (RFC 4340, 7.1) - * @dreq_gss: greatest sequence number sent (for retransmitted Responses) - * @dreq_isr: initial sequence number received in the first Request - * @dreq_gsr: greatest sequence number received (for retransmitted Request(s)) - * @dreq_service: service code present on the Request (there is just one) - * @dreq_featneg: feature negotiation options for this connection - * The following two fields are analogous to the ones in dccp_sock: - * @dreq_timestamp_echo: last received timestamp to echo (13.1) - * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo - */ -struct dccp_request_sock { - struct inet_request_sock dreq_inet_rsk; - __u64 dreq_iss; - __u64 dreq_gss; - __u64 dreq_isr; - __u64 dreq_gsr; - __be32 dreq_service; - spinlock_t dreq_lock; - struct list_head dreq_featneg; - __u32 dreq_timestamp_echo; - __u32 dreq_timestamp_time; -}; - -static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) -{ - return (struct dccp_request_sock *)req; -} - -extern struct inet_timewait_death_row dccp_death_row; - -extern int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - struct sk_buff *skb); - -struct dccp_options_received { - u64 dccpor_ndp:48; - u32 dccpor_timestamp; - u32 dccpor_timestamp_echo; - u32 dccpor_elapsed_time; -}; - -struct ccid; - -enum dccp_role { - DCCP_ROLE_UNDEFINED, - DCCP_ROLE_LISTEN, - DCCP_ROLE_CLIENT, - DCCP_ROLE_SERVER, -}; - -struct dccp_service_list { - __u32 dccpsl_nr; - __be32 dccpsl_list[]; -}; - -#define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1) -#define DCCP_SERVICE_CODE_IS_ABSENT 0 - -static inline bool dccp_list_has_service(const struct dccp_service_list *sl, - const __be32 service) -{ - if (likely(sl != NULL)) { - u32 i = sl->dccpsl_nr; - while (i--) - if (sl->dccpsl_list[i] == service) - return true; - } - return false; -} - -struct dccp_ackvec; - -/** - * struct dccp_sock - DCCP socket state - * - * @dccps_swl - sequence number window low - * @dccps_swh - sequence number window high - * @dccps_awl - acknowledgement number window low - * @dccps_awh - acknowledgement number window high - * @dccps_iss - initial sequence number sent - * @dccps_isr - initial sequence number received - * @dccps_osr - first OPEN sequence number received - * @dccps_gss - greatest sequence number sent - * @dccps_gsr - greatest valid sequence number received - * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss - * @dccps_service - first (passive sock) or unique (active sock) service code - * @dccps_service_list - second .. last service code on passive socket - * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option - * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo - * @dccps_l_ack_ratio - feature-local Ack Ratio - * @dccps_r_ack_ratio - feature-remote Ack Ratio - * @dccps_l_seq_win - local Sequence Window (influences ack number validity) - * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) - * @dccps_pcslen - sender partial checksum coverage (via sockopt) - * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) - * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) - * @dccps_ndp_count - number of Non Data Packets since last data packet - * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) - * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) - * @dccps_hc_rx_ackvec - rx half connection ack vector - * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) - * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) - * @dccps_options_received - parsed set of retrieved options - * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy - * @dccps_tx_qlen - maximum length of the TX queue - * @dccps_role - role of this sock, one of %dccp_role - * @dccps_hc_rx_insert_options - receiver wants to add options when acking - * @dccps_hc_tx_insert_options - sender wants to add options when sending - * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) - * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" - * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets - * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) - * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) - */ -struct dccp_sock { - /* inet_connection_sock has to be the first member of dccp_sock */ - struct inet_connection_sock dccps_inet_connection; -#define dccps_syn_rtt dccps_inet_connection.icsk_ack.lrcvtime - __u64 dccps_swl; - __u64 dccps_swh; - __u64 dccps_awl; - __u64 dccps_awh; - __u64 dccps_iss; - __u64 dccps_isr; - __u64 dccps_osr; - __u64 dccps_gss; - __u64 dccps_gsr; - __u64 dccps_gar; - __be32 dccps_service; - __u32 dccps_mss_cache; - struct dccp_service_list *dccps_service_list; - __u32 dccps_timestamp_echo; - __u32 dccps_timestamp_time; - __u16 dccps_l_ack_ratio; - __u16 dccps_r_ack_ratio; - __u64 dccps_l_seq_win:48; - __u64 dccps_r_seq_win:48; - __u8 dccps_pcslen:4; - __u8 dccps_pcrlen:4; - __u8 dccps_send_ndp_count:1; - __u64 dccps_ndp_count:48; - unsigned long dccps_rate_last; - struct list_head dccps_featneg; - struct dccp_ackvec *dccps_hc_rx_ackvec; - struct ccid *dccps_hc_rx_ccid; - struct ccid *dccps_hc_tx_ccid; - struct dccp_options_received dccps_options_received; - __u8 dccps_qpolicy; - __u32 dccps_tx_qlen; - enum dccp_role dccps_role:2; - __u8 dccps_hc_rx_insert_options:1; - __u8 dccps_hc_tx_insert_options:1; - __u8 dccps_server_timewait:1; - __u8 dccps_sync_scheduled:1; - struct tasklet_struct dccps_xmitlet; - struct timer_list dccps_xmit_timer; -}; - -#define dccp_sk(ptr) container_of_const(ptr, struct dccp_sock, \ - dccps_inet_connection.icsk_inet.sk) - -static inline const char *dccp_role(const struct sock *sk) -{ - switch (dccp_sk(sk)->dccps_role) { - case DCCP_ROLE_UNDEFINED: return "undefined"; - case DCCP_ROLE_LISTEN: return "listen"; - case DCCP_ROLE_SERVER: return "server"; - case DCCP_ROLE_CLIENT: return "client"; - } - return NULL; -} - -extern void dccp_syn_ack_timeout(const struct request_sock *req); - #endif /* _LINUX_DCCP_H */ diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 8210ece94fa6b..7edb5f5e71345 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -926,10 +926,11 @@ struct kernel_ethtool_ts_info { * @get_ts_info: Get the time stamping and PTP hardware clock capabilities. * It may be called with RCU, or rtnl or reference on the device. * Drivers supporting transmit time stamps in software should set this to - * ethtool_op_get_ts_info(). Drivers must not zero statistics which they - * don't report. The stats structure is initialized to ETHTOOL_STAT_NOT_SET - * indicating driver does not report statistics. - * @get_ts_stats: Query the device hardware timestamping statistics. + * ethtool_op_get_ts_info(). + * @get_ts_stats: Query the device hardware timestamping statistics. Drivers + * must not zero statistics which they don't report. The stats structure + * is initialized to ETHTOOL_STAT_NOT_SET indicating driver does not + * report statistics. * @get_module_info: Get the size and type of the eeprom contained within * a plug-in module. * @get_module_eeprom: Get the eeprom information from the plug-in module @@ -1329,6 +1330,17 @@ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); */ extern void ethtool_puts(u8 **data, const char *str); +/** + * ethtool_cpy - Write possibly-not-NUL-terminated string to ethtool string data + * @data: Pointer to a pointer to the start of string to write into + * @str: NUL-byte padded char array of size ETH_GSTRING_LEN to copy from + */ +#define ethtool_cpy(data, str) do { \ + BUILD_BUG_ON(sizeof(str) != ETH_GSTRING_LEN); \ + memcpy(*(data), str, ETH_GSTRING_LEN); \ + *(data) += ETH_GSTRING_LEN; \ +} while (0) + /* Link mode to forced speed capabilities maps */ struct ethtool_forced_speed_map { u32 speed; diff --git a/include/linux/mm.h b/include/linux/mm.h index b7f13f087954b..130d3c9d2ee45 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4248,4 +4248,62 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define VM_SEALED_SYSMAP VM_NONE #endif +/* + * DMA mapping IDs for page_pool + * + * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and + * stashes it in the upper bits of page->pp_magic. We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot + * mistake a valid kernel pointer with any of the values we write into this + * field. + * + * On architectures that set POISON_POINTER_DELTA, this is already ensured, + * since this value becomes part of PP_SIGNATURE; meaning we can just use the + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is + * 0, we make sure that we leave the two topmost bits empty, as that guarantees + * we won't mistake a valid kernel pointer for a value we set, regardless of the + * VMSPLIT setting. + * + * Altogether, this means that the number of bits available is constrained by + * the size of an unsigned long (at the upper end, subtracting two bits per the + * above), and the definition of PP_SIGNATURE (with or without + * POISON_POINTER_DELTA). + */ +#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) +#if POISON_POINTER_DELTA > 0 +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA + * index to not overlap with that if set + */ +#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) +#else +/* Always leave out the topmost two; see above. */ +#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +#endif + +#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ + PP_DMA_INDEX_SHIFT) + +/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. + */ +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) + +#ifdef CONFIG_PAGE_POOL +static inline bool page_pool_page_is_pp(struct page *page) +{ + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; +} +#else +static inline bool page_pool_page_is_pp(struct page *page) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2d11d013cabed..0321fd952f708 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -688,6 +688,7 @@ struct netdev_queue { /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS + /* "ops protected", see comment about net_device::lock */ struct xsk_buff_pool *pool; #endif @@ -1945,13 +1946,11 @@ enum netdev_reg_state { * * @reg_state: Register/unregister state machine * @dismantle: Device is going to be freed - * @rtnl_link_state: This enum represents the phases of creating - * a new link - * * @needs_free_netdev: Should unregister perform free_netdev? * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one * @nd_net: Network namespace this network device is inside + * protected by @lock * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type @@ -2359,10 +2358,10 @@ struct net_device { bool dismantle; - enum { - RTNL_LINK_INITIALIZED, - RTNL_LINK_INITIALIZING, - } rtnl_link_state:16; + /** @moving_ns: device is changing netns, protected by @lock */ + bool moving_ns; + /** @rtnl_link_initializing: Device being created, suppress events */ + bool rtnl_link_initializing; bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); @@ -2521,7 +2520,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up + * @up, @moving_ns, @nd_net, @xdp_features * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues @@ -4689,9 +4688,10 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) /* * txq->trans_start can be read locklessly from dev_watchdog() */ -static inline void txq_trans_update(struct netdev_queue *txq) +static inline void txq_trans_update(const struct net_device *dev, + struct netdev_queue *txq) { - if (txq->xmit_lock_owner != -1) + if (!dev->lltx) WRITE_ONCE(txq->trans_start, jiffies); } @@ -5211,7 +5211,7 @@ static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_devi rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) - txq_trans_update(txq); + txq_trans_update(dev, txq); return rc; } diff --git a/include/linux/phy.h b/include/linux/phy.h index a2bfae80c4497..066a28a4b64b2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1757,7 +1757,6 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); -struct phy_device *device_phy_find_device(struct device *dev); struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); @@ -1779,11 +1778,6 @@ struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) return NULL; } -static inline struct phy_device *device_phy_find_device(struct device *dev) -{ - return NULL; -} - static inline struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) { @@ -2046,6 +2040,9 @@ int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev, enum ethtool_link_mode_bit_indices linkmode, u32 *val); +int phy_get_mac_termination(struct phy_device *phydev, struct device *dev, + u32 *val); + void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); diff --git a/include/linux/poison.h b/include/linux/poison.h index 331a9a996fa87..8ca2235f78d5d 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -70,6 +70,10 @@ #define KEY_DESTROY 0xbd /********** net/core/page_pool.c **********/ +/* + * page_pool uses additional free bits within this value to store data, see the + * definition of PP_DMA_INDEX_MASK in mm.h + */ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) /********** net/core/skbuff.c **********/ diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 0d68d09bedd14..eced7e9bf69a8 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -68,6 +68,22 @@ struct ptp_system_timestamp { * @n_per_out: The number of programmable periodic signals. * @n_pins: The number of programmable pins. * @pps: Indicates whether the clock supports a PPS callback. + * + * @supported_perout_flags: The set of flags the driver supports for the + * PTP_PEROUT_REQUEST ioctl. The PTP core will + * reject a request with any flag not specified + * here. + * + * @supported_extts_flags: The set of flags the driver supports for the + * PTP_EXTTS_REQUEST ioctl. The PTP core will use + * this list to reject unsupported requests. + * PTP_ENABLE_FEATURE is assumed and does not need to + * be included. If PTP_STRICT_FLAGS is *not* set, + * then both PTP_RISING_EDGE and PTP_FALLING_EDGE + * will be assumed. Note that PTP_STRICT_FLAGS must + * be set if the drivers wants to honor + * PTP_EXTTS_REQUEST2 and any future flags. + * * @pin_config: Array of length 'n_pins'. If the number of * programmable pins is nonzero, then drivers must * allocate and initialize this array. @@ -174,6 +190,8 @@ struct ptp_clock_info { int n_per_out; int n_pins; int pps; + unsigned int supported_perout_flags; + unsigned int supported_extts_flags; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b974a277975a8..beb084ee4f4d2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4105,8 +4105,7 @@ static inline void skb_frag_list_init(struct sk_buff *skb) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); @@ -4146,6 +4145,8 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags); int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); diff --git a/include/linux/socket.h b/include/linux/socket.h index c3322eb3d6865..3b262487ec060 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -168,7 +168,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } -static inline size_t msg_data_left(struct msghdr *msg) +static inline size_t msg_data_left(const struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index c4ec8bb8144e7..8aed09d65b4a1 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -276,7 +276,6 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; int has_xgmac; u8 vlan_fail_q; - unsigned long eee_usecs_rate; struct pci_dev *pdev; int int_snapshot_num; int msi_mac_vec; diff --git a/include/linux/tfrc.h b/include/linux/tfrc.h deleted file mode 100644 index a5acc768085da..0000000000000 --- a/include/linux/tfrc.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _LINUX_TFRC_H_ -#define _LINUX_TFRC_H_ -/* - * TFRC - Data Structures for the TCP-Friendly Rate Control congestion - * control mechanism as specified in RFC 3448. - * - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ -#include - -/** tfrc_rx_info - TFRC Receiver Data Structure - * - * @tfrcrx_x_recv: receiver estimate of sending rate (3.2.2) - * @tfrcrx_rtt: round-trip-time (communicated by sender) - * @tfrcrx_p: current estimate of loss event rate (3.2.2) - */ -struct tfrc_rx_info { - __u32 tfrcrx_x_recv; - __u32 tfrcrx_rtt; - __u32 tfrcrx_p; -}; - -/** tfrc_tx_info - TFRC Sender Data Structure - * - * @tfrctx_x: computed transmit rate (4.3 (4)) - * @tfrctx_x_recv: receiver estimate of send rate (4.3) - * @tfrctx_x_calc: return value of throughput equation (3.1) - * @tfrctx_rtt: (moving average) estimate of RTT (4.3) - * @tfrctx_p: current loss event rate (5.4) - * @tfrctx_rto: estimate of RTO, equals 4*RTT (4.3) - * @tfrctx_ipi: inter-packet interval (4.6) - * - * Note: X and X_recv are both maintained in units of 64 * bytes/second. This - * enables a finer resolution of sending rates and avoids problems with - * integer arithmetic; u32 is not sufficient as scaling consumes 6 bits. - */ -struct tfrc_tx_info { - __u64 tfrctx_x; - __u64 tfrctx_x_recv; - __u32 tfrctx_x_calc; - __u32 tfrctx_rtt; - __u32 tfrctx_p; - __u32 tfrctx_rto; - __u32 tfrctx_ipi; -}; - -#endif /* _LINUX_TFRC_H_ */ diff --git a/include/linux/udp.h b/include/linux/udp.h index 0807e21cfec95..895240177f4f4 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -101,6 +101,13 @@ struct udp_sock { /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; + + /* + * Accounting for the tunnel GRO fastpath. + * Unprotected by compilers guard, as it uses space available in + * the last UDP socket cacheline. + */ + struct hlist_node tunnel_list; }; #define udp_test_bit(nr, sk) \ @@ -219,4 +226,13 @@ static inline void udp_allow_gso(struct sock *sk) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) +static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) +{ +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); +#else + return NULL; +#endif +} + #endif /* _LINUX_UDP_H */ diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index cf793d18e5df5..f15341594cc8f 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -16,6 +16,7 @@ struct sock; struct socket; struct rxrpc_call; struct rxrpc_peer; +struct krb5_buffer; enum rxrpc_abort_reason; enum rxrpc_interruptibility { @@ -24,23 +25,33 @@ enum rxrpc_interruptibility { RXRPC_UNINTERRUPTIBLE, /* Call should not be interruptible at all */ }; +enum rxrpc_oob_type { + RXRPC_OOB_CHALLENGE, /* Security challenge for a connection */ +}; + /* * Debug ID counter for tracing. */ extern atomic_t rxrpc_debug_id; +/* + * Operations table for rxrpc to call out to a kernel application (e.g. kAFS). + */ +struct rxrpc_kernel_ops { + void (*notify_new_call)(struct sock *sk, struct rxrpc_call *call, + unsigned long user_call_ID); + void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*notify_oob)(struct sock *sk, struct sk_buff *oob); +}; + typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, - unsigned long); -typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long); -void rxrpc_kernel_new_call_notification(struct socket *, - rxrpc_notify_new_call_t, - rxrpc_discard_new_call_t); +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, struct key *key, @@ -72,9 +83,9 @@ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data); unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer); unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); -int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, - rxrpc_user_attach_call_t, unsigned long, gfp_t, - unsigned int); +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); @@ -83,5 +94,25 @@ void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); int rxrpc_sock_set_security_keyring(struct sock *, struct key *); +int rxrpc_sock_set_manage_response(struct sock *sk, bool set); + +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata); +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type); +void rxrpc_kernel_free_oob(struct sk_buff *oob); +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index); +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why); +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge); +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge); +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata); +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype); #endif /* _NET_RXRPC_H */ diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index e4fdc6b54ceff..bea77934a235c 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -40,6 +40,7 @@ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ FN(TCP_RFC7323_PAWS_ACK) \ + FN(TCP_RFC7323_TW_PAWS) \ FN(TCP_RFC7323_TSECR) \ FN(TCP_LISTEN_OVERFLOW) \ FN(TCP_OLD_SEQUENCE) \ @@ -283,6 +284,12 @@ enum skb_drop_reason { * Corresponds to LINUX_MIB_PAWS_OLD_ACK. */ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, + /** + * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in + * TIME_WAIT state. + * Corresponds to LINUX_MIB_PAWS_TW_REJECTED. + */ + SKB_DROP_REASON_TCP_RFC7323_TW_PAWS, /** * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr. * Corresponds to LINUX_MIB_TSECRREJECTED. diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 949641e925398..4564b5d348b1a 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -175,14 +175,9 @@ struct inet_hashinfo { bool pernet; } ____cacheline_aligned_in_smp; -static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) +static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk) { -#if IS_ENABLED(CONFIG_IP_DCCP) - return sk->sk_prot->h.hashinfo ? : - sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#else return sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#endif } static inline struct inet_listen_hashbucket * diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a36a335cef9fb..0c3d571a04a16 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -377,10 +377,9 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); - -void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill); +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); diff --git a/include/net/mptcp.h b/include/net/mptcp.h index bfbad695951cf..f7263fe2a2e40 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -101,18 +101,9 @@ struct mptcp_out_options { #define MPTCP_SCHED_MAX 128 #define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) -#define MPTCP_SUBFLOWS_MAX 8 - -struct mptcp_sched_data { - u8 subflows; - struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; -}; - struct mptcp_sched_ops { - int (*get_send)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); - int (*get_retrans)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); + int (*get_send)(struct mptcp_sock *msk); + int (*get_retrans)(struct mptcp_sock *msk); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index bd57d8fb54f14..025a7574b275f 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -475,8 +475,8 @@ struct pernet_operations { void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); /* Following method is called with RTNL held. */ - void (*exit_batch_rtnl)(struct list_head *net_exit_list, - struct list_head *dev_kill_list); + void (*exit_rtnl)(struct net *net, + struct list_head *dev_kill_list); unsigned int * const id; const size_t size; }; diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index c316b551df8d6..2a753813f8496 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -48,6 +48,22 @@ static inline void netdev_unlock_ops(struct net_device *dev) netdev_unlock(dev); } +static inline void netdev_lock_ops_to_full(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_lock(dev); +} + +static inline void netdev_unlock_full_to_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_unlock(dev); +} + static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) @@ -64,6 +80,22 @@ netdev_ops_assert_locked_or_invisible(const struct net_device *dev) netdev_ops_assert_locked(dev); } +static inline void netdev_lock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); + else + rtnl_lock(); +} + +static inline void netdev_unlock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); + else + rtnl_unlock(); +} + static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 825141d675e58..ea709b59d8270 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -85,9 +85,11 @@ struct netdev_queue_stats_tx { * for some of the events is not maintained, and reliable "total" cannot * be provided). * + * Ops are called under the instance lock if netdev_need_ops_lock() + * returns true, otherwise under rtnl_lock. * Device drivers can assume that when collecting total device stats, * the @get_base_stats and subsequent per-queue calls are performed - * "atomically" (without releasing the rtnl_lock). + * "atomically" (without releasing the relevant lock). * * Device drivers are encouraged to reset the per-queue statistics when * number of queues change. This is because the primary use case for diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index b2238b551dced..8cdcd138b33f2 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -20,12 +20,12 @@ struct netdev_rx_queue { struct net_device *dev; netdevice_tracker dev_tracker; + /* All fields below are "ops protected", + * see comment about net_device::lock + */ #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif - /* NAPI instance for the queue - * "ops protected", see comment about net_device::lock - */ struct napi_struct *napi; struct pp_memory_provider_params mp_params; } ____cacheline_aligned_in_smp; diff --git a/include/net/netlink.h b/include/net/netlink.h index 29e0db9403820..82e07e272290a 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -611,6 +611,22 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) return nlh->nlmsg_len - NLMSG_HDRLEN; } +/** + * nlmsg_payload - message payload if the data fits in the len + * @nlh: netlink message header + * @len: struct length + * + * Returns: The netlink message payload/data if the length is sufficient, + * otherwise NULL. + */ +static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len) +{ + if (nlh->nlmsg_len < nlmsg_msg_size(len)) + return NULL; + + return nlmsg_data(nlh); +} + /** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 650b2dc9199f4..6373e3f17da84 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed { }; #endif +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. @@ -85,6 +90,11 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -277,4 +287,5 @@ struct netns_ipv4 { struct hlist_head *inet_addr_lst; struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/p8022.h b/include/net/p8022.h deleted file mode 100644 index a29e224ac498a..0000000000000 --- a/include/net/p8022.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_P8022_H -#define _NET_P8022_H - -struct net_device; -struct packet_type; -struct sk_buff; - -struct datalink_proto * -register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)); -void unregister_8022_client(struct datalink_proto *proto); -#endif diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 582a3d00cbe23..93f2c31baf9bf 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -395,6 +395,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); } +static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, + netmem_ref netmem) +{ + page_pool_put_full_netmem(pool, netmem, true); +} + #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) @@ -492,4 +498,9 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) page_pool_update_nid(pool, new_nid); } +static inline bool page_pool_is_unreadable(struct page_pool *pool) +{ + return !!pool->mp_ops; +} + #endif /* _NET_PAGE_POOL_HELPERS_H */ diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 36eb57d73abc6..431b593de7093 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA @@ -33,6 +34,9 @@ #define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) +/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */ +#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) + /* * Fast allocation side cache array/stack * @@ -221,6 +225,8 @@ struct page_pool { void *mp_priv; const struct memory_provider_ops *mp_ops; + struct xarray dma_mapped; + #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ struct page_pool_recycle_stats __percpu *recycle_stats; diff --git a/include/net/rps.h b/include/net/rps.h index e358e9711f275..507f4aa5d39b2 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -57,9 +57,10 @@ struct rps_dev_flow_table { * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { - u32 mask; + struct rcu_head rcu; + u32 mask; - u32 ents[] ____cacheline_aligned_in_smp; + u32 ents[] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) diff --git a/include/net/rstreason.h b/include/net/rstreason.h index 69cb2e52b7dae..979ac87b5d994 100644 --- a/include/net/rstreason.h +++ b/include/net/rstreason.h @@ -36,7 +36,7 @@ /** * enum sk_rst_reason - the reasons of socket reset * - * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols. + * The reasons of sk reset, which are used in TCP/MPTCP protocols. * * There are three parts in order: * 1) skb drop reasons: relying on drop reasons for such as passive reset diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 21e7fa2a18138..cddebafb9f779 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -16,9 +16,5 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport); u32 secure_tcpv6_ts_off(const struct net *net, const __be32 *saddr, const __be32 *daddr); -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport); #endif /* _NET_SECURE_SEQ */ diff --git a/include/net/sock.h b/include/net/sock.h index 694f954258d43..e223102337c77 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1781,7 +1781,6 @@ void sk_free(struct sock *sk); void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); -void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); @@ -2605,8 +2604,8 @@ struct sock_skb_cb { * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ -#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ - sizeof(struct sock_skb_cb))) +#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ + sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) diff --git a/include/net/tcp.h b/include/net/tcp.h index 4450c384ef178..5078ad868feef 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -427,7 +427,8 @@ enum tcp_tw_status { enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, - u32 *tw_isn); + u32 *tw_isn, + enum skb_drop_reason *drop_reason); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race, enum skb_drop_reason *drop_reason); diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100dc..a772510b2aa58 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -290,6 +290,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index a93dc51f6323e..2df3b8344eb52 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -191,6 +191,21 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) } #endif +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) +{ + udp_tunnel_update_gro_rcv(sk, false); + udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); +} + static inline void udp_tunnel_encap_enable(struct sock *sk) { if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 2dd23ee2bacdd..e2f7ca045d3e5 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -296,7 +296,7 @@ struct vxlan_dev { struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; - spinlock_t hash_lock[FDB_HASH_SIZE]; + spinlock_t hash_lock; unsigned int addrcnt; struct gro_cells gro_cells; @@ -304,9 +304,10 @@ struct vxlan_dev { struct vxlan_vni_group __rcu *vnigrp; - struct hlist_head fdb_head[FDB_HASH_SIZE]; + struct rhashtable fdb_hash_tbl; struct rhashtable mdb_tbl; + struct hlist_head fdb_list; struct hlist_head mdb_list; unsigned int mdb_seq; }; diff --git a/include/net/xdp.h b/include/net/xdp.h index 48efacbaa35da..b40f1f96cb117 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -616,8 +616,12 @@ struct xdp_metadata_ops { u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); +void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 8857f5ea77d48..7f83d242c8e9f 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -663,19 +663,26 @@ TRACE_EVENT(afs_cb_call, __field(unsigned int, call) __field(u32, op) __field(u16, service_id) + __field(u8, security_ix) + __field(u32, enctype) ), TP_fast_assign( __entry->call = call->debug_id; __entry->op = call->operation_ID; __entry->service_id = call->service_id; + __entry->security_ix = call->security_ix; + __entry->enctype = call->enctype; ), - TP_printk("c=%08x %s", + TP_printk("c=%08x %s sv=%u sx=%u en=%u", __entry->call, __entry->service_id == 2501 ? __print_symbolic(__entry->op, yfs_cm_operations) : - __print_symbolic(__entry->op, afs_cm_operations)) + __print_symbolic(__entry->op, afs_cm_operations), + __entry->service_id, + __entry->security_ix, + __entry->enctype) ); TRACE_EVENT(afs_call, diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cad50d91077ef..378d2dfc73923 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -25,6 +25,7 @@ EM(afs_abort_probeuuid_negative, "afs-probeuuid-neg") \ EM(afs_abort_send_data_error, "afs-send-data") \ EM(afs_abort_unmarshal_error, "afs-unmarshal") \ + EM(afs_abort_unsupported_sec_class, "afs-unsup-sec-class") \ /* rxperf errors */ \ EM(rxperf_abort_general_error, "rxperf-error") \ EM(rxperf_abort_oom, "rxperf-oom") \ @@ -68,6 +69,38 @@ EM(rxkad_abort_resp_tkt_sname, "rxkad-resp-tk-sname") \ EM(rxkad_abort_resp_unknown_tkt, "rxkad-resp-unknown-tkt") \ EM(rxkad_abort_resp_version, "rxkad-resp-version") \ + /* RxGK security errors */ \ + EM(rxgk_abort_1_verify_mic_eproto, "rxgk1-vfy-mic-eproto") \ + EM(rxgk_abort_2_decrypt_eproto, "rxgk2-dec-eproto") \ + EM(rxgk_abort_2_short_data, "rxgk2-short-data") \ + EM(rxgk_abort_2_short_encdata, "rxgk2-short-encdata") \ + EM(rxgk_abort_2_short_header, "rxgk2-short-hdr") \ + EM(rxgk_abort_bad_key_number, "rxgk-bad-key-num") \ + EM(rxgk_abort_chall_key_expired, "rxgk-chall-key-exp") \ + EM(rxgk_abort_chall_no_key, "rxgk-chall-nokey") \ + EM(rxgk_abort_chall_short, "rxgk-chall-short") \ + EM(rxgk_abort_resp_auth_dec, "rxgk-resp-auth-dec") \ + EM(rxgk_abort_resp_bad_callid, "rxgk-resp-bad-callid") \ + EM(rxgk_abort_resp_bad_nonce, "rxgk-resp-bad-nonce") \ + EM(rxgk_abort_resp_bad_param, "rxgk-resp-bad-param") \ + EM(rxgk_abort_resp_call_ctr, "rxgk-resp-call-ctr") \ + EM(rxgk_abort_resp_call_state, "rxgk-resp-call-state") \ + EM(rxgk_abort_resp_internal_error, "rxgk-resp-int-error") \ + EM(rxgk_abort_resp_nopkg, "rxgk-resp-nopkg") \ + EM(rxgk_abort_resp_short_applen, "rxgk-resp-short-applen") \ + EM(rxgk_abort_resp_short_auth, "rxgk-resp-short-auth") \ + EM(rxgk_abort_resp_short_call_list, "rxgk-resp-short-callls") \ + EM(rxgk_abort_resp_short_packet, "rxgk-resp-short-packet") \ + EM(rxgk_abort_resp_short_yfs_klen, "rxgk-resp-short-yfs-klen") \ + EM(rxgk_abort_resp_short_yfs_key, "rxgk-resp-short-yfs-key") \ + EM(rxgk_abort_resp_short_yfs_tkt, "rxgk-resp-short-yfs-tkt") \ + EM(rxgk_abort_resp_tok_dec, "rxgk-resp-tok-dec") \ + EM(rxgk_abort_resp_tok_internal_error, "rxgk-resp-tok-int-err") \ + EM(rxgk_abort_resp_tok_keyerr, "rxgk-resp-tok-keyerr") \ + EM(rxgk_abort_resp_tok_nokey, "rxgk-resp-tok-nokey") \ + EM(rxgk_abort_resp_tok_nopkg, "rxgk-resp-tok-nopkg") \ + EM(rxgk_abort_resp_tok_short, "rxgk-resp-tok-short") \ + EM(rxgk_abort_resp_xdr_align, "rxgk-resp-xdr-align") \ /* rxrpc errors */ \ EM(rxrpc_abort_call_improper_term, "call-improper-term") \ EM(rxrpc_abort_call_reset, "call-reset") \ @@ -77,6 +110,7 @@ EM(rxrpc_abort_call_timeout, "call-timeout") \ EM(rxrpc_abort_no_service_key, "no-serv-key") \ EM(rxrpc_abort_nomem, "nomem") \ + EM(rxrpc_abort_response_sendmsg, "resp-sendmsg") \ EM(rxrpc_abort_service_not_offered, "serv-not-offered") \ EM(rxrpc_abort_shut_down, "shut-down") \ EM(rxrpc_abort_unsupported_security, "unsup-sec") \ @@ -133,24 +167,33 @@ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ + EM(rxrpc_skb_get_post_oob, "GET post-oob ") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \ EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ + EM(rxrpc_skb_new_response_rxgk, "NEW resp-rxgk") \ + EM(rxrpc_skb_new_response_rxkad, "NEW resp-rxkd") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ EM(rxrpc_skb_put_call_rx, "PUT call-rx ") \ + EM(rxrpc_skb_put_challenge, "PUT challenge") \ EM(rxrpc_skb_put_conn_secured, "PUT conn-secd") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ + EM(rxrpc_skb_put_oob, "PUT oob ") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ + EM(rxrpc_skb_put_purge_oob, "PUT purge-oob") \ + EM(rxrpc_skb_put_response, "PUT response ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ + EM(rxrpc_skb_see_oob_challenge, "SEE oob-chall") \ EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \ + EM(rxrpc_skb_see_recvmsg_oob, "SEE recvm-oob") \ EM(rxrpc_skb_see_reject, "SEE reject ") \ EM(rxrpc_skb_see_rotate, "SEE rotate ") \ E_(rxrpc_skb_see_version, "SEE version ") @@ -216,9 +259,11 @@ EM(rxrpc_conn_free, "FREE ") \ EM(rxrpc_conn_get_activate_call, "GET act-call") \ EM(rxrpc_conn_get_call_input, "GET inp-call") \ + EM(rxrpc_conn_get_challenge_input, "GET inp-chal") \ EM(rxrpc_conn_get_conn_input, "GET inp-conn") \ EM(rxrpc_conn_get_idle, "GET idle ") \ EM(rxrpc_conn_get_poke_abort, "GET pk-abort") \ + EM(rxrpc_conn_get_poke_response, "GET response") \ EM(rxrpc_conn_get_poke_secured, "GET secured ") \ EM(rxrpc_conn_get_poke_timer, "GET poke ") \ EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ @@ -226,10 +271,12 @@ EM(rxrpc_conn_new_service, "NEW service ") \ EM(rxrpc_conn_put_call, "PUT call ") \ EM(rxrpc_conn_put_call_input, "PUT inp-call") \ + EM(rxrpc_conn_put_challenge_input, "PUT inp-chal") \ EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \ EM(rxrpc_conn_put_discard_idle, "PUT disc-idl") \ EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \ EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \ + EM(rxrpc_conn_put_oob, "PUT oob ") \ EM(rxrpc_conn_put_poke, "PUT poke ") \ EM(rxrpc_conn_put_service_reaped, "PUT svc-reap") \ EM(rxrpc_conn_put_unbundle, "PUT unbundle") \ @@ -331,6 +378,7 @@ EM(rxrpc_recvmsg_full, "FULL") \ EM(rxrpc_recvmsg_hole, "HOLE") \ EM(rxrpc_recvmsg_next, "NEXT") \ + EM(rxrpc_recvmsg_oobq, "OOBQ") \ EM(rxrpc_recvmsg_requeue, "REQU") \ EM(rxrpc_recvmsg_return, "RETN") \ EM(rxrpc_recvmsg_terminal, "TERM") \ @@ -455,8 +503,9 @@ EM(rxrpc_tx_point_call_final_resend, "CallFinalResend") \ EM(rxrpc_tx_point_conn_abort, "ConnAbort") \ EM(rxrpc_tx_point_reject, "Reject") \ + EM(rxrpc_tx_point_rxgk_challenge, "RxGKChall") \ EM(rxrpc_tx_point_rxkad_challenge, "RxkadChall") \ - EM(rxrpc_tx_point_rxkad_response, "RxkadResp") \ + EM(rxrpc_tx_point_response, "Response") \ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ E_(rxrpc_tx_point_version_reply, "VerReply") @@ -473,6 +522,7 @@ #define rxrpc_txbuf_traces \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ + EM(rxrpc_txbuf_alloc_response, "ALLOC RESP ") \ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ @@ -480,6 +530,7 @@ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ + EM(rxrpc_txbuf_put_response_tx, "PUT RESP TX") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ EM(rxrpc_txbuf_see_lost, "SEE LOST ") \ @@ -1150,6 +1201,39 @@ TRACE_EVENT(rxrpc_rx_conn_abort, __entry->abort_code) ); +TRACE_EVENT(rxrpc_tx_challenge, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + u32 version, u32 nonce), + + TP_ARGS(conn, serial, version, nonce), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(u32, version) + __field(u32, nonce) + __field(u16, service_id) + __field(u8, security_ix) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->version = version; + __entry->nonce = nonce; + __entry->service_id = conn->service_id; + __entry->security_ix = conn->security_ix; + ), + + TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x", + __entry->conn, + __entry->serial, + __entry->service_id, + __entry->security_ix, + __entry->version, + __entry->nonce) + ); + TRACE_EVENT(rxrpc_rx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce, u32 min_level), @@ -1162,6 +1246,8 @@ TRACE_EVENT(rxrpc_rx_challenge, __field(u32, version) __field(u32, nonce) __field(u32, min_level) + __field(u16, service_id) + __field(u8, security_ix) ), TP_fast_assign( @@ -1170,16 +1256,60 @@ TRACE_EVENT(rxrpc_rx_challenge, __entry->version = version; __entry->nonce = nonce; __entry->min_level = min_level; + __entry->service_id = conn->service_id; + __entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x CHALLENGE %08x v=%x n=%x ml=%x", + TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x ml=%x", __entry->conn, __entry->serial, + __entry->service_id, + __entry->security_ix, __entry->version, __entry->nonce, __entry->min_level) ); +TRACE_EVENT(rxrpc_tx_response, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + struct rxrpc_skb_priv *rsp), + + TP_ARGS(conn, serial, rsp), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(rxrpc_serial_t, challenge) + __field(u32, version) + __field(u32, kvno) + __field(u16, ticket_len) + __field(u16, appdata_len) + __field(u16, service_id) + __field(u8, security_ix) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->challenge = rsp->resp.challenge_serial; + __entry->version = rsp->resp.version; + __entry->kvno = rsp->resp.kvno; + __entry->ticket_len = rsp->resp.ticket_len; + __entry->service_id = conn->service_id; + __entry->security_ix = conn->security_ix; + ), + + TP_printk("C=%08x RESPONSE r=%08x cr=%08x sv=%u+%u v=%x kv=%x tl=%u", + __entry->conn, + __entry->serial, + __entry->challenge, + __entry->service_id, + __entry->security_ix, + __entry->version, + __entry->kvno, + __entry->ticket_len) + ); + TRACE_EVENT(rxrpc_rx_response, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 kvno, u32 ticket_len), @@ -1192,6 +1322,7 @@ TRACE_EVENT(rxrpc_rx_response, __field(u32, version) __field(u32, kvno) __field(u32, ticket_len) + __field(u8, security_ix) ), TP_fast_assign( @@ -1200,11 +1331,13 @@ TRACE_EVENT(rxrpc_rx_response, __entry->version = version; __entry->kvno = kvno; __entry->ticket_len = ticket_len; + __entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x RESPONSE %08x v=%x kvno=%x tl=%x", + TP_printk("C=%08x RESPONSE r=%08x sx=%u v=%x kvno=%x tl=%x", __entry->conn, __entry->serial, + __entry->security_ix, __entry->version, __entry->kvno, __entry->ticket_len) @@ -2668,6 +2801,30 @@ TRACE_EVENT(rxrpc_rack_timer, ktime_to_us(__entry->delay)) ); +TRACE_EVENT(rxrpc_rxgk_rekey, + TP_PROTO(struct rxrpc_connection *conn, + unsigned int current_key, unsigned int requested_key), + + TP_ARGS(conn, current_key, requested_key), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(unsigned int, current_key) + __field(unsigned int, requested_key) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->current_key = current_key; + __entry->requested_key = requested_key; + ), + + TP_printk("C=%08x cur=%x req=%x", + __entry->conn, + __entry->current_key, + __entry->requested_key) + ); + #undef EM #undef E_ diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 3836de435d9d2..b5310439536e0 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -19,7 +19,6 @@ /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ - EM(IPPROTO_DCCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5d331383047b7..de214f1dea586 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -21,7 +21,6 @@ TRACE_DEFINE_ENUM(SOCK_DGRAM); TRACE_DEFINE_ENUM(SOCK_RAW); TRACE_DEFINE_ENUM(SOCK_RDM); TRACE_DEFINE_ENUM(SOCK_SEQPACKET); -TRACE_DEFINE_ENUM(SOCK_DCCP); TRACE_DEFINE_ENUM(SOCK_PACKET); #define show_socket_type(type) \ @@ -31,7 +30,6 @@ TRACE_DEFINE_ENUM(SOCK_PACKET); { SOCK_RAW, "RAW" }, \ { SOCK_RDM, "RDM" }, \ { SOCK_SEQPACKET, "SEQPACKET" }, \ - { SOCK_DCCP, "DCCP" }, \ { SOCK_PACKET, "PACKET" }) /* This list is known to be incomplete, add new enums as needed. */ diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1a40c41ff8c30..53e878fa14d14 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -259,6 +259,30 @@ TRACE_EVENT(tcp_retransmit_synack, __entry->saddr_v6, __entry->daddr_v6) ); +TRACE_EVENT(tcp_sendmsg_locked, + TP_PROTO(const struct sock *sk, const struct msghdr *msg, + const struct sk_buff *skb, int size_goal), + + TP_ARGS(sk, msg, skb, size_goal), + + TP_STRUCT__entry( + __field(const void *, skb_addr) + __field(int, skb_len) + __field(int, msg_left) + __field(int, size_goal) + ), + + TP_fast_assign( + __entry->skb_addr = skb; + __entry->skb_len = skb ? skb->len : 0; + __entry->msg_left = msg_data_left(msg); + __entry->size_goal = size_goal; + ), + + TP_printk("skb_addr %p skb_len %d msg_left %d size_goal %d", + __entry->skb_addr, __entry->skb_len, __entry->msg_left, + __entry->size_goal)); + DECLARE_TRACE(tcp_cwnd_reduction_tp, TP_PROTO(const struct sock *sk, int newly_acked_sacked, int newly_lost, int flag), @@ -269,7 +293,7 @@ DECLARE_TRACE(tcp_cwnd_reduction_tp, TRACE_EVENT(tcp_probe, - TP_PROTO(struct sock *sk, struct sk_buff *skb), + TP_PROTO(struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 2df6e4035d50b..418c4be697ade 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_FIB_RULES_H -#define __LINUX_FIB_RULES_H +#ifndef _UAPI__LINUX_FIB_RULES_H +#define _UAPI__LINUX_FIB_RULES_H #include #include diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index 1c392dd95a5ee..aa7958b4e41d0 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_IF_ADDR_H -#define __LINUX_IF_ADDR_H +#ifndef _UAPI__LINUX_IF_ADDR_H +#define _UAPI__LINUX_IF_ADDR_H #include #include diff --git a/include/uapi/linux/if_addrlabel.h b/include/uapi/linux/if_addrlabel.h index d1f5974c76e10..e69db764fbba1 100644 --- a/include/uapi/linux/if_addrlabel.h +++ b/include/uapi/linux/if_addrlabel.h @@ -8,8 +8,8 @@ * YOSHIFUJI Hideaki @ USAGI/WIDE */ -#ifndef __LINUX_IF_ADDRLABEL_H -#define __LINUX_IF_ADDRLABEL_H +#ifndef _UAPI__LINUX_IF_ADDRLABEL_H +#define _UAPI__LINUX_IF_ADDRLABEL_H #include diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h index 0824fbc026a1c..b35871cbeed7d 100644 --- a/include/uapi/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h @@ -11,8 +11,8 @@ * */ -#ifndef _LINUX_IF_ALG_H -#define _LINUX_IF_ALG_H +#ifndef _UAPI_LINUX_IF_ALG_H +#define _UAPI_LINUX_IF_ALG_H #include @@ -58,4 +58,4 @@ struct af_alg_iv { #define ALG_OP_DECRYPT 0 #define ALG_OP_ENCRYPT 1 -#endif /* _LINUX_IF_ALG_H */ +#endif /* _UAPI_LINUX_IF_ALG_H */ diff --git a/include/uapi/linux/if_arcnet.h b/include/uapi/linux/if_arcnet.h index b122cfac71288..473569eaf692b 100644 --- a/include/uapi/linux/if_arcnet.h +++ b/include/uapi/linux/if_arcnet.h @@ -14,8 +14,8 @@ * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_ARCNET_H -#define _LINUX_IF_ARCNET_H +#ifndef _UAPI_LINUX_IF_ARCNET_H +#define _UAPI_LINUX_IF_ARCNET_H #include #include @@ -127,4 +127,4 @@ struct archdr { } soft; }; -#endif /* _LINUX_IF_ARCNET_H */ +#endif /* _UAPI_LINUX_IF_ARCNET_H */ diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index d174914a837db..3bcc03f3aa4f0 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -41,8 +41,8 @@ * - added definitions for various XOR hashing policies */ -#ifndef _LINUX_IF_BONDING_H -#define _LINUX_IF_BONDING_H +#ifndef _UAPI_LINUX_IF_BONDING_H +#define _UAPI_LINUX_IF_BONDING_H #include #include @@ -152,4 +152,4 @@ enum { }; #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1) -#endif /* _LINUX_IF_BONDING_H */ +#endif /* _UAPI_LINUX_IF_BONDING_H */ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index a5b743a2f7750..73876c0e2bba6 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -699,10 +699,11 @@ struct br_mdb_entry { #define MDB_TEMPORARY 0 #define MDB_PERMANENT 1 __u8 state; -#define MDB_FLAGS_OFFLOAD (1 << 0) -#define MDB_FLAGS_FAST_LEAVE (1 << 1) -#define MDB_FLAGS_STAR_EXCL (1 << 2) -#define MDB_FLAGS_BLOCKED (1 << 3) +#define MDB_FLAGS_OFFLOAD (1 << 0) +#define MDB_FLAGS_FAST_LEAVE (1 << 1) +#define MDB_FLAGS_STAR_EXCL (1 << 2) +#define MDB_FLAGS_BLOCKED (1 << 3) +#define MDB_FLAGS_OFFLOAD_FAILED (1 << 4) __u8 flags; __u16 vid; struct { @@ -830,6 +831,7 @@ enum br_boolopt_id { BR_BOOLOPT_NO_LL_LEARN, BR_BOOLOPT_MCAST_VLAN_SNOOPING, BR_BOOLOPT_MST_ENABLE, + BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION, BR_BOOLOPT_MAX }; diff --git a/include/uapi/linux/if_fc.h b/include/uapi/linux/if_fc.h index 3e3173282cc36..ff5ab92d16c2d 100644 --- a/include/uapi/linux/if_fc.h +++ b/include/uapi/linux/if_fc.h @@ -18,8 +18,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_FC_H -#define _LINUX_IF_FC_H +#ifndef _UAPI_LINUX_IF_FC_H +#define _UAPI_LINUX_IF_FC_H #include @@ -49,4 +49,4 @@ struct fcllc { __be16 ethertype; /* ether type field */ }; -#endif /* _LINUX_IF_FC_H */ +#endif /* _UAPI_LINUX_IF_FC_H */ diff --git a/include/uapi/linux/if_hippi.h b/include/uapi/linux/if_hippi.h index 785a1452a66c8..42c4ffd11daea 100644 --- a/include/uapi/linux/if_hippi.h +++ b/include/uapi/linux/if_hippi.h @@ -20,8 +20,8 @@ * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_HIPPI_H -#define _LINUX_IF_HIPPI_H +#ifndef _UAPI_LINUX_IF_HIPPI_H +#define _UAPI_LINUX_IF_HIPPI_H #include #include @@ -151,4 +151,4 @@ struct hippi_hdr { struct hippi_snap_hdr snap; } __attribute__((packed)); -#endif /* _LINUX_IF_HIPPI_H */ +#endif /* _UAPI_LINUX_IF_HIPPI_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 318386cc5b0d1..3ad2d5d980347 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1986,4 +1986,19 @@ enum { #define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1) +/* OVPN section */ + +enum ovpn_mode { + OVPN_MODE_P2P, + OVPN_MODE_MP, +}; + +enum { + IFLA_OVPN_UNSPEC, + IFLA_OVPN_MODE, + __IFLA_OVPN_MAX, +}; + +#define IFLA_OVPN_MAX (__IFLA_OVPN_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 1d2718dd96472..6cd1d7a41dfb7 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_IF_PACKET_H -#define __LINUX_IF_PACKET_H +#ifndef _UAPI__LINUX_IF_PACKET_H +#define _UAPI__LINUX_IF_PACKET_H #include #include diff --git a/include/uapi/linux/if_plip.h b/include/uapi/linux/if_plip.h index 495a366112f29..054d86a9c6e6b 100644 --- a/include/uapi/linux/if_plip.h +++ b/include/uapi/linux/if_plip.h @@ -9,8 +9,8 @@ * */ -#ifndef _LINUX_IF_PLIP_H -#define _LINUX_IF_PLIP_H +#ifndef _UAPI_LINUX_IF_PLIP_H +#define _UAPI_LINUX_IF_PLIP_H #include diff --git a/include/uapi/linux/if_slip.h b/include/uapi/linux/if_slip.h index 65937be53103f..299bf7adc8624 100644 --- a/include/uapi/linux/if_slip.h +++ b/include/uapi/linux/if_slip.h @@ -6,8 +6,8 @@ * KISS TNC driver. */ -#ifndef __LINUX_SLIP_H -#define __LINUX_SLIP_H +#ifndef _UAPI__LINUX_SLIP_H +#define _UAPI__LINUX_SLIP_H #define SL_MODE_SLIP 0 #define SL_MODE_CSLIP 1 diff --git a/include/uapi/linux/if_x25.h b/include/uapi/linux/if_x25.h index 3a5938e38370d..861cfa983db44 100644 --- a/include/uapi/linux/if_x25.h +++ b/include/uapi/linux/if_x25.h @@ -13,8 +13,8 @@ * GNU General Public License for more details. */ -#ifndef _IF_X25_H -#define _IF_X25_H +#ifndef _UAPI_IF_X25_H +#define _UAPI_IF_X25_H #include @@ -24,4 +24,4 @@ #define X25_IFACE_DISCONNECT 0x02 #define X25_IFACE_PARAMS 0x03 -#endif /* _IF_X25_H */ +#endif /* _UAPI_IF_X25_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 42869770776ec..44f2bb93e7e69 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -7,8 +7,8 @@ * Magnus Karlsson */ -#ifndef _LINUX_IF_XDP_H -#define _LINUX_IF_XDP_H +#ifndef _UAPI_LINUX_IF_XDP_H +#define _UAPI_LINUX_IF_XDP_H #include @@ -180,4 +180,4 @@ struct xdp_desc { /* TX packet carries valid metadata. */ #define XDP_TX_METADATA (1 << 1) -#endif /* _LINUX_IF_XDP_H */ +#endif /* _UAPI_LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h index 0245269b037c8..85182a839d42a 100644 --- a/include/uapi/linux/ip6_tunnel.h +++ b/include/uapi/linux/ip6_tunnel.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _IP6_TUNNEL_H -#define _IP6_TUNNEL_H +#ifndef _UAPI_IP6_TUNNEL_H +#define _UAPI_IP6_TUNNEL_H #include #include /* For IFNAMSIZ. */ diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 84f622a66a7ac..9dd41c2f58a62 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __NET_DROPMON_H -#define __NET_DROPMON_H +#ifndef _UAPI__NET_DROPMON_H +#define _UAPI__NET_DROPMON_H #include #include diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 383213de612a8..a93e6ea37fb3a 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -7,8 +7,8 @@ * */ -#ifndef _NET_TIMESTAMPING_H -#define _NET_TIMESTAMPING_H +#ifndef _UAPI_NET_TIMESTAMPING_H +#define _UAPI_NET_TIMESTAMPING_H #include #include /* for SO_TIMESTAMPING */ @@ -216,4 +216,4 @@ struct sock_txtime { __u32 flags; /* as defined by enum txtime_flags */ }; -#endif /* _NET_TIMESTAMPING_H */ +#endif /* _UAPI_NET_TIMESTAMPING_H */ diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h index dfa61be43d2f0..ff28200204bbb 100644 --- a/include/uapi/linux/netlink_diag.h +++ b/include/uapi/linux/netlink_diag.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __NETLINK_DIAG_H__ -#define __NETLINK_DIAG_H__ +#ifndef _UAPI__NETLINK_DIAG_H__ +#define _UAPI__NETLINK_DIAG_H__ #include diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h new file mode 100644 index 0000000000000..680d1522dc87c --- /dev/null +++ b/include/uapi/linux/ovpn.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ovpn.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_OVPN_H +#define _UAPI_LINUX_OVPN_H + +#define OVPN_FAMILY_NAME "ovpn" +#define OVPN_FAMILY_VERSION 1 + +#define OVPN_NONCE_TAIL_SIZE 8 + +enum ovpn_cipher_alg { + OVPN_CIPHER_ALG_NONE, + OVPN_CIPHER_ALG_AES_GCM, + OVPN_CIPHER_ALG_CHACHA20_POLY1305, +}; + +enum ovpn_del_peer_reason { + OVPN_DEL_PEER_REASON_TEARDOWN, + OVPN_DEL_PEER_REASON_USERSPACE, + OVPN_DEL_PEER_REASON_EXPIRED, + OVPN_DEL_PEER_REASON_TRANSPORT_ERROR, + OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT, +}; + +enum ovpn_key_slot { + OVPN_KEY_SLOT_PRIMARY, + OVPN_KEY_SLOT_SECONDARY, +}; + +enum { + OVPN_A_PEER_ID = 1, + OVPN_A_PEER_REMOTE_IPV4, + OVPN_A_PEER_REMOTE_IPV6, + OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, + OVPN_A_PEER_REMOTE_PORT, + OVPN_A_PEER_SOCKET, + OVPN_A_PEER_SOCKET_NETNSID, + OVPN_A_PEER_VPN_IPV4, + OVPN_A_PEER_VPN_IPV6, + OVPN_A_PEER_LOCAL_IPV4, + OVPN_A_PEER_LOCAL_IPV6, + OVPN_A_PEER_LOCAL_PORT, + OVPN_A_PEER_KEEPALIVE_INTERVAL, + OVPN_A_PEER_KEEPALIVE_TIMEOUT, + OVPN_A_PEER_DEL_REASON, + OVPN_A_PEER_VPN_RX_BYTES, + OVPN_A_PEER_VPN_TX_BYTES, + OVPN_A_PEER_VPN_RX_PACKETS, + OVPN_A_PEER_VPN_TX_PACKETS, + OVPN_A_PEER_LINK_RX_BYTES, + OVPN_A_PEER_LINK_TX_BYTES, + OVPN_A_PEER_LINK_RX_PACKETS, + OVPN_A_PEER_LINK_TX_PACKETS, + + __OVPN_A_PEER_MAX, + OVPN_A_PEER_MAX = (__OVPN_A_PEER_MAX - 1) +}; + +enum { + OVPN_A_KEYCONF_PEER_ID = 1, + OVPN_A_KEYCONF_SLOT, + OVPN_A_KEYCONF_KEY_ID, + OVPN_A_KEYCONF_CIPHER_ALG, + OVPN_A_KEYCONF_ENCRYPT_DIR, + OVPN_A_KEYCONF_DECRYPT_DIR, + + __OVPN_A_KEYCONF_MAX, + OVPN_A_KEYCONF_MAX = (__OVPN_A_KEYCONF_MAX - 1) +}; + +enum { + OVPN_A_KEYDIR_CIPHER_KEY = 1, + OVPN_A_KEYDIR_NONCE_TAIL, + + __OVPN_A_KEYDIR_MAX, + OVPN_A_KEYDIR_MAX = (__OVPN_A_KEYDIR_MAX - 1) +}; + +enum { + OVPN_A_IFINDEX = 1, + OVPN_A_PEER, + OVPN_A_KEYCONF, + + __OVPN_A_MAX, + OVPN_A_MAX = (__OVPN_A_MAX - 1) +}; + +enum { + OVPN_CMD_PEER_NEW = 1, + OVPN_CMD_PEER_SET, + OVPN_CMD_PEER_GET, + OVPN_CMD_PEER_DEL, + OVPN_CMD_PEER_DEL_NTF, + OVPN_CMD_KEY_NEW, + OVPN_CMD_KEY_GET, + OVPN_CMD_KEY_SWAP, + OVPN_CMD_KEY_SWAP_NTF, + OVPN_CMD_KEY_DEL, + + __OVPN_CMD_MAX, + OVPN_CMD_MAX = (__OVPN_CMD_MAX - 1) +}; + +#define OVPN_MCGRP_PEERS "peers" + +#endif /* _UAPI_LINUX_OVPN_H */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 2c32080416b5a..4908213641656 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_PKT_CLS_H -#define __LINUX_PKT_CLS_H +#ifndef _UAPI__LINUX_PKT_CLS_H +#define _UAPI__LINUX_PKT_CLS_H #include #include diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 25a9a47001cdd..9ea8743957172 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_PKT_SCHED_H -#define __LINUX_PKT_SCHED_H +#ifndef _UAPI__LINUX_PKT_SCHED_H +#define _UAPI__LINUX_PKT_SCHED_H #include #include diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 8f8dc7a937a43..d9735abd4c791 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -36,26 +36,33 @@ struct sockaddr_rxrpc { #define RXRPC_MIN_SECURITY_LEVEL 4 /* minimum security level */ #define RXRPC_UPGRADEABLE_SERVICE 5 /* Upgrade service[0] -> service[1] */ #define RXRPC_SUPPORTED_CMSG 6 /* Get highest supported control message type */ +#define RXRPC_MANAGE_RESPONSE 7 /* [clnt] Want to manage RESPONSE packets */ /* * RxRPC control messages * - If neither abort or accept are specified, the message is a data message. * - terminal messages mean that a user call ID tag can be recycled + * - C/S/- indicate whether these are applicable to client, server or both * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg() */ enum rxrpc_cmsg_type { - RXRPC_USER_CALL_ID = 1, /* sr: user call ID specifier */ - RXRPC_ABORT = 2, /* sr: abort request / notification [terminal] */ - RXRPC_ACK = 3, /* -r: [Service] RPC op final ACK received [terminal] */ - RXRPC_NET_ERROR = 5, /* -r: network error received [terminal] */ - RXRPC_BUSY = 6, /* -r: server busy received [terminal] */ - RXRPC_LOCAL_ERROR = 7, /* -r: local error generated [terminal] */ - RXRPC_NEW_CALL = 8, /* -r: [Service] new incoming call notification */ - RXRPC_EXCLUSIVE_CALL = 10, /* s-: Call should be on exclusive connection */ - RXRPC_UPGRADE_SERVICE = 11, /* s-: Request service upgrade for client call */ - RXRPC_TX_LENGTH = 12, /* s-: Total length of Tx data */ - RXRPC_SET_CALL_TIMEOUT = 13, /* s-: Set one or more call timeouts */ - RXRPC_CHARGE_ACCEPT = 14, /* s-: Charge the accept pool with a user call ID */ + RXRPC_USER_CALL_ID = 1, /* -sr: User call ID specifier */ + RXRPC_ABORT = 2, /* -sr: Abort request / notification [terminal] */ + RXRPC_ACK = 3, /* S-r: RPC op final ACK received [terminal] */ + RXRPC_NET_ERROR = 5, /* --r: Network error received [terminal] */ + RXRPC_BUSY = 6, /* C-r: Server busy received [terminal] */ + RXRPC_LOCAL_ERROR = 7, /* --r: Local error generated [terminal] */ + RXRPC_NEW_CALL = 8, /* S-r: New incoming call notification */ + RXRPC_EXCLUSIVE_CALL = 10, /* Cs-: Call should be on exclusive connection */ + RXRPC_UPGRADE_SERVICE = 11, /* Cs-: Request service upgrade for client call */ + RXRPC_TX_LENGTH = 12, /* -s-: Total length of Tx data */ + RXRPC_SET_CALL_TIMEOUT = 13, /* -s-: Set one or more call timeouts */ + RXRPC_CHARGE_ACCEPT = 14, /* Ss-: Charge the accept pool with a user call ID */ + RXRPC_OOB_ID = 15, /* -sr: OOB message ID */ + RXRPC_CHALLENGED = 16, /* C-r: Info on a received CHALLENGE */ + RXRPC_RESPOND = 17, /* Cs-: Respond to a challenge */ + RXRPC_RESPONDED = 18, /* S-r: Data received in RESPONSE */ + RXRPC_RESP_RXGK_APPDATA = 19, /* Cs-: RESPONSE: RxGK app data to include */ RXRPC__SUPPORTED }; @@ -73,6 +80,7 @@ enum rxrpc_cmsg_type { #define RXRPC_SECURITY_RXKAD 2 /* kaserver or kerberos 4 */ #define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ #define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ +#define RXRPC_SECURITY_YFS_RXGK 6 /* YFS gssapi-based */ /* * RxRPC-level abort codes @@ -118,4 +126,49 @@ enum rxrpc_cmsg_type { #define RXKADDATALEN 19270411 /* user data too long */ #define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ +/* + * RxGK GSSAPI security abort codes. + */ +#if 0 /* Original standard abort codes (used by OpenAFS) */ +#define RXGK_INCONSISTENCY 1233242880 /* Security module structure inconsistent */ +#define RXGK_PACKETSHORT 1233242881 /* Packet too short for security challenge */ +#define RXGK_BADCHALLENGE 1233242882 /* Invalid security challenge */ +#define RXGK_BADETYPE 1233242883 /* Invalid or impermissible encryption type */ +#define RXGK_BADLEVEL 1233242884 /* Invalid or impermissible security level */ +#define RXGK_BADKEYNO 1233242885 /* Key version number not found */ +#define RXGK_EXPIRED 1233242886 /* Token has expired */ +#define RXGK_NOTAUTH 1233242887 /* Caller not authorized */ +#define RXGK_BAD_TOKEN 1233242888 /* Security object was passed a bad token */ +#define RXGK_SEALED_INCON 1233242889 /* Sealed data inconsistent */ +#define RXGK_DATA_LEN 1233242890 /* User data too long */ +#define RXGK_BAD_QOP 1233242891 /* Inadequate quality of protection available */ +#else /* Revised standard abort codes (used by YFS) */ +#define RXGK_INCONSISTENCY 1233242880 /* Security module structure inconsistent */ +#define RXGK_PACKETSHORT 1233242881 /* Packet too short for security challenge */ +#define RXGK_BADCHALLENGE 1233242882 /* Security challenge/response failed */ +#define RXGK_SEALEDINCON 1233242883 /* Sealed data is inconsistent */ +#define RXGK_NOTAUTH 1233242884 /* Caller not authorised */ +#define RXGK_EXPIRED 1233242885 /* Authentication expired */ +#define RXGK_BADLEVEL 1233242886 /* Unsupported or not permitted security level */ +#define RXGK_BADKEYNO 1233242887 /* Bad transport key number */ +#define RXGK_NOTRXGK 1233242888 /* Security layer is not rxgk */ +#define RXGK_UNSUPPORTED 1233242889 /* Endpoint does not support rxgk */ +#define RXGK_GSSERROR 1233242890 /* GSSAPI mechanism error */ +#endif + +/* + * Challenge information in the RXRPC_CHALLENGED control message. + */ +struct rxrpc_challenge { + __u16 service_id; /* The service ID of the connection (may be upgraded) */ + __u8 security_index; /* The security index of the connection */ + __u8 pad; /* Round out to a multiple of 4 bytes. */ + /* ... The security class gets to append extra information ... */ +}; + +struct rxgk_challenge { + struct rxrpc_challenge base; + __u32 enctype; /* Krb5 encoding type */ +}; + #endif /* _UAPI_LINUX_RXRPC_H */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index ec47f9b68a1bf..1d234d7e18927 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -188,6 +188,7 @@ enum LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ LINUX_MIB_TSECRREJECTED, /* TSEcrRejected */ LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */ + LINUX_MIB_PAWS_TW_REJECTED, /* PAWSTimewait */ LINUX_MIB_DELAYEDACKS, /* DelayedACKs */ LINUX_MIB_DELAYEDACKLOCKED, /* DelayedACKLocked */ LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index d85d671deed3c..edca3e430305a 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -43,5 +43,6 @@ struct udphdr { #define UDP_ENCAP_GTP1U 5 /* 3GPP TS 29.060 */ #define UDP_ENCAP_RXRPC 6 #define TCP_ENCAP_ESPINTCP 7 /* Yikes, this is really xfrm encap types. */ +#define UDP_ENCAP_OVPNINUDP 8 /* OpenVPN traffic */ #endif /* _UAPI_LINUX_UDP_H */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 16ba36f34dfab..324c47ab377a7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6391,8 +6391,8 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t) return btf_type_is_int(t); } -static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, - int off) +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) { const struct btf_param *args; const struct btf_type *t; @@ -6541,6 +6541,7 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { { "xprt_put_cong", 0x10 }, /* tcp */ { "tcp_send_reset", 0x11 }, + { "tcp_sendmsg_locked", 0x100 }, /* tegra_apb_dma */ { "tegra_dma_tx_status", 0x100 }, /* timer_migration */ @@ -6671,7 +6672,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tname, off); return false; } - arg = get_ctx_arg_idx(btf, t, off); + arg = btf_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e3a2662f4e336..78cefb41266a1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2293,6 +2293,26 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) return __bpf_list_del(head, true); } +__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + if (list_empty(h) || unlikely(!h->next)) + return NULL; + + return (struct bpf_list_node *)h->next; +} + +__bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + if (list_empty(h) || unlikely(!h->next)) + return NULL; + + return (struct bpf_list_node *)h->prev; +} + __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { @@ -2366,6 +2386,33 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) return (struct bpf_rb_node *)rb_first_cached(r); } +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root) +{ + struct rb_root_cached *r = (struct rb_root_cached *)root; + + return (struct bpf_rb_node *)r->rb_root.rb_node; +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node) +{ + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; + + if (READ_ONCE(node_internal->owner) != root) + return NULL; + + return (struct bpf_rb_node *)node_internal->rb_node.rb_left; +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node) +{ + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; + + if (READ_ONCE(node_internal->owner) != root) + return NULL; + + return (struct bpf_rb_node *)node_internal->rb_node.rb_right; +} + /** * bpf_task_acquire - Acquire a reference to a task. A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling @@ -3209,11 +3256,16 @@ BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 54c6953a8b84c..78a9b3d1cd29e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11987,6 +11987,16 @@ static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_p return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); } +static bool is_rbtree_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]); +} + +static bool is_list_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -12069,6 +12079,8 @@ enum special_kfunc_type { KF_bpf_list_push_back_impl, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_list_front, + KF_bpf_list_back, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, @@ -12076,6 +12088,9 @@ enum special_kfunc_type { KF_bpf_rbtree_remove, KF_bpf_rbtree_add_impl, KF_bpf_rbtree_first, + KF_bpf_rbtree_root, + KF_bpf_rbtree_left, + KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, KF_bpf_dynptr_slice, @@ -12111,11 +12126,16 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_front) +BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_rbtree_root) +BTF_ID(func, bpf_rbtree_left) +BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) @@ -12144,6 +12164,8 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_front) +BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) @@ -12151,6 +12173,9 @@ BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_rbtree_root) +BTF_ID(func, bpf_rbtree_left) +BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) @@ -12579,14 +12604,19 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || - btf_id == special_kfunc_list[KF_bpf_list_pop_back]; + btf_id == special_kfunc_list[KF_bpf_list_pop_back] || + btf_id == special_kfunc_list[KF_bpf_list_front] || + btf_id == special_kfunc_list[KF_bpf_list_back]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - btf_id == special_kfunc_list[KF_bpf_rbtree_first]; + btf_id == special_kfunc_list[KF_bpf_rbtree_first] || + btf_id == special_kfunc_list[KF_bpf_rbtree_root] || + btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + btf_id == special_kfunc_list[KF_bpf_rbtree_right]; } static bool is_bpf_iter_num_api_kfunc(u32 btf_id) @@ -12686,7 +12716,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, break; case BPF_RB_NODE: ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]); + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]); break; default: verbose(env, "verifier internal error: unexpected graph node argument type %s\n", @@ -13200,26 +13232,25 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_RB_NODE: - if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) { - if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { - verbose(env, "rbtree_remove node input must be non-owning ref\n"); + if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } - if (in_rbtree_lock_required_cb(env)) { - verbose(env, "rbtree_remove not allowed in rbtree cb\n"); + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } - if (!reg->ref_obj_id) { - verbose(env, "allocated object must be referenced\n"); + if (in_rbtree_lock_required_cb(env)) { + verbose(env, "%s not allowed in rbtree cb\n", func_name); return -EINVAL; } } - ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); if (ret < 0) return ret; @@ -13745,13 +13776,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); - } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || - meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { + } else if (is_list_node_type(ptr_type)) { struct btf_field *field = meta.arg_list_head.field; mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] || - meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + } else if (is_rbtree_node_type(ptr_type)) { struct btf_field *field = meta.arg_rbtree_root.field; mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); @@ -13881,7 +13910,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_ret_null(&meta)) regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, ®s[BPF_REG_0]); } diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 8aafd050b754b..e81327d2cd639 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -112,3 +112,8 @@ CONFIG_BRANCH_PROFILE_NONE=y CONFIG_DYNAMIC_FTRACE=y CONFIG_FTRACE=y CONFIG_FUNCTION_TRACER=y +# +# Preemption +# +CONFIG_DEBUG_PREEMPT=y +CONFIG_PREEMPT=y diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1715e34b91af4..244111c88183e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -897,9 +897,7 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif -#ifdef CONFIG_PAGE_POOL - ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | -#endif + page_pool_page_is_pp(page) | (page->flags & check_flags))) return false; @@ -926,10 +924,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif -#ifdef CONFIG_PAGE_POOL - if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) + if (unlikely(page_pool_page_is_pp(page))) bad_reason = "page_pool leak"; -#endif return bad_reason; } diff --git a/net/802/Makefile b/net/802/Makefile index bfed80221b8bc..99abc29d537c0 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -3,12 +3,11 @@ # Makefile for the Linux 802.x protocol layers. # -# Check the p8022 selections against net/core/Makefile. -obj-$(CONFIG_LLC) += p8022.o psnap.o +obj-$(CONFIG_LLC) += psnap.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o obj-$(CONFIG_HIPPI) += hippi.o -obj-$(CONFIG_ATALK) += p8022.o psnap.o +obj-$(CONFIG_ATALK) += psnap.o obj-$(CONFIG_STP) += stp.o obj-$(CONFIG_GARP) += garp.o obj-$(CONFIG_MRP) += mrp.o diff --git a/net/802/p8022.c b/net/802/p8022.c deleted file mode 100644 index 78c25168d7c91..0000000000000 --- a/net/802/p8022.c +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * NET3: Support for 802.2 demultiplexing off Ethernet - * - * Demultiplex 802.2 encoded protocols. We match the entry by the - * SSAP/DSAP pair and then deliver to the registered datalink that - * matches. The control byte is ignored and handling of such items - * is up to the routine passed the frame. - * - * Unlike the 802.3 datalink we have a list of 802.2 entries as - * there are multiple protocols to demux. The list is currently - * short (3 or 4 entries at most). The current demux assumes this. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, - const unsigned char *dest) -{ - llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap); - return 0; -} - -struct datalink_proto *register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)) -{ - struct datalink_proto *proto; - - proto = kmalloc(sizeof(*proto), GFP_ATOMIC); - if (proto) { - proto->type[0] = type; - proto->header_length = 3; - proto->request = p8022_request; - proto->sap = llc_sap_open(type, func); - if (!proto->sap) { - kfree(proto); - proto = NULL; - } - } - return proto; -} - -void unregister_8022_client(struct datalink_proto *proto) -{ - llc_sap_put(proto->sap); - kfree(proto); -} - -EXPORT_SYMBOL(register_8022_client); -EXPORT_SYMBOL(unregister_8022_client); - -MODULE_DESCRIPTION("Support for 802.2 demultiplexing off Ethernet"); -MODULE_LICENSE("GPL"); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 41be38264493d..06908e37c3d94 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/net/Kconfig b/net/Kconfig index c3fca69a7c834..202cc595e5e6f 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -245,7 +245,6 @@ source "net/bridge/netfilter/Kconfig" endif -source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/rds/Kconfig" source "net/tipc/Kconfig" diff --git a/net/Makefile b/net/Makefile index 60ed5190eda8d..aac960c41db6d 100644 --- a/net/Makefile +++ b/net/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_PHONET) += phonet/ ifneq ($(CONFIG_VLAN_8021Q),) obj-y += 8021q/ endif -obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ obj-$(CONFIG_RDS) += rds/ obj-$(CONFIG_WIRELESS) += wireless/ diff --git a/net/bridge/br.c b/net/bridge/br.c index 183fcb362f9e6..0adeafe11a365 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -284,6 +284,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MST_ENABLE: err = br_mst_set_enabled(br, on, extack); break; + case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: + br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -302,6 +305,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); case BR_BOOLOPT_MST_ENABLE: return br_opt_get(br, BROPT_MST_ENABLED); + case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: + return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -363,21 +368,20 @@ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) clear_bit(opt, &br->options); } -static void __net_exit br_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit br_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { struct net_device *dev; - struct net *net; - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - for_each_netdev(net, dev) - if (netif_is_bridge_master(dev)) - br_dev_delete(dev, dev_to_kill); + ASSERT_RTNL_NET(net); + + for_each_netdev(net, dev) + if (netif_is_bridge_master(dev)) + br_dev_delete(dev, dev_to_kill); } static struct pernet_operations br_net_ops = { - .exit_batch_rtnl = br_net_exit_batch_rtnl, + .exit_rtnl = br_net_exit_rtnl, }; static const struct stp_proto br_stp_proto = { diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 115a23054a58d..1e2b51769eec8 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -160,6 +160,9 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { if (br_is_neigh_suppress_enabled(p, vid)) return; + if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + parp->ar_op == htons(ARPOP_REQUEST)) + return; if (parp->ar_op != htons(ARPOP_RREQUEST) && parp->ar_op != htons(ARPOP_RREPLY) && (ipv4_is_zeronet(sip) || sip == tip)) { @@ -410,6 +413,10 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, if (br_is_neigh_suppress_enabled(p, vid)) return; + if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) + return; + if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT && !msg->icmph.icmp6_solicited) { /* prevent flooding to neigh suppress ports */ diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 232133a0fd21c..5f6ac9bf15275 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -189,7 +189,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || - br_multicast_is_router(brmctx, skb)) { + br_multicast_is_router(brmctx, skb) || + br->dev->flags & IFF_ALLMULTI) { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 722203b98ff74..400eb872b4032 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -144,6 +144,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_STAR_EXCL; if (flags & MDB_PG_FLAGS_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; + if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED) + e->flags |= MDB_FLAGS_OFFLOAD_FAILED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, @@ -517,16 +519,17 @@ static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg) rtnl_mdb_nlmsg_pg_size(pg); } -void br_mdb_notify(struct net_device *dev, - struct net_bridge_mdb_entry *mp, - struct net_bridge_port_group *pg, - int type) +static void __br_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type, bool notify_switchdev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; - br_switchdev_mdb_notify(dev, mp, pg, type); + if (notify_switchdev) + br_switchdev_mdb_notify(dev, mp, pg, type); skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) @@ -544,6 +547,21 @@ void br_mdb_notify(struct net_device *dev, rtnl_set_sk_err(net, RTNLGRP_MDB, err); } +void br_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type) +{ + __br_mdb_notify(dev, mp, pg, type, true); +} + +void br_mdb_flag_change_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg) +{ + __br_mdb_notify(dev, mp, pg, RTM_NEWMDB, false); +} + static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, int ifindex, u16 vid, u32 pid, diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 1820f09ff59ce..3f24b4ee49c27 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -80,10 +80,10 @@ static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg, if (br_vlan_get_state(v) == state) return; - br_vlan_set_state(v, state); - if (v->vid == vg->pvid) br_vlan_set_pvid_state(vg, state); + + br_vlan_set_state(v, state); } int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index dcbf058de1e3b..7e0b2362b9ee5 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2105,12 +2105,17 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) } } -void br_multicast_enable_port(struct net_bridge_port *port) +static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; spin_lock_bh(&br->multicast_lock); - __br_multicast_enable_port_ctx(&port->multicast_ctx); + if (br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { + spin_unlock_bh(&br->multicast_lock); + return; + } + __br_multicast_enable_port_ctx(pmctx); spin_unlock_bh(&br->multicast_lock); } @@ -2137,11 +2142,67 @@ static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) br_multicast_rport_del_notify(pmctx, del); } +static void br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) +{ + struct net_bridge *br = pmctx->port->br; + + spin_lock_bh(&br->multicast_lock); + if (br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { + spin_unlock_bh(&br->multicast_lock); + return; + } + + __br_multicast_disable_port_ctx(pmctx); + spin_unlock_bh(&br->multicast_lock); +} + +static void br_multicast_toggle_port(struct net_bridge_port *port, bool on) +{ +#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) + if (br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + rcu_read_lock(); + vg = nbp_vlan_group_rcu(port); + if (!vg) { + rcu_read_unlock(); + return; + } + + /* iterate each vlan, toggle vlan multicast context */ + list_for_each_entry_rcu(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast_port *pmctx = + &vlan->port_mcast_ctx; + u8 state = br_vlan_get_state(vlan); + /* enable vlan multicast context when state is + * LEARNING or FORWARDING + */ + if (on && br_vlan_state_allowed(state, true)) + br_multicast_enable_port_ctx(pmctx); + else + br_multicast_disable_port_ctx(pmctx); + } + rcu_read_unlock(); + return; + } +#endif + /* toggle port multicast context when vlan snooping is disabled */ + if (on) + br_multicast_enable_port_ctx(&port->multicast_ctx); + else + br_multicast_disable_port_ctx(&port->multicast_ctx); +} + +void br_multicast_enable_port(struct net_bridge_port *port) +{ + br_multicast_toggle_port(port, true); +} + void br_multicast_disable_port(struct net_bridge_port *port) { - spin_lock_bh(&port->br->multicast_lock); - __br_multicast_disable_port_ctx(&port->multicast_ctx); - spin_unlock_bh(&port->br->multicast_lock); + br_multicast_toggle_port(port, false); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) @@ -4211,6 +4272,32 @@ static void __br_multicast_stop(struct net_bridge_mcast *brmctx) #endif } +void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state) +{ +#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) + struct net_bridge *br; + + if (!br_vlan_should_use(v)) + return; + + if (br_vlan_is_master(v)) + return; + + br = v->port->br; + + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return; + + if (br_vlan_state_allowed(state, true)) + br_multicast_enable_port_ctx(&v->port_mcast_ctx); + + /* Multicast is not disabled for the vlan when it goes in + * blocking state because the timers will expire and stop by + * themselves without sending more queries. + */ +#endif +} + void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { struct net_bridge *br; @@ -4304,9 +4391,9 @@ int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, __br_multicast_open(&br->multicast_ctx); list_for_each_entry(p, &br->port_list, list) { if (on) - br_multicast_disable_port(p); + br_multicast_disable_port_ctx(&p->multicast_ctx); else - br_multicast_enable_port(p); + br_multicast_enable_port_ctx(&p->multicast_ctx); } list_for_each_entry(vlan, &vg->vlan_list, vlist) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d5b3c5936a79e..db1bddb330ffe 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -306,11 +306,12 @@ struct net_bridge_fdb_flush_desc { u16 vlan_id; }; -#define MDB_PG_FLAGS_PERMANENT BIT(0) -#define MDB_PG_FLAGS_OFFLOAD BIT(1) -#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) -#define MDB_PG_FLAGS_STAR_EXCL BIT(3) -#define MDB_PG_FLAGS_BLOCKED BIT(4) +#define MDB_PG_FLAGS_PERMANENT BIT(0) +#define MDB_PG_FLAGS_OFFLOAD BIT(1) +#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) +#define MDB_PG_FLAGS_STAR_EXCL BIT(3) +#define MDB_PG_FLAGS_BLOCKED BIT(4) +#define MDB_PG_FLAGS_OFFLOAD_FAILED BIT(5) #define PG_SRC_ENT_LIMIT 32 @@ -483,6 +484,7 @@ enum net_bridge_opts { BROPT_VLAN_BRIDGE_BINDING, BROPT_MCAST_VLAN_SNOOPING_ENABLED, BROPT_MST_ENABLED, + BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, }; struct net_bridge { @@ -1002,6 +1004,8 @@ int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); +void br_mdb_flag_change_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg); void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type); void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, @@ -1051,6 +1055,7 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx); void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); +void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state); void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); @@ -1342,6 +1347,22 @@ br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx)); } + +static inline void +br_multicast_set_pg_offload_flags(struct net_bridge_port_group *p, + bool offloaded) +{ + p->flags &= ~(MDB_PG_FLAGS_OFFLOAD | MDB_PG_FLAGS_OFFLOAD_FAILED); + p->flags |= (offloaded ? MDB_PG_FLAGS_OFFLOAD : + MDB_PG_FLAGS_OFFLOAD_FAILED); +} + +static inline bool +br_mdb_should_notify(const struct net_bridge *br, u8 changed_flags) +{ + return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION) && + (changed_flags & MDB_PG_FLAGS_OFFLOAD_FAILED); +} #else static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, @@ -1501,6 +1522,11 @@ static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pm { } +static inline void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, + u8 state) +{ +} + static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { @@ -1861,7 +1887,9 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts); -/* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ +/* vlan state manipulation helpers using *_ONCE to annotate lock-free access, + * while br_vlan_set_state() may access data protected by multicast_lock. + */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) { return READ_ONCE(v->state); @@ -1870,6 +1898,7 @@ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) { WRITE_ONCE(v->state, state); + br_multicast_update_vlan_mcast_ctx(v, state); } static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 7b41ee8740cbb..95d7355a04074 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -504,9 +504,10 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; + u8 old_flags; - if (err) - goto err; + if (err == -EOPNOTSUPP) + goto out_free; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &data->ip); @@ -516,11 +517,15 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri pp = &p->next) { if (p->key.port != port) continue; - p->flags |= MDB_PG_FLAGS_OFFLOAD; + + old_flags = p->flags; + br_multicast_set_pg_offload_flags(p, !err); + if (br_mdb_should_notify(br, old_flags ^ p->flags)) + br_mdb_flag_change_notify(br->dev, mp, p); } out: spin_unlock_bh(&br->multicast_lock); -err: +out_free: kfree(priv); } diff --git a/net/core/datagram.c b/net/core/datagram.c index f0693707aece4..f0634f0cb8346 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -163,8 +163,7 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb) return skb; } -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) @@ -261,7 +260,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + skb = __skb_try_recv_from_queue(queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) diff --git a/net/core/dev.c b/net/core/dev.c index 1be7cb73a6024..d1a8cad0c99c4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -828,7 +828,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) dev_hold(dev); rcu_read_unlock(); - dev = __netdev_put_lock(dev); + dev = __netdev_put_lock(dev, net); if (!dev) return NULL; @@ -1039,10 +1039,11 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) * This helper is intended for locking net_device after it has been looked up * using a lockless lookup helper. Lock prevents the instance from going away. */ -struct net_device *__netdev_put_lock(struct net_device *dev) +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) { netdev_lock(dev); - if (dev->reg_state > NETREG_REGISTERED) { + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { netdev_unlock(dev); dev_put(dev); return NULL; @@ -1051,6 +1052,20 @@ struct net_device *__netdev_put_lock(struct net_device *dev) return dev; } +static struct net_device * +__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net) +{ + netdev_lock_ops_compat(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock_ops_compat(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + /** * netdev_get_by_index_lock() - find a device by its ifindex * @net: the applicable net namespace @@ -1070,7 +1085,19 @@ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) if (!dev) return NULL; - return __netdev_put_lock(dev); + return __netdev_put_lock(dev, net); +} + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock_ops_compat(dev, net); } struct net_device * @@ -1090,7 +1117,32 @@ netdev_xa_find_lock(struct net *net, struct net_device *dev, dev_hold(dev); rcu_read_unlock(); - dev = __netdev_put_lock(dev); + dev = __netdev_put_lock(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock_ops_compat(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock_ops_compat(dev, net); if (dev) return dev; @@ -4946,7 +4998,8 @@ static void rps_trigger_softirq(void *data) struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); - sd->received_rps++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->received_rps, sd->received_rps + 1); } #endif /* CONFIG_RPS */ @@ -5031,7 +5084,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + new_flow = hash_32(skb_get_hash(skb), fl->log_buckets); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; @@ -5042,7 +5095,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { - fl->count++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(fl->count, fl->count + 1); rcu_read_unlock(); return true; } @@ -7515,7 +7569,8 @@ static __latent_entropy void net_rx_action(void) */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { - sd->time_squeeze++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break; } } @@ -11057,8 +11112,7 @@ int register_netdevice(struct net_device *dev) * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: @@ -11981,8 +12035,7 @@ void unregister_netdevice_many_notify(struct list_head *head, */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); @@ -12156,7 +12209,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, netif_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); - netdev_unlock_ops(dev); + + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->moving_ns = true; + netdev_unlock(dev); synchronize_net(); @@ -12192,7 +12249,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ + netdev_lock(dev); dev_net_set(dev, net); + netdev_unlock(dev); dev->ifindex = new_ifindex; if (new_name[0]) { @@ -12218,7 +12277,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = netdev_change_owner(dev, net_old, net); WARN_ON(err); - netdev_lock_ops(dev); + netdev_lock(dev); + dev->moving_ns = false; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); + /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ diff --git a/net/core/dev.h b/net/core/dev.h index 7ee203395d8e2..e93f36b7ddf36 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -15,8 +15,9 @@ struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { - u64 count; - unsigned int num_buckets; + struct rcu_head rcu; + unsigned int count; + u8 log_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; u8 buckets[]; @@ -29,7 +30,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); -struct net_device *__netdev_put_lock(struct net_device *dev); +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index); @@ -41,6 +42,21 @@ DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T)); (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \ ifindex++) +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex); +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *, + if (_T) netdev_unlock_ops_compat(_T)); + +#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \ + (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \ + &ifindex)); \ + ifindex++) + #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 7af302080a660..8ca634964e363 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -874,13 +874,14 @@ int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held) { struct fib_rule *rule = NULL, *r, *last = NULL; - struct fib_rule_hdr *frh = nlmsg_data(nlh); int err = -EINVAL, unresolved = 0; struct fib_rules_ops *ops = NULL; struct nlattr *tb[FRA_MAX + 1]; bool user_priority = false; + struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -1002,13 +1003,14 @@ int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held) { struct fib_rule *rule = NULL, *nlrule = NULL; - struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct nlattr *tb[FRA_MAX+1]; bool user_priority = false; + struct fib_rule_hdr *frh; int err = -EINVAL; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -1260,12 +1262,12 @@ static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, { struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } - frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c index 941e26c1343de..9e9fb25314b9c 100644 --- a/net/core/lock_debug.c +++ b/net/core/lock_debug.c @@ -18,9 +18,12 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, /* Keep enum and don't add default to trigger -Werror=switch */ switch (cmd) { + case NETDEV_XDP_FEAT_CHANGE: + netdev_assert_locked(dev); + fallthrough; + case NETDEV_CHANGE: case NETDEV_REGISTER: case NETDEV_UP: - case NETDEV_CHANGE: netdev_ops_assert_locked(dev); fallthrough; case NETDEV_DOWN: @@ -58,7 +61,6 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: - case NETDEV_XDP_FEAT_CHANGE: ASSERT_RTNL(); break; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a07249b59ae1e..254067b719da3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2430,12 +2430,12 @@ static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, { struct ndtmsg *ndtm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { + ndtm = nlmsg_payload(nlh, sizeof(*ndtm)); + if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } - ndtm = nlmsg_data(nlh); if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; @@ -2747,12 +2747,12 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, if (strict_check) { struct ndmsg *ndm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); @@ -2855,12 +2855,12 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 3e92bf0f9060b..4f0f0709a1cbc 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -132,8 +132,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); + /* Pairs with WRITE_ONCE() in skb_flow_limit() */ if (fl) - flow_limit_count = fl->count; + flow_limit_count = READ_ONCE(fl->count); rcu_read_unlock(); #endif @@ -144,11 +145,11 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - sd->processed, atomic_read(&sd->dropped), - sd->time_squeeze, 0, + READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ - sd->received_rps, flow_limit_count, + READ_ONCE(sd->received_rps), flow_limit_count, input_qlen + process_qlen, (int)seq->index, input_qlen, process_qlen); return 0; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b0dfdf791ece5..48dd6dc603c9e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -163,16 +163,45 @@ static void ops_pre_exit_list(const struct pernet_operations *ops, } } +static void ops_exit_rtnl_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + const struct pernet_operations *saved_ops = ops; + LIST_HEAD(dev_kill_list); + struct net *net; + + rtnl_lock(); + + list_for_each_entry(net, net_exit_list, exit_list) { + __rtnl_net_lock(net); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_rtnl) + ops->exit_rtnl(net, &dev_kill_list); + } + + __rtnl_net_unlock(net); + } + + unregister_netdevice_many(&dev_kill_list); + + rtnl_unlock(); +} + static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { - struct net *net; if (ops->exit) { + struct net *net; + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } + if (ops->exit_batch) ops->exit_batch(net_exit_list); } @@ -188,6 +217,56 @@ static void ops_free_list(const struct pernet_operations *ops, } } +static void ops_undo_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list, + bool expedite_rcu) +{ + const struct pernet_operations *saved_ops; + bool hold_rtnl = false; + + if (!ops) + ops = list_entry(ops_list, typeof(*ops), list); + + saved_ops = ops; + + list_for_each_entry_continue_reverse(ops, ops_list, list) { + hold_rtnl |= !!ops->exit_rtnl; + ops_pre_exit_list(ops, net_exit_list); + } + + /* Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so the + * rcu_barrier() after ops_undo_list() isn't sufficient alone. + * Also the pre_exit() and exit() methods need this barrier. + */ + if (expedite_rcu) + synchronize_rcu_expedited(); + else + synchronize_rcu(); + + if (hold_rtnl) + ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_exit_list(ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_free_list(ops, net_exit_list); +} + +static void ops_undo_single(struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + LIST_HEAD(ops_list); + + list_add(&ops->list, &ops_list); + ops_undo_list(&ops_list, NULL, net_exit_list, false); + list_del(&ops->list); +} + /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { @@ -351,9 +430,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ static __net_init int setup_net(struct net *net) { /* Must be called with pernet_ops_rwsem held */ - const struct pernet_operations *ops, *saved_ops; + const struct pernet_operations *ops; LIST_HEAD(net_exit_list); - LIST_HEAD(dev_kill_list); int error = 0; preempt_disable(); @@ -376,29 +454,7 @@ static __net_init int setup_net(struct net *net) * for the pernet modules whose init functions did not fail. */ list_add(&net->exit_list, &net_exit_list); - saved_ops = ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - synchronize_rcu(); - - ops = saved_ops; - rtnl_lock(); - list_for_each_entry_continue_reverse(ops, &pernet_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); - + ops_undo_list(&pernet_list, ops, &net_exit_list, false); rcu_barrier(); goto out; } @@ -594,11 +650,9 @@ struct task_struct *cleanup_net_task; static void cleanup_net(struct work_struct *work) { - const struct pernet_operations *ops; - struct net *net, *tmp, *last; struct llist_node *net_kill_list; + struct net *net, *tmp, *last; LIST_HEAD(net_exit_list); - LIST_HEAD(dev_kill_list); cleanup_net_task = current; @@ -629,33 +683,7 @@ static void cleanup_net(struct work_struct *work) list_add_tail(&net->exit_list, &net_exit_list); } - /* Run all of the network namespace pre_exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - /* - * Another CPU might be rcu-iterating the list, wait for it. - * This needs to be before calling the exit() notifiers, so - * the rcu_barrier() below isn't sufficient alone. - * Also the pre_exit() and exit() methods need this barrier. - */ - synchronize_rcu_expedited(); - - rtnl_lock(); - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - - /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - /* Free the net generic variables */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); + ops_undo_list(&pernet_list, NULL, &net_exit_list, true); up_read(&pernet_ops_rwsem); @@ -1239,31 +1267,13 @@ void __init net_ns_init(void) rtnl_register_many(net_ns_rtnl_msg_handlers); } -static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) -{ - ops_pre_exit_list(ops, net_exit_list); - synchronize_rcu(); - - if (ops->exit_batch_rtnl) { - LIST_HEAD(dev_kill_list); - - rtnl_lock(); - ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } - ops_exit_list(ops, net_exit_list); - - ops_free_list(ops, net_exit_list); -} - #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + LIST_HEAD(net_exit_list); struct net *net; int error; - LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); if (ops->init || ops->id) { @@ -1282,21 +1292,21 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { - struct net *net; LIST_HEAD(net_exit_list); + struct net *net; - list_del(&ops->list); /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + list_del(&ops->list); + ops_undo_single(ops, &net_exit_list); } #else @@ -1304,22 +1314,23 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { - if (!init_net_initialized) { - list_add_tail(&ops->list, list); + list_add_tail(&ops->list, list); + + if (!init_net_initialized) return 0; - } return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { - if (!init_net_initialized) { - list_del(&ops->list); - } else { + list_del(&ops->list); + + if (init_net_initialized) { LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); } } diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 5d7af50fe7027..2c104947d224f 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -38,6 +38,8 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, u64 xdp_rx_meta = 0; void *hdr; + netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */ + hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; @@ -122,15 +124,14 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - - netdev = __dev_get_by_index(genl_info_net(info), ifindex); - if (netdev) - err = netdev_nl_dev_fill(netdev, rsp, info); - else + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { err = -ENODEV; + goto err_free_msg; + } - rtnl_unlock(); + err = netdev_nl_dev_fill(netdev, rsp, info); + netdev_unlock(netdev); if (err) goto err_free_msg; @@ -146,18 +147,15 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); - struct net_device *netdev; - int err = 0; + int err; - rtnl_lock(); - for_each_netdev_dump(net, netdev, ctx->ifindex) { + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) - break; + return err; } - rtnl_unlock(); - return err; + return 0; } static int @@ -481,18 +479,15 @@ int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - - netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info), + ifindex); if (netdev) { err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); - netdev_unlock(netdev); + netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } - rtnl_unlock(); - if (err) goto err_free_msg; @@ -541,17 +536,17 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = netdev_get_by_index_lock(net, ifindex); + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); if (netdev) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); - netdev_unlock(netdev); + netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } } else { - for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { + for_each_netdev_lock_ops_compat_scoped(net, netdev, + ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; @@ -559,7 +554,6 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) ctx->txq_idx = 0; } } - rtnl_unlock(); return err; } @@ -801,26 +795,31 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = __dev_get_by_index(net, ifindex); - if (netdev && netdev->stat_ops) { - err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, - info, ctx); - } else { + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (!netdev) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); - err = netdev ? -EOPNOTSUPP : -ENODEV; + return -ENODEV; } - } else { - for_each_netdev_dump(net, netdev, ctx->ifindex) { + if (netdev->stat_ops) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); - if (err < 0) - break; + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = -EOPNOTSUPP; } + netdev_unlock_ops_compat(netdev); + return err; + } + + for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + if (err < 0) + break; } - rtnl_unlock(); return err; } @@ -964,10 +963,14 @@ static int netdev_genl_netdevice_event(struct notifier_block *nb, switch (event) { case NETDEV_REGISTER: + netdev_lock_ops_to_full(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); + netdev_unlock_full_to_ops(netdev); break; case NETDEV_UNREGISTER: + netdev_lock(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); + netdev_unlock(netdev); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 7eadb8393e002..cd95394399b40 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -5,7 +5,7 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp_magic; + return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) @@ -15,9 +15,16 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) static inline void netmem_clear_pp_magic(netmem_ref netmem) { + WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + __netmem_clear_lsb(netmem)->pp_magic = 0; } +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; +} + static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) { __netmem_clear_lsb(netmem)->pp = pool; @@ -28,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem, { __netmem_clear_lsb(netmem)->dma_addr = dma_addr; } + +static inline unsigned long netmem_get_dma_index(netmem_ref netmem) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return 0; + + magic = __netmem_clear_lsb(netmem)->pp_magic; + + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; +} + +static inline void netmem_set_dma_index(netmem_ref netmem, + unsigned long id) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return; + + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); + __netmem_clear_lsb(netmem)->pp_magic = magic; +} #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 7745ad924ae2d..2b76848659418 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -276,8 +276,7 @@ static int page_pool_init(struct page_pool *pool, /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->dma_map) - get_device(pool->p.dev); + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { netdev_assert_locked(pool->slow.netdev); @@ -320,9 +319,7 @@ static int page_pool_init(struct page_pool *pool, static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->dma_map) - put_device(pool->p.dev); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) @@ -463,13 +460,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (pool->dma_sync) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } } -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; + int err; + u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -483,15 +488,30 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) if (dma_mapping_error(pool->p.dev, dma)) return false; - if (page_pool_set_dma_addr_netmem(netmem, dma)) + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; + } + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto unset_failed; + } + netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; +unset_failed: + page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: - WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); @@ -508,7 +528,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } @@ -554,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; - if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { put_page(netmem_to_page(netmem)); continue; } @@ -656,6 +676,8 @@ void page_pool_clear_pp_info(netmem_ref netmem) static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, netmem_ref netmem) { + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -664,6 +686,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, */ return; + id = netmem_get_dma_index(netmem); + if (!id) + return; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return; + dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ @@ -671,6 +704,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); + netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need @@ -1080,8 +1114,29 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) static void page_pool_scrub(struct page_pool *pool) { + unsigned long id; + void *ptr; + page_pool_empty_alloc_cache_once(pool); - pool->destroy_cnt++; + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* Disable page_pool_dma_sync_for_device() */ + pool->dma_sync = false; + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_page_dma(pool, page_to_netmem(ptr)); + } /* No more consumers should exist, but producers could still * be in-flight. diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fe7fdefab994c..0ebe5461d4d96 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -177,7 +177,7 @@ #define MAX_IMIX_ENTRIES 20 #define IMIX_PRECISION 100 /* Precision of IMIX distribution */ -#define func_enter() pr_debug("entering %s\n", __func__); +#define func_enter() pr_debug("entering %s\n", __func__) #define PKT_FLAGS \ pf(IPV6) /* Interface in IPV6 Mode */ \ @@ -227,12 +227,12 @@ static char *pkt_flag_names[] = { /* Xmit modes */ #define M_START_XMIT 0 /* Default normal TX */ -#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ #define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ /* If lock -- protects updating of if_list */ -#define if_lock(t) mutex_lock(&(t->if_lock)); -#define if_unlock(t) mutex_unlock(&(t->if_lock)); +#define if_lock(t) mutex_lock(&(t->if_lock)) +#define if_unlock(t) mutex_unlock(&(t->if_lock)) /* Used to help with determining the pkts on receive */ #define PKTGEN_MAGIC 0xbe9be955 @@ -283,7 +283,8 @@ struct pktgen_dev { int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ + * removal by worker thread + */ struct page *page; u64 delay; /* nano-seconds */ @@ -346,10 +347,12 @@ struct pktgen_dev { __u16 udp_dst_max; /* exclusive, dest UDP port */ /* DSCP + ECN */ - __u8 tos; /* six MSB of (former) IPv4 TOS - are for dscp codepoint */ - __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 - (see RFC 3260, sec. 4) */ + __u8 tos; /* six MSB of (former) IPv4 TOS + * are for dscp codepoint + */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + * (see RFC 3260, sec. 4) + */ /* IMIX */ unsigned int n_imix_entries; @@ -389,12 +392,12 @@ struct pktgen_dev { __u8 hh[14]; /* = { - 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, - - We fill in SRC address later - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x08, 0x00 - }; + * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + * + * We fill in SRC address later + * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + * 0x08, 0x00 + * }; */ __u16 pad; /* pad out the hh struct to an even 16 bytes */ @@ -458,7 +461,8 @@ struct pktgen_thread { char result[512]; /* Field for thread to receive "posted" events terminate, - stop ifs etc. */ + * stop ifs etc. + */ u32 control; int cpu; @@ -472,8 +476,7 @@ struct pktgen_thread { #define FIND 0 static const char version[] = - "Packet Generator for packet performance testing. " - "Version: " VERSION "\n"; + "Packet Generator for packet performance testing. Version: " VERSION "\n"; static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); @@ -624,8 +627,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, "%pM\n", pkt_dev->dst_mac); seq_printf(seq, - " udp_src_min: %d udp_src_max: %d" - " udp_dst_min: %d udp_dst_max: %d\n", + " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); @@ -754,6 +756,7 @@ static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen, for (; i < maxlen; i++) { int value; char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; value = hex_to_bin(c); @@ -773,6 +776,7 @@ static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen) for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -799,6 +803,7 @@ static ssize_t num_arg(const char __user *user_buffer, size_t maxlen, for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; if ((c >= '0') && (c <= '9')) { @@ -816,6 +821,7 @@ static ssize_t strn_len(const char __user *user_buffer, size_t maxlen) for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -974,8 +980,8 @@ static __u32 pktgen_read_flag(const char *f, bool *disable) } static ssize_t pktgen_if_write(struct file *file, - const char __user * user_buffer, size_t count, - loff_t * offset) + const char __user *user_buffer, size_t count, + loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; @@ -1307,9 +1313,9 @@ static ssize_t pktgen_if_write(struct file *file, put_page(pkt_dev->page); pkt_dev->page = NULL; } - } - else + } else { sprintf(pg_result, "ERROR: node not possible"); + } return count; } if (!strcmp(name, "xmit_mode")) { @@ -1413,8 +1419,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_min) != 0) { - memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); - strcpy(pkt_dev->dst_min, buf); + strscpy_pad(pkt_dev->dst_min, buf); pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); pkt_dev->cur_daddr = pkt_dev->daddr_min; } @@ -1434,8 +1439,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_max) != 0) { - memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); - strcpy(pkt_dev->dst_max, buf); + strscpy_pad(pkt_dev->dst_max, buf); pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); pkt_dev->cur_daddr = pkt_dev->daddr_max; } @@ -1544,8 +1548,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_min) != 0) { - memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); - strcpy(pkt_dev->src_min, buf); + strscpy_pad(pkt_dev->src_min, buf); pkt_dev->saddr_min = in_aton(pkt_dev->src_min); pkt_dev->cur_saddr = pkt_dev->saddr_min; } @@ -1565,8 +1568,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_max) != 0) { - memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); - strcpy(pkt_dev->src_max, buf); + strscpy_pad(pkt_dev->src_max, buf); pkt_dev->saddr_max = in_aton(pkt_dev->src_max); pkt_dev->cur_saddr = pkt_dev->saddr_max; } @@ -1909,8 +1911,8 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) } static ssize_t pktgen_thread_write(struct file *file, - const char __user * user_buffer, - size_t count, loff_t * offset) + const char __user *user_buffer, + size_t count, loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; @@ -1962,6 +1964,7 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; + memset(f, 0, 32); max = min(sizeof(f) - 1, count - i); len = strn_len(&user_buffer[i], max); @@ -2397,13 +2400,14 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) /* If there was already an IPSEC SA, we keep it as is, else * we go look for it ... -*/ + */ #define DUMMY_MARK 0 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { #ifdef CONFIG_XFRM struct xfrm_state *x = pkt_dev->flows[flow].x; struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); + if (!x) { if (pkt_dev->spi) { @@ -2436,6 +2440,7 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { t = get_random_u32_inclusive(pkt_dev->queue_map_min, pkt_dev->queue_map_max); @@ -2517,6 +2522,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_MPLS_RND) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | @@ -2561,6 +2567,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) imx = ntohl(pkt_dev->saddr_max); if (imn < imx) { __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) t = get_random_u32_inclusive(imn, imx - 1); else { @@ -2581,6 +2588,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { do { @@ -2628,6 +2636,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { t = get_random_u32_inclusive(pkt_dev->min_pkt_size, pkt_dev->max_pkt_size - 1); @@ -2694,7 +2703,8 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) if (!x) return 0; /* XXX: we dont support tunnel mode for now until - * we resolve the dst issue */ + * we resolve the dst issue + */ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0)) return 0; @@ -2729,8 +2739,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev) if (pkt_dev->cflows) { /* let go of the SAs if we have them */ int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; + if (x) { xfrm_state_put(x); pkt_dev->flows[i].x = NULL; @@ -2745,6 +2757,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, if (pkt_dev->flags & F_IPSEC) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; + if (x) { struct ethhdr *eth; struct iphdr *iph; @@ -2788,6 +2801,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; @@ -2900,7 +2914,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb->dev = dev; } } else { - skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } /* the caller pre-fetches from skb->data and reserves for the mac hdr */ @@ -2981,7 +2995,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->priority = pkt_dev->skb_priority; memcpy(eth, pkt_dev->hh, 12); - *(__be16 *) & eth[12] = protocol; + *(__be16 *)ð[12] = protocol; /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - @@ -3210,11 +3224,11 @@ static void pktgen_run(struct pktgen_thread *t) set_pkt_overhead(pkt_dev); - strcpy(pkt_dev->result, "Starting"); + strscpy(pkt_dev->result, "Starting"); pkt_dev->running = 1; /* Cranke yeself! */ started++; } else - strcpy(pkt_dev->result, "Error starting"); + strscpy(pkt_dev->result, "Error starting"); } rcu_read_unlock(); if (started) @@ -3473,6 +3487,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t) static void pktgen_resched(struct pktgen_dev *pkt_dev) { ktime_t idle_start = ktime_get(); + schedule(); pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); } @@ -3788,7 +3803,8 @@ static int add_dev_to_thread(struct pktgen_thread *t, * userspace on another CPU than the kthread. The if_lock() * is used here to sync with concurrent instances of * _rem_dev_from_if_list() invoked via kthread, which is also - * updating the if_list */ + * updating the if_list + */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3826,7 +3842,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) if (!pkt_dev) return -ENOMEM; - strcpy(pkt_dev->odevname, ifname); + strscpy(pkt_dev->odevname, ifname); pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS, sizeof(struct flow_state)), node); @@ -3983,7 +3999,8 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Remove proc before if_list entry, because add_device uses * list to determine if interface already exist, avoid race - * with proc_create_data() */ + * with proc_create_data() + */ proc_remove(pkt_dev->entry); /* And update the thread if_list */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c5a7f41982a57..8a914b37ef6e8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2390,12 +2390,12 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); @@ -3580,7 +3580,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { - unsigned int old_flags; + unsigned int old_flags, changed; int err; old_flags = dev->flags; @@ -3591,12 +3591,13 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, return err; } - if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); - } else { - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); + changed = old_flags ^ dev->flags; + if (dev->rtnl_link_initializing) { + dev->rtnl_link_initializing = false; + changed = ~0U; } + + __dev_notify_flags(dev, old_flags, changed, portid, nlh); return 0; } EXPORT_SYMBOL(rtnl_configure_link); @@ -3654,7 +3655,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, dev_net_set(dev, net); dev->rtnl_link_ops = ops; - dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + dev->rtnl_link_initializing = true; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); @@ -4083,7 +4084,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, struct ifinfomsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } @@ -4092,7 +4094,6 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); @@ -4883,12 +4884,12 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); @@ -5051,12 +5052,12 @@ static int valid_fdb_get_strict(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); @@ -5323,12 +5324,12 @@ static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); @@ -6220,7 +6221,8 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, { struct if_stats_msg *ifsm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { + ifsm = nlmsg_payload(nlh, sizeof(*ifsm)); + if (!ifsm) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } @@ -6228,8 +6230,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, if (!strict_check) return 0; - ifsm = nlmsg_data(nlh); - /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. */ @@ -6457,12 +6457,12 @@ static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, { struct br_port_msg *bpm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) { + bpm = nlmsg_payload(nlh, sizeof(*bpm)); + if (!bpm) { NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); return -EINVAL; } - bpm = nlmsg_data(nlh); if (bpm->ifindex) { NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); return -EINVAL; diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 568779d5a0efa..9a39656804513 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -156,45 +156,3 @@ u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif - -#if IS_ENABLED(CONFIG_IP_DCCP) -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) -{ - u64 seq; - net_secret_init(); - seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u32)sport << 16 | (__force u32)dport, - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccp_sequence_number); - -#if IS_ENABLED(CONFIG_IPV6) -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport) -{ - const struct { - struct in6_addr saddr; - struct in6_addr daddr; - __be16 sport; - __be16 dport; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .daddr = *(struct in6_addr *)daddr, - .sport = sport, - .dport = dport - }; - u64 seq; - net_secret_init(); - seq = siphash(&combined, offsetofend(typeof(combined), dport), - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccpv6_sequence_number); -#endif -#endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6cbf77bc61fce..d73ad79fe739d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -893,11 +893,6 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static bool is_pp_netmem(netmem_ref netmem) -{ - return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; -} - int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom) { @@ -995,14 +990,7 @@ bool napi_pp_put_page(netmem_ref netmem) { netmem = netmem_compound_head(netmem); - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely(!is_pp_netmem(netmem))) + if (unlikely(!netmem_is_pp(netmem))) return false; page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); @@ -1042,7 +1030,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) { head_netmem = netmem_compound_head(shinfo->frags[i].netmem); - if (likely(is_pp_netmem(head_netmem))) + if (likely(netmem_is_pp(head_netmem))) page_pool_ref_netmem(head_netmem); else page_ref_inc(netmem_to_page(head_netmem)); @@ -3239,7 +3227,7 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, - int len, sendmsg_func sendmsg) + int len, sendmsg_func sendmsg, int flags) { unsigned int orig_len = len; struct sk_buff *head = skb; @@ -3257,7 +3245,7 @@ static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; + msg.msg_flags = MSG_DONTWAIT | flags; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, @@ -3294,7 +3282,8 @@ static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, while (slen) { struct bio_vec bvec; struct msghdr msg = { - .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | + flags, }; bvec_set_page(&bvec, skb_frag_page(frag), slen, @@ -3340,14 +3329,21 @@ static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_locked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); + /* Send skb data on a socket. Socket must be unlocked. */ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); } /** diff --git a/net/core/sock.c b/net/core/sock.c index e54449c9ab0ba..b64df2463300b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2494,17 +2494,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; + + goto free; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - if (bpf_sk_storage_clone(sk, newsk)) { - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } + if (bpf_sk_storage_clone(sk, newsk)) + goto free; /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. @@ -2534,18 +2531,17 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) net_enable_timestamp(); out: return newsk; -} -EXPORT_SYMBOL_GPL(sk_clone_lock); - -void sk_free_unlock_clone(struct sock *sk) -{ +free: /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - sk->sk_destruct = NULL; - bh_unlock_sock(sk); - sk_free(sk); + * destructor and make plain sk_free() + */ + newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; } -EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { @@ -4004,7 +4000,7 @@ static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); - if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } @@ -4015,7 +4011,7 @@ static int assign_proto_idx(struct proto *prot) static void release_proto_idx(struct proto *prot) { - if (prot->inuse_idx != PROTO_INUSE_NR - 1) + if (prot->inuse_idx != PROTO_INUSE_NR) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a08eed9b9142e..b23594c767f2e 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -264,8 +264,6 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: - case DCCPDIAG_GETSOCK: - if (!rcu_access_pointer(inet_rcv_compat)) sock_load_diag_module(AF_INET, 0); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c7769ee0d9c55..5dbb2c6f371de 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -201,7 +201,7 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - kvfree_rcu_mightsleep(orig_sock_table); + kvfree_rcu(orig_sock_table, rcu); } } } @@ -239,7 +239,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - kfree_rcu_mightsleep(cur); + kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); @@ -248,7 +248,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, ret = -ENOMEM; goto write_unlock; } - cur->num_buckets = netdev_flow_limit_table_len; + cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } diff --git a/net/core/xdp.c b/net/core/xdp.c index f86eedad586a7..ea819764ae39a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include #include +#include #include #include /* struct xdp_mem_allocator */ #include @@ -437,8 +438,8 @@ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, netmem = netmem_compound_head(netmem); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as mem->type knows this a page_pool page + /* No need to check netmem_is_pp() as mem->type knows this a + * page_pool page */ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi_direct); @@ -991,34 +992,60 @@ static int __init xdp_metadata_init(void) } late_initcall(xdp_metadata_init); -void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val) { val &= NETDEV_XDP_ACT_MASK; if (dev->xdp_features == val) return; + netdev_assert_locked_or_invisible(dev); dev->xdp_features = val; if (dev->reg_state == NETREG_REGISTERED) call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); } +EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked); + +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ + netdev_lock(dev); + xdp_set_features_flag_locked(dev, val); + netdev_unlock(dev); +} EXPORT_SYMBOL_GPL(xdp_set_features_flag); -void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg) { xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT); if (support_sg) val |= NETDEV_XDP_ACT_NDO_XMIT_SG; - xdp_set_features_flag(dev, val); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked); + +void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +{ + netdev_lock(dev); + xdp_features_set_redirect_target_locked(dev, support_sg); + netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); -void xdp_features_clear_redirect_target(struct net_device *dev) +void xdp_features_clear_redirect_target_locked(struct net_device *dev) { xdp_features_t val = dev->xdp_features; val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG); - xdp_set_features_flag(dev, val); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked); + +void xdp_features_clear_redirect_target(struct net_device *dev) +{ + netdev_lock(dev); + xdp_features_clear_redirect_target_locked(dev); + netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig deleted file mode 100644 index 0c7d2f66ba278..0000000000000 --- a/net/dccp/Kconfig +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menuconfig IP_DCCP - tristate "The DCCP Protocol" - depends on INET - help - Datagram Congestion Control Protocol (RFC 4340) - - From https://www.ietf.org/rfc/rfc4340.txt: - - The Datagram Congestion Control Protocol (DCCP) is a transport - protocol that implements bidirectional, unicast connections of - congestion-controlled, unreliable datagrams. It should be suitable - for use by applications such as streaming media, Internet telephony, - and on-line games. - - To compile this protocol support as a module, choose M here: the - module will be called dccp. - - If in doubt, say N. - -if IP_DCCP - -config INET_DCCP_DIAG - depends on INET_DIAG - def_tristate y if (IP_DCCP = y && INET_DIAG = y) - def_tristate m - -source "net/dccp/ccids/Kconfig" - -menu "DCCP Kernel Hacking" - depends on DEBUG_KERNEL=y - -config IP_DCCP_DEBUG - bool "DCCP debug messages" - help - Only use this if you're hacking DCCP. - - When compiling DCCP as a module, this debugging output can be toggled - by setting the parameter dccp_debug of the `dccp' module to 0 or 1. - - Just say N. - - -endmenu - -endif # IP_DDCP diff --git a/net/dccp/Makefile b/net/dccp/Makefile deleted file mode 100644 index 5b4ff37bc8061..0000000000000 --- a/net/dccp/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o - -dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \ - qpolicy.o -# -# CCID algorithms to be used by dccp.ko -# -# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency -dccp-y += ccids/ccid2.o ackvec.o -dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o -dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \ - ccids/lib/tfrc_equation.o \ - ccids/lib/packet_history.o \ - ccids/lib/loss_interval.o - -dccp_ipv4-y := ipv4.o - -# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module -obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o -dccp_ipv6-y := ipv6.o - -obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o - -dccp-$(CONFIG_SYSCTL) += sysctl.o - -dccp_diag-y := diag.o - -# build with local directory for trace.h -CFLAGS_proto.o := -I$(src) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c deleted file mode 100644 index 1cba001bb4c8e..0000000000000 --- a/net/dccp/ackvec.c +++ /dev/null @@ -1,403 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/ackvec.c - * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include "dccp.h" -#include -#include -#include - -static struct kmem_cache *dccp_ackvec_slab; -static struct kmem_cache *dccp_ackvec_record_slab; - -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) -{ - struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); - - if (av != NULL) { - av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; - INIT_LIST_HEAD(&av->av_records); - } - return av; -} - -static void dccp_ackvec_purge_records(struct dccp_ackvec *av) -{ - struct dccp_ackvec_record *cur, *next; - - list_for_each_entry_safe(cur, next, &av->av_records, avr_node) - kmem_cache_free(dccp_ackvec_record_slab, cur); - INIT_LIST_HEAD(&av->av_records); -} - -void dccp_ackvec_free(struct dccp_ackvec *av) -{ - if (likely(av != NULL)) { - dccp_ackvec_purge_records(av); - kmem_cache_free(dccp_ackvec_slab, av); - } -} - -/** - * dccp_ackvec_update_records - Record information about sent Ack Vectors - * @av: Ack Vector records to update - * @seqno: Sequence number of the packet carrying the Ack Vector just sent - * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector - */ -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) -{ - struct dccp_ackvec_record *avr; - - avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); - if (avr == NULL) - return -ENOBUFS; - - avr->avr_ack_seqno = seqno; - avr->avr_ack_ptr = av->av_buf_head; - avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = nonce_sum; - avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); - /* - * When the buffer overflows, we keep no more than one record. This is - * the simplest way of disambiguating sender-Acks dating from before the - * overflow from sender-Acks which refer to after the overflow; a simple - * solution is preferable here since we are handling an exception. - */ - if (av->av_overflow) - dccp_ackvec_purge_records(av); - /* - * Since GSS is incremented for each packet, the list is automatically - * arranged in descending order of @ack_seqno. - */ - list_add(&avr->avr_node, &av->av_records); - - dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", - (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno, - avr->avr_ack_runlen); - return 0; -} - -static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, - const u64 ackno) -{ - struct dccp_ackvec_record *avr; - /* - * Exploit that records are inserted in descending order of sequence - * number, start with the oldest record first. If @ackno is `before' - * the earliest ack_ackno, the packet is too old to be considered. - */ - list_for_each_entry_reverse(avr, av_list, avr_node) { - if (avr->avr_ack_seqno == ackno) - return avr; - if (before48(ackno, avr->avr_ack_seqno)) - break; - } - return NULL; -} - -/* - * Buffer index and length computation using modulo-buffersize arithmetic. - * Note that, as pointers move from right to left, head is `before' tail. - */ -static inline u16 __ackvec_idx_add(const u16 a, const u16 b) -{ - return (a + b) % DCCPAV_MAX_ACKVEC_LEN; -} - -static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) -{ - return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); -} - -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) -{ - if (unlikely(av->av_overflow)) - return DCCPAV_MAX_ACKVEC_LEN; - return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); -} - -/** - * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 - * @av: non-empty buffer to update - * @distance: negative or zero distance of @seqno from buf_ackno downward - * @seqno: the (old) sequence number whose record is to be updated - * @state: state in which packet carrying @seqno was received - */ -static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, - u64 seqno, enum dccp_ackvec_states state) -{ - u16 ptr = av->av_buf_head; - - BUG_ON(distance > 0); - if (unlikely(dccp_ackvec_is_empty(av))) - return; - - do { - u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); - - if (distance + runlen >= 0) { - /* - * Only update the state if packet has not been received - * yet. This is OK as per the second table in RFC 4340, - * 11.4.1; i.e. here we are using the following table: - * RECEIVED - * 0 1 3 - * S +---+---+---+ - * T 0 | 0 | 0 | 0 | - * O +---+---+---+ - * R 1 | 1 | 1 | 1 | - * E +---+---+---+ - * D 3 | 0 | 1 | 3 | - * +---+---+---+ - * The "Not Received" state was set by reserve_seats(). - */ - if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) - av->av_buf[ptr] = state; - else - dccp_pr_debug("Not changing %llu state to %u\n", - (unsigned long long)seqno, state); - break; - } - - distance += runlen + 1; - ptr = __ackvec_idx_add(ptr, 1); - - } while (ptr != av->av_buf_tail); -} - -/* Mark @num entries after buf_head as "Not yet received". */ -static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) -{ - u16 start = __ackvec_idx_add(av->av_buf_head, 1), - len = DCCPAV_MAX_ACKVEC_LEN - start; - - /* check for buffer wrap-around */ - if (num > len) { - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); - start = 0; - num -= len; - } - if (num) - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); -} - -/** - * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer - * @av: container of buffer to update (can be empty or non-empty) - * @num_packets: number of packets to register (must be >= 1) - * @seqno: sequence number of the first packet in @num_packets - * @state: state in which packet carrying @seqno was received - */ -static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, - u64 seqno, enum dccp_ackvec_states state) -{ - u32 num_cells = num_packets; - - if (num_packets > DCCPAV_BURST_THRESH) { - u32 lost_packets = num_packets - 1; - - DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); - /* - * We received 1 packet and have a loss of size "num_packets-1" - * which we squeeze into num_cells-1 rather than reserving an - * entire byte for each lost packet. - * The reason is that the vector grows in O(burst_length); when - * it grows too large there will no room left for the payload. - * This is a trade-off: if a few packets out of the burst show - * up later, their state will not be changed; it is simply too - * costly to reshuffle/reallocate/copy the buffer each time. - * Should such problems persist, we will need to switch to a - * different underlying data structure. - */ - for (num_packets = num_cells = 1; lost_packets; ++num_cells) { - u8 len = min_t(u32, lost_packets, DCCPAV_MAX_RUNLEN); - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); - av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; - - lost_packets -= len; - } - } - - if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { - DCCP_CRIT("Ack Vector buffer overflow: dropping old entries"); - av->av_overflow = true; - } - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); - if (av->av_overflow) - av->av_buf_tail = av->av_buf_head; - - av->av_buf[av->av_buf_head] = state; - av->av_buf_ackno = seqno; - - if (num_packets > 1) - dccp_ackvec_reserve_seats(av, num_packets - 1); -} - -/** - * dccp_ackvec_input - Register incoming packet in the buffer - * @av: Ack Vector to register packet to - * @skb: Packet to register - */ -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) -{ - u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; - enum dccp_ackvec_states state = DCCPAV_RECEIVED; - - if (dccp_ackvec_is_empty(av)) { - dccp_ackvec_add_new(av, 1, seqno, state); - av->av_tail_ackno = seqno; - - } else { - s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); - u8 *current_head = av->av_buf + av->av_buf_head; - - if (num_packets == 1 && - dccp_ackvec_state(current_head) == state && - dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { - - *current_head += 1; - av->av_buf_ackno = seqno; - - } else if (num_packets > 0) { - dccp_ackvec_add_new(av, num_packets, seqno, state); - } else { - dccp_ackvec_update_old(av, num_packets, seqno, state); - } - } -} - -/** - * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection - * @av: Ack Vector record to clean - * @ackno: last Ack Vector which has been acknowledged - * - * This routine is called when the peer acknowledges the receipt of Ack Vectors - * up to and including @ackno. While based on section A.3 of RFC 4340, here - * are additional precautions to prevent corrupted buffer state. In particular, - * we use tail_ackno to identify outdated records; it always marks the earliest - * packet of group (2) in 11.4.2. - */ -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) -{ - struct dccp_ackvec_record *avr, *next; - u8 runlen_now, eff_runlen; - s64 delta; - - avr = dccp_ackvec_lookup(&av->av_records, ackno); - if (avr == NULL) - return; - /* - * Deal with outdated acknowledgments: this arises when e.g. there are - * several old records and the acks from the peer come in slowly. In - * that case we may still have records that pre-date tail_ackno. - */ - delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); - if (delta < 0) - goto free_records; - /* - * Deal with overlapping Ack Vectors: don't subtract more than the - * number of packets between tail_ackno and ack_ackno. - */ - eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; - - runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); - /* - * The run length of Ack Vector cells does not decrease over time. If - * the run length is the same as at the time the Ack Vector was sent, we - * free the ack_ptr cell. That cell can however not be freed if the run - * length has increased: in this case we need to move the tail pointer - * backwards (towards higher indices), to its next-oldest neighbour. - */ - if (runlen_now > eff_runlen) { - - av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; - av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); - - /* This move may not have cleared the overflow flag. */ - if (av->av_overflow) - av->av_overflow = (av->av_buf_head == av->av_buf_tail); - } else { - av->av_buf_tail = avr->avr_ack_ptr; - /* - * We have made sure that avr points to a valid cell within the - * buffer. This cell is either older than head, or equals head - * (empty buffer): in both cases we no longer have any overflow. - */ - av->av_overflow = 0; - } - - /* - * The peer has acknowledged up to and including ack_ackno. Hence the - * first packet in group (2) of 11.4.2 is the successor of ack_ackno. - */ - av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); - -free_records: - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del(&avr->avr_node); - kmem_cache_free(dccp_ackvec_record_slab, avr); - } -} - -/* - * Routines to keep track of Ack Vectors received in an skb - */ -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) -{ - struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); - - if (new == NULL) - return -ENOBUFS; - new->vec = vec; - new->len = len; - new->nonce = nonce; - - list_add_tail(&new->node, head); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); - -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) -{ - struct dccp_ackvec_parsed *cur, *next; - - list_for_each_entry_safe(cur, next, parsed_chunks, node) - kfree(cur); - INIT_LIST_HEAD(parsed_chunks); -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); - -int __init dccp_ackvec_init(void) -{ - dccp_ackvec_slab = KMEM_CACHE(dccp_ackvec, SLAB_HWCACHE_ALIGN); - if (dccp_ackvec_slab == NULL) - goto out_err; - - dccp_ackvec_record_slab = KMEM_CACHE(dccp_ackvec_record, SLAB_HWCACHE_ALIGN); - if (dccp_ackvec_record_slab == NULL) - goto out_destroy_slab; - - return 0; - -out_destroy_slab: - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; -out_err: - DCCP_CRIT("Unable to create Ack Vector slab cache"); - return -ENOBUFS; -} - -void dccp_ackvec_exit(void) -{ - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; - kmem_cache_destroy(dccp_ackvec_record_slab); - dccp_ackvec_record_slab = NULL; -} diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h deleted file mode 100644 index d2c4220fb377b..0000000000000 --- a/net/dccp/ackvec.h +++ /dev/null @@ -1,136 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ACKVEC_H -#define _ACKVEC_H -/* - * net/dccp/ackvec.h - * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include - -/* - * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, - * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 - * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives - * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. - * The maximum value is bounded by the u16 types for indices and functions. - */ -#define DCCPAV_NUM_ACKVECS 2 -#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) - -/* Estimated minimum average Ack Vector length - used for updating MPS */ -#define DCCPAV_MIN_OPTLEN 16 - -/* Threshold for coping with large bursts of losses */ -#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) - -enum dccp_ackvec_states { - DCCPAV_RECEIVED = 0x00, - DCCPAV_ECN_MARKED = 0x40, - DCCPAV_RESERVED = 0x80, - DCCPAV_NOT_RECEIVED = 0xC0 -}; -#define DCCPAV_MAX_RUNLEN 0x3F - -static inline u8 dccp_ackvec_runlen(const u8 *cell) -{ - return *cell & DCCPAV_MAX_RUNLEN; -} - -static inline u8 dccp_ackvec_state(const u8 *cell) -{ - return *cell & ~DCCPAV_MAX_RUNLEN; -} - -/** - * struct dccp_ackvec - Ack Vector main data structure - * - * This implements a fixed-size circular buffer within an array and is largely - * based on Appendix A of RFC 4340. - * - * @av_buf: circular buffer storage area - * @av_buf_head: head index; begin of live portion in @av_buf - * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf - * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf - * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf - * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to - * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf - * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound - * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) - */ -struct dccp_ackvec { - u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; - u16 av_buf_head; - u16 av_buf_tail; - u64 av_buf_ackno:48; - u64 av_tail_ackno:48; - bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; - u8 av_overflow:1; - struct list_head av_records; -}; - -/** - * struct dccp_ackvec_record - Records information about sent Ack Vectors - * - * These list entries define the additional information which the HC-Receiver - * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. - * - * @avr_node: the list node in @av_records - * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on - * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to - * @avr_ack_ptr: pointer into @av_buf where this record starts - * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending - * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent - * - * The list as a whole is sorted in descending order by @avr_ack_seqno. - */ -struct dccp_ackvec_record { - struct list_head avr_node; - u64 avr_ack_seqno:48; - u64 avr_ack_ackno:48; - u16 avr_ack_ptr; - u8 avr_ack_runlen; - u8 avr_ack_nonce:1; -}; - -int dccp_ackvec_init(void); -void dccp_ackvec_exit(void); - -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); -void dccp_ackvec_free(struct dccp_ackvec *av); - -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); - -static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) -{ - return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; -} - -/** - * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb - * @vec: start of vector (offset into skb) - * @len: length of @vec - * @nonce: whether @vec had an ECN nonce of 0 or 1 - * @node: FIFO - arranged in descending order of ack_ackno - * - * This structure is used by CCIDs to access Ack Vectors in a received skb. - */ -struct dccp_ackvec_parsed { - u8 *vec, - len, - nonce:1; - struct list_head node; -}; - -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce); -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); -#endif /* _ACKVEC_H */ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c deleted file mode 100644 index 6beac5d348e2b..0000000000000 --- a/net/dccp/ccid.c +++ /dev/null @@ -1,219 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/ccid.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - * - * CCID infrastructure - */ - -#include - -#include "ccid.h" -#include "ccids/lib/tfrc.h" - -static struct ccid_operations *ccids[] = { - &ccid2_ops, -#ifdef CONFIG_IP_DCCP_CCID3 - &ccid3_ops, -#endif -}; - -static struct ccid_operations *ccid_by_number(const u8 id) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) - if (ccids[i]->ccid_id == id) - return ccids[i]; - return NULL; -} - -/* check that up to @array_len members in @ccid_array are supported */ -bool ccid_support_check(u8 const *ccid_array, u8 array_len) -{ - while (array_len > 0) - if (ccid_by_number(ccid_array[--array_len]) == NULL) - return false; - return true; -} - -/** - * ccid_get_builtin_ccids - Populate a list of built-in CCIDs - * @ccid_array: pointer to copy into - * @array_len: value to return length into - * - * This function allocates memory - caller must see that it is freed after use. - */ -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) -{ - *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any()); - if (*ccid_array == NULL) - return -ENOBUFS; - - for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1) - (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id; - return 0; -} - -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *optval, int __user *optlen) -{ - u8 *ccid_array, array_len; - int err = 0; - - if (ccid_get_builtin_ccids(&ccid_array, &array_len)) - return -ENOBUFS; - - if (put_user(array_len, optlen)) - err = -EFAULT; - else if (len > 0 && copy_to_user(optval, ccid_array, - len > array_len ? array_len : len)) - err = -EFAULT; - - kfree(ccid_array); - return err; -} - -static __printf(3, 4) struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...) -{ - struct kmem_cache *slab; - va_list args; - - va_start(args, fmt); - vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args); - va_end(args); - - slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); - return slab; -} - -static void ccid_kmem_cache_destroy(struct kmem_cache *slab) -{ - kmem_cache_destroy(slab); -} - -static int __init ccid_activate(struct ccid_operations *ccid_ops) -{ - int err = -ENOBUFS; - - ccid_ops->ccid_hc_rx_slab = - ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size, - ccid_ops->ccid_hc_rx_slab_name, - "ccid%u_hc_rx_sock", - ccid_ops->ccid_id); - if (ccid_ops->ccid_hc_rx_slab == NULL) - goto out; - - ccid_ops->ccid_hc_tx_slab = - ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size, - ccid_ops->ccid_hc_tx_slab_name, - "ccid%u_hc_tx_sock", - ccid_ops->ccid_id); - if (ccid_ops->ccid_hc_tx_slab == NULL) - goto out_free_rx_slab; - - pr_info("DCCP: Activated CCID %d (%s)\n", - ccid_ops->ccid_id, ccid_ops->ccid_name); - err = 0; -out: - return err; -out_free_rx_slab: - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); - ccid_ops->ccid_hc_rx_slab = NULL; - goto out; -} - -static void ccid_deactivate(struct ccid_operations *ccid_ops) -{ - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab); - ccid_ops->ccid_hc_tx_slab = NULL; - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); - ccid_ops->ccid_hc_rx_slab = NULL; - - pr_info("DCCP: Deactivated CCID %d (%s)\n", - ccid_ops->ccid_id, ccid_ops->ccid_name); -} - -struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx) -{ - struct ccid_operations *ccid_ops = ccid_by_number(id); - struct ccid *ccid = NULL; - - if (ccid_ops == NULL) - goto out; - - ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab : - ccid_ops->ccid_hc_tx_slab, gfp_any()); - if (ccid == NULL) - goto out; - ccid->ccid_ops = ccid_ops; - if (rx) { - memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size); - if (ccid->ccid_ops->ccid_hc_rx_init != NULL && - ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0) - goto out_free_ccid; - } else { - memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size); - if (ccid->ccid_ops->ccid_hc_tx_init != NULL && - ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0) - goto out_free_ccid; - } -out: - return ccid; -out_free_ccid: - kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab : - ccid_ops->ccid_hc_tx_slab, ccid); - ccid = NULL; - goto out; -} - -void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk) -{ - if (ccid != NULL) { - if (ccid->ccid_ops->ccid_hc_rx_exit != NULL) - ccid->ccid_ops->ccid_hc_rx_exit(sk); - kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid); - } -} - -void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk) -{ - if (ccid != NULL) { - if (ccid->ccid_ops->ccid_hc_tx_exit != NULL) - ccid->ccid_ops->ccid_hc_tx_exit(sk); - kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid); - } -} - -int __init ccid_initialize_builtins(void) -{ - int i, err = tfrc_lib_init(); - - if (err) - return err; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) { - err = ccid_activate(ccids[i]); - if (err) - goto unwind_registrations; - } - return 0; - -unwind_registrations: - while(--i >= 0) - ccid_deactivate(ccids[i]); - tfrc_lib_exit(); - return err; -} - -void ccid_cleanup_builtins(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) - ccid_deactivate(ccids[i]); - tfrc_lib_exit(); -} diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h deleted file mode 100644 index 105f3734dadbf..0000000000000 --- a/net/dccp/ccid.h +++ /dev/null @@ -1,262 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _CCID_H -#define _CCID_H -/* - * net/dccp/ccid.h - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - * - * CCID infrastructure - */ - -#include -#include -#include -#include -#include - -/* maximum value for a CCID (RFC 4340, 19.5) */ -#define CCID_MAX 255 -#define CCID_SLAB_NAME_LENGTH 32 - -struct tcp_info; - -/** - * struct ccid_operations - Interface to Congestion-Control Infrastructure - * - * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) - * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) - * @ccid_name: alphabetical identifier string for @ccid_id - * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection - * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket - * - * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup) - * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction) - * @ccid_hc_rx_packet_recv: implements the HC-receiver side - * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options - * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options - * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender - * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender - * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender - * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender - * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender - */ -struct ccid_operations { - unsigned char ccid_id; - __u32 ccid_ccmps; - const char *ccid_name; - struct kmem_cache *ccid_hc_rx_slab, - *ccid_hc_tx_slab; - char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH]; - char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH]; - __u32 ccid_hc_rx_obj_size, - ccid_hc_tx_obj_size; - /* Interface Routines */ - int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); - int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); - void (*ccid_hc_rx_exit)(struct sock *sk); - void (*ccid_hc_tx_exit)(struct sock *sk); - void (*ccid_hc_rx_packet_recv)(struct sock *sk, - struct sk_buff *skb); - int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); - int (*ccid_hc_rx_insert_options)(struct sock *sk, - struct sk_buff *skb); - void (*ccid_hc_tx_packet_recv)(struct sock *sk, - struct sk_buff *skb); - int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); - int (*ccid_hc_tx_send_packet)(struct sock *sk, - struct sk_buff *skb); - void (*ccid_hc_tx_packet_sent)(struct sock *sk, - unsigned int len); - void (*ccid_hc_rx_get_info)(struct sock *sk, - struct tcp_info *info); - void (*ccid_hc_tx_get_info)(struct sock *sk, - struct tcp_info *info); - int (*ccid_hc_rx_getsockopt)(struct sock *sk, - const int optname, int len, - u32 __user *optval, - int __user *optlen); - int (*ccid_hc_tx_getsockopt)(struct sock *sk, - const int optname, int len, - u32 __user *optval, - int __user *optlen); -}; - -extern struct ccid_operations ccid2_ops; -#ifdef CONFIG_IP_DCCP_CCID3 -extern struct ccid_operations ccid3_ops; -#endif - -int ccid_initialize_builtins(void); -void ccid_cleanup_builtins(void); - -struct ccid { - struct ccid_operations *ccid_ops; - char ccid_priv[]; -}; - -static inline void *ccid_priv(const struct ccid *ccid) -{ - return (void *)ccid->ccid_priv; -} - -bool ccid_support_check(u8 const *ccid_array, u8 array_len); -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *, int __user *); - -struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx); - -static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_rx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_tx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); -void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); - -/* - * Congestion control of queued data packets via CCID decision. - * - * The TX CCID performs its congestion-control by indicating whether and when a - * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). - * The following modes are supported via the symbolic constants below: - * - timer-based pacing (CCID returns a delay value in milliseconds); - * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). - */ - -enum ccid_dequeueing_decision { - CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ - CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ - CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ - CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ - CCID_PACKET_ERR = 0xF0000, /* error condition */ -}; - -static inline int ccid_packet_dequeue_eval(const int return_code) -{ - if (return_code < 0) - return CCID_PACKET_ERR; - if (return_code == 0) - return CCID_PACKET_SEND_AT_ONCE; - if (return_code <= CCID_PACKET_DELAY_MAX) - return CCID_PACKET_DELAY; - return return_code; -} - -static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return CCID_PACKET_SEND_AT_ONCE; -} - -static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, - unsigned int len) -{ - if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); -} - -static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL) - ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb); -} - -static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); -} - -/** - * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver - * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) - * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) - * @val: value of @opt - * @len: length of @val in bytes - */ -static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) -{ - if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options) - return 0; - return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); -} - -/** - * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender - * Arguments are analogous to ccid_hc_tx_parse_options() - */ -static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) -{ - if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options) - return 0; - return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); -} - -static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL) - return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb); - return 0; -} - -static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, - struct tcp_info *info) -{ - if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL) - ccid->ccid_ops->ccid_hc_rx_get_info(sk, info); -} - -static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, - struct tcp_info *info) -{ - if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL) - ccid->ccid_ops->ccid_hc_tx_get_info(sk, info); -} - -static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, - const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - int rc = -ENOPROTOOPT; - if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) - rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, - optval, optlen); - return rc; -} - -static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, - const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - int rc = -ENOPROTOOPT; - if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, - optval, optlen); - return rc; -} -#endif /* _CCID_H */ diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig deleted file mode 100644 index e3d388c33d256..0000000000000 --- a/net/dccp/ccids/Kconfig +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menu "DCCP CCIDs Configuration" - -config IP_DCCP_CCID2_DEBUG - bool "CCID-2 debugging messages" - help - Enable CCID-2 specific debugging messages. - - The debugging output can additionally be toggled by setting the - ccid2_debug parameter to 0 or 1. - - If in doubt, say N. - -config IP_DCCP_CCID3 - bool "CCID-3 (TCP-Friendly)" - default IP_DCCP = y || IP_DCCP = m - help - CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based - rate-controlled congestion control mechanism. TFRC is designed to - be reasonably fair when competing for bandwidth with TCP-like flows, - where a flow is "reasonably fair" if its sending rate is generally - within a factor of two of the sending rate of a TCP flow under the - same conditions. However, TFRC has a much lower variation of - throughput over time compared with TCP, which makes CCID-3 more - suitable than CCID-2 for applications such streaming media where a - relatively smooth sending rate is of importance. - - CCID-3 is further described in RFC 4342, - https://www.ietf.org/rfc/rfc4342.txt - - The TFRC congestion control algorithms were initially described in - RFC 5348. - - This text was extracted from RFC 4340 (sec. 10.2), - https://www.ietf.org/rfc/rfc4340.txt - - If in doubt, say N. - -config IP_DCCP_CCID3_DEBUG - bool "CCID-3 debugging messages" - depends on IP_DCCP_CCID3 - help - Enable CCID-3 specific debugging messages. - - The debugging output can additionally be toggled by setting the - ccid3_debug parameter to 0 or 1. - - If in doubt, say N. - -config IP_DCCP_TFRC_LIB - def_bool y if IP_DCCP_CCID3 - -config IP_DCCP_TFRC_DEBUG - def_bool y if IP_DCCP_CCID3_DEBUG -endmenu diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c deleted file mode 100644 index d6b30700af679..0000000000000 --- a/net/dccp/ccids/ccid2.c +++ /dev/null @@ -1,794 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2005, 2006 Andrea Bittau - * - * Changes to meet Linux coding standards, and DCCP infrastructure fixes. - * - * Copyright (c) 2006 Arnaldo Carvalho de Melo - */ - -/* - * This implementation should follow RFC 4341 - */ -#include -#include "../feat.h" -#include "ccid2.h" - - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG -static bool ccid2_debug; -#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) -#else -#define ccid2_pr_debug(format, a...) -#endif - -static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) -{ - struct ccid2_seq *seqp; - int i; - - /* check if we have space to preserve the pointer to the buffer */ - if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) / - sizeof(struct ccid2_seq *))) - return -ENOMEM; - - /* allocate buffer and initialize linked list */ - seqp = kmalloc_array(CCID2_SEQBUF_LEN, sizeof(struct ccid2_seq), - gfp_any()); - if (seqp == NULL) - return -ENOMEM; - - for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) { - seqp[i].ccid2s_next = &seqp[i + 1]; - seqp[i + 1].ccid2s_prev = &seqp[i]; - } - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp; - seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - - /* This is the first allocation. Initiate the head and tail. */ - if (hc->tx_seqbufc == 0) - hc->tx_seqh = hc->tx_seqt = seqp; - else { - /* link the existing list with the one we just created */ - hc->tx_seqh->ccid2s_next = seqp; - seqp->ccid2s_prev = hc->tx_seqh; - - hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt; - } - - /* store the original pointer to the buffer so we can free it */ - hc->tx_seqbuf[hc->tx_seqbufc] = seqp; - hc->tx_seqbufc++; - - return 0; -} - -static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) -{ - if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) - return CCID_PACKET_WILL_DEQUEUE_LATER; - return CCID_PACKET_SEND_AT_ONCE; -} - -static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) -{ - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2); - - /* - * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from - * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always - * acceptable since this causes starvation/deadlock whenever cwnd < 2. - * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled). - */ - if (val == 0 || val > max_ratio) { - DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); - val = max_ratio; - } - dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO, - min_t(u32, val, DCCPF_ACK_RATIO_MAX)); -} - -static void ccid2_check_l_ack_ratio(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - /* - * After a loss, idle period, application limited period, or RTO we - * need to check that the ack ratio is still less than the congestion - * window. Otherwise, we will send an entire congestion window of - * packets and got no response because we haven't sent ack ratio - * packets yet. - * If the ack ratio does need to be reduced, we reduce it to half of - * the congestion window (or 1 if that's zero) instead of to the - * congestion window. This prevents problems if one ack is lost. - */ - if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd) - ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U); -} - -static void ccid2_change_l_seq_window(struct sock *sk, u64 val) -{ - dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW, - clamp_val(val, DCCPF_SEQ_WMIN, - DCCPF_SEQ_WMAX)); -} - -static void dccp_tasklet_schedule(struct sock *sk) -{ - struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet; - - if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { - sock_hold(sk); - __tasklet_schedule(t); - } -} - -static void ccid2_hc_tx_rto_expire(struct timer_list *t) -{ - struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer); - struct sock *sk = hc->sk; - const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5); - goto out; - } - - ccid2_pr_debug("RTO_EXPIRE\n"); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - /* back-off timer */ - hc->tx_rto <<= 1; - if (hc->tx_rto > DCCP_RTO_MAX) - hc->tx_rto = DCCP_RTO_MAX; - - /* adjust pipe, cwnd etc */ - hc->tx_ssthresh = hc->tx_cwnd / 2; - if (hc->tx_ssthresh < 2) - hc->tx_ssthresh = 2; - hc->tx_cwnd = 1; - hc->tx_pipe = 0; - - /* clear state about stuff we sent */ - hc->tx_seqt = hc->tx_seqh; - hc->tx_packets_acked = 0; - - /* clear ack ratio state. */ - hc->tx_rpseq = 0; - hc->tx_rpdupack = -1; - ccid2_change_l_ack_ratio(sk, 1); - - /* if we were blocked before, we may now send cwnd=1 packet */ - if (sender_was_blocked) - dccp_tasklet_schedule(sk); - /* restart backed-off timer */ - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/* - * Congestion window validation (RFC 2861). - */ -static bool ccid2_do_cwv = true; -module_param(ccid2_do_cwv, bool, 0644); -MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation"); - -/** - * ccid2_update_used_window - Track how much of cwnd is actually used - * @hc: socket to update window - * @new_wnd: new window values to add into the filter - * - * This is done in addition to CWV. The sender needs to have an idea of how many - * packets may be in flight, to set the local Sequence Window value accordingly - * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the - * maximum-used window. We use an EWMA low-pass filter to filter out noise. - */ -static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd) -{ - hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4; -} - -/* This borrows the code of tcp_cwnd_application_limited() */ -static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - /* don't reduce cwnd below the initial window (IW) */ - u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache), - win_used = max(hc->tx_cwnd_used, init_win); - - if (win_used < hc->tx_cwnd) { - hc->tx_ssthresh = max(hc->tx_ssthresh, - (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2)); - hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1; - } - hc->tx_cwnd_used = 0; - hc->tx_cwnd_stamp = now; - - ccid2_check_l_ack_ratio(sk); -} - -/* This borrows the code of tcp_cwnd_restart() */ -static void ccid2_cwnd_restart(struct sock *sk, const u32 now) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - u32 cwnd = hc->tx_cwnd, restart_cwnd, - iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache); - s32 delta = now - hc->tx_lsndtime; - - hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2)); - - /* don't reduce cwnd below the initial window (IW) */ - restart_cwnd = min(cwnd, iwnd); - - while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd) - cwnd >>= 1; - hc->tx_cwnd = max(cwnd, restart_cwnd); - hc->tx_cwnd_stamp = now; - hc->tx_cwnd_used = 0; - - ccid2_check_l_ack_ratio(sk); -} - -static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const u32 now = ccid2_jiffies32; - struct ccid2_seq *next; - - /* slow-start after idle periods (RFC 2581, RFC 2861) */ - if (ccid2_do_cwv && !hc->tx_pipe && - (s32)(now - hc->tx_lsndtime) >= hc->tx_rto) - ccid2_cwnd_restart(sk, now); - - hc->tx_lsndtime = now; - hc->tx_pipe += 1; - - /* see whether cwnd was fully used (RFC 2861), update expected window */ - if (ccid2_cwnd_network_limited(hc)) { - ccid2_update_used_window(hc, hc->tx_cwnd); - hc->tx_cwnd_used = 0; - hc->tx_cwnd_stamp = now; - } else { - if (hc->tx_pipe > hc->tx_cwnd_used) - hc->tx_cwnd_used = hc->tx_pipe; - - ccid2_update_used_window(hc, hc->tx_cwnd_used); - - if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto) - ccid2_cwnd_application_limited(sk, now); - } - - hc->tx_seqh->ccid2s_seq = dp->dccps_gss; - hc->tx_seqh->ccid2s_acked = 0; - hc->tx_seqh->ccid2s_sent = now; - - next = hc->tx_seqh->ccid2s_next; - /* check if we need to alloc more space */ - if (next == hc->tx_seqt) { - if (ccid2_hc_tx_alloc_seq(hc)) { - DCCP_CRIT("packet history - out of memory!"); - /* FIXME: find a more graceful way to bail out */ - return; - } - next = hc->tx_seqh->ccid2s_next; - BUG_ON(next == hc->tx_seqt); - } - hc->tx_seqh = next; - - ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe); - - /* - * FIXME: The code below is broken and the variables have been removed - * from the socket struct. The `ackloss' variable was always set to 0, - * and with arsent there are several problems: - * (i) it doesn't just count the number of Acks, but all sent packets; - * (ii) it is expressed in # of packets, not # of windows, so the - * comparison below uses the wrong formula: Appendix A of RFC 4341 - * comes up with the number K = cwnd / (R^2 - R) of consecutive windows - * of data with no lost or marked Ack packets. If arsent were the # of - * consecutive Acks received without loss, then Ack Ratio needs to be - * decreased by 1 when - * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2) - * where cwnd / R is the number of Acks received per window of data - * (cf. RFC 4341, App. A). The problems are that - * - arsent counts other packets as well; - * - the comparison uses a formula different from RFC 4341; - * - computing a cubic/quadratic equation each time is too complicated. - * Hence a different algorithm is needed. - */ -#if 0 - /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hc->tx_arsent++; - /* We had an ack loss in this window... */ - if (hc->tx_ackloss) { - if (hc->tx_arsent >= hc->tx_cwnd) { - hc->tx_arsent = 0; - hc->tx_ackloss = 0; - } - } else { - /* No acks lost up to now... */ - /* decrease ack ratio if enough packets were sent */ - if (dp->dccps_l_ack_ratio > 1) { - /* XXX don't calculate denominator each time */ - int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - - dp->dccps_l_ack_ratio; - - denom = hc->tx_cwnd * hc->tx_cwnd / denom; - - if (hc->tx_arsent >= denom) { - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hc->tx_arsent = 0; - } - } else { - /* we can't increase ack ratio further [1] */ - hc->tx_arsent = 0; /* or maybe set it to cwnd*/ - } - } -#endif - - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG - do { - struct ccid2_seq *seqp = hc->tx_seqt; - - while (seqp != hc->tx_seqh) { - ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", - (unsigned long long)seqp->ccid2s_seq, - seqp->ccid2s_acked, seqp->ccid2s_sent); - seqp = seqp->ccid2s_next; - } - } while (0); - ccid2_pr_debug("=========\n"); -#endif -} - -/** - * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm - * @sk: socket to perform estimator on - * @mrtt: measured RTT - * - * This code is almost identical with TCP's tcp_rtt_estimator(), since - * - it has a higher sampling frequency (recommended by RFC 1323), - * - the RTO does not collapse into RTT due to RTTVAR going towards zero, - * - it is simple (cf. more complex proposals such as Eifel timer or research - * which suggests that the gain should be set according to window size), - * - in tests it was found to work well with CCID2 [gerrit]. - */ -static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - long m = mrtt ? : 1; - - if (hc->tx_srtt == 0) { - /* First measurement m */ - hc->tx_srtt = m << 3; - hc->tx_mdev = m << 1; - - hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); - hc->tx_rttvar = hc->tx_mdev_max; - - hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - } else { - /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ - m -= (hc->tx_srtt >> 3); - hc->tx_srtt += m; - - /* Similarly, update scaled mdev with regard to |m| */ - if (m < 0) { - m = -m; - m -= (hc->tx_mdev >> 2); - /* - * This neutralises RTO increase when RTT < SRTT - mdev - * (see P. Sarolahti, A. Kuznetsov,"Congestion Control - * in Linux TCP", USENIX 2002, pp. 49-62). - */ - if (m > 0) - m >>= 3; - } else { - m -= (hc->tx_mdev >> 2); - } - hc->tx_mdev += m; - - if (hc->tx_mdev > hc->tx_mdev_max) { - hc->tx_mdev_max = hc->tx_mdev; - if (hc->tx_mdev_max > hc->tx_rttvar) - hc->tx_rttvar = hc->tx_mdev_max; - } - - /* - * Decay RTTVAR at most once per flight, exploiting that - * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) - * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) - * GAR is a useful bound for FlightSize = pipe. - * AWL is probably too low here, as it over-estimates pipe. - */ - if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { - if (hc->tx_mdev_max < hc->tx_rttvar) - hc->tx_rttvar -= (hc->tx_rttvar - - hc->tx_mdev_max) >> 2; - hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - hc->tx_mdev_max = tcp_rto_min(sk); - } - } - - /* - * Set RTO from SRTT and RTTVAR - * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. - * This agrees with RFC 4341, 5: - * "Because DCCP does not retransmit data, DCCP does not require - * TCP's recommended minimum timeout of one second". - */ - hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; - - if (hc->tx_rto > DCCP_RTO_MAX) - hc->tx_rto = DCCP_RTO_MAX; -} - -static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, - unsigned int *maxincr) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio; - - if (hc->tx_cwnd < dp->dccps_l_seq_win && - r_seq_used < dp->dccps_r_seq_win) { - if (hc->tx_cwnd < hc->tx_ssthresh) { - if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) { - hc->tx_cwnd += 1; - *maxincr -= 1; - hc->tx_packets_acked = 0; - } - } else if (++hc->tx_packets_acked >= hc->tx_cwnd) { - hc->tx_cwnd += 1; - hc->tx_packets_acked = 0; - } - } - - /* - * Adjust the local sequence window and the ack ratio to allow about - * 5 times the number of packets in the network (RFC 4340 7.5.2) - */ - if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win) - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2); - else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2) - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U); - - if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win) - ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2); - else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2) - ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2); - - /* - * FIXME: RTT is sampled several times per acknowledgment (for each - * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). - * This causes the RTT to be over-estimated, since the older entries - * in the Ack Vector have earlier sending times. - * The cleanest solution is to not use the ccid2s_sent field at all - * and instead use DCCP timestamps: requires changes in other places. - */ - ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent); -} - -static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { - ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); - return; - } - - hc->tx_last_cong = ccid2_jiffies32; - - hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U; - hc->tx_ssthresh = max(hc->tx_cwnd, 2U); - - ccid2_check_l_ack_ratio(sk); -} - -static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - switch (option) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen, - option - DCCPO_ACK_VECTOR_0); - } - return 0; -} - -static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); - struct dccp_ackvec_parsed *avp; - u64 ackno, seqno; - struct ccid2_seq *seqp; - int done = 0; - unsigned int maxincr = 0; - - /* check reverse path congestion */ - seqno = DCCP_SKB_CB(skb)->dccpd_seq; - - /* XXX this whole "algorithm" is broken. Need to fix it to keep track - * of the seqnos of the dupacks so that rpseq and rpdupack are correct - * -sorbo. - */ - /* need to bootstrap */ - if (hc->tx_rpdupack == -1) { - hc->tx_rpdupack = 0; - hc->tx_rpseq = seqno; - } else { - /* check if packet is consecutive */ - if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1) - hc->tx_rpseq = seqno; - /* it's a later packet */ - else if (after48(seqno, hc->tx_rpseq)) { - hc->tx_rpdupack++; - - /* check if we got enough dupacks */ - if (hc->tx_rpdupack >= NUMDUPACK) { - hc->tx_rpdupack = -1; /* XXX lame */ - hc->tx_rpseq = 0; -#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__ - /* - * FIXME: Ack Congestion Control is broken; in - * the current state instabilities occurred with - * Ack Ratios greater than 1; causing hang-ups - * and long RTO timeouts. This needs to be fixed - * before opening up dynamic changes. -- gerrit - */ - ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); -#endif - } - } - } - - /* check forward path congestion */ - if (dccp_packet_without_ack(skb)) - return; - - /* still didn't send out new data packets */ - if (hc->tx_seqh == hc->tx_seqt) - goto done; - - ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hc->tx_high_ack)) - hc->tx_high_ack = ackno; - - seqp = hc->tx_seqt; - while (before48(seqp->ccid2s_seq, ackno)) { - seqp = seqp->ccid2s_next; - if (seqp == hc->tx_seqh) { - seqp = hc->tx_seqh->ccid2s_prev; - break; - } - } - - /* - * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2 - * packets per acknowledgement. Rounding up avoids that cwnd is not - * advanced when Ack Ratio is 1 and gives a slight edge otherwise. - */ - if (hc->tx_cwnd < hc->tx_ssthresh) - maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); - - /* go through all ack vectors */ - list_for_each_entry(avp, &hc->tx_av_chunks, node) { - /* go through this ack vector */ - for (; avp->len--; avp->vec++) { - u64 ackno_end_rl = SUB48(ackno, - dccp_ackvec_runlen(avp->vec)); - - ccid2_pr_debug("ackvec %llu |%u,%u|\n", - (unsigned long long)ackno, - dccp_ackvec_state(avp->vec) >> 6, - dccp_ackvec_runlen(avp->vec)); - /* if the seqno we are analyzing is larger than the - * current ackno, then move towards the tail of our - * seqnos. - */ - while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hc->tx_seqt) { - done = 1; - break; - } - seqp = seqp->ccid2s_prev; - } - if (done) - break; - - /* check all seqnos in the range of the vector - * run length - */ - while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(avp->vec); - - /* new packet received or marked */ - if (state != DCCPAV_NOT_RECEIVED && - !seqp->ccid2s_acked) { - if (state == DCCPAV_ECN_MARKED) - ccid2_congestion_event(sk, - seqp); - else - ccid2_new_ack(sk, seqp, - &maxincr); - - seqp->ccid2s_acked = 1; - ccid2_pr_debug("Got ack for %llu\n", - (unsigned long long)seqp->ccid2s_seq); - hc->tx_pipe--; - } - if (seqp == hc->tx_seqt) { - done = 1; - break; - } - seqp = seqp->ccid2s_prev; - } - if (done) - break; - - ackno = SUB48(ackno_end_rl, 1); - } - if (done) - break; - } - - /* The state about what is acked should be correct now - * Check for NUMDUPACK - */ - seqp = hc->tx_seqt; - while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) { - seqp = seqp->ccid2s_next; - if (seqp == hc->tx_seqh) { - seqp = hc->tx_seqh->ccid2s_prev; - break; - } - } - done = 0; - while (1) { - if (seqp->ccid2s_acked) { - done++; - if (done == NUMDUPACK) - break; - } - if (seqp == hc->tx_seqt) - break; - seqp = seqp->ccid2s_prev; - } - - /* If there are at least 3 acknowledgements, anything unacknowledged - * below the last sequence number is considered lost - */ - if (done == NUMDUPACK) { - struct ccid2_seq *last_acked = seqp; - - /* check for lost packets */ - while (1) { - if (!seqp->ccid2s_acked) { - ccid2_pr_debug("Packet lost: %llu\n", - (unsigned long long)seqp->ccid2s_seq); - /* XXX need to traverse from tail -> head in - * order to detect multiple congestion events in - * one ack vector. - */ - ccid2_congestion_event(sk, seqp); - hc->tx_pipe--; - } - if (seqp == hc->tx_seqt) - break; - seqp = seqp->ccid2s_prev; - } - - hc->tx_seqt = last_acked; - } - - /* trim acked packets in tail */ - while (hc->tx_seqt != hc->tx_seqh) { - if (!hc->tx_seqt->ccid2s_acked) - break; - - hc->tx_seqt = hc->tx_seqt->ccid2s_next; - } - - /* restart RTO timer if not all outstanding data has been acked */ - if (hc->tx_pipe == 0) - sk_stop_timer(sk, &hc->tx_rtotimer); - else - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -done: - /* check if incoming Acks allow pending packets to be sent */ - if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) - dccp_tasklet_schedule(sk); - dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); -} - -static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid_priv(ccid); - struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio; - - /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hc->tx_ssthresh = ~0U; - - /* Use larger initial windows (RFC 4341, section 5). */ - hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); - hc->tx_expected_wnd = hc->tx_cwnd; - - /* Make sure that Ack Ratio is enabled and within bounds. */ - max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); - if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) - dp->dccps_l_ack_ratio = max_ratio; - - /* XXX init ~ to window size... */ - if (ccid2_hc_tx_alloc_seq(hc)) - return -ENOMEM; - - hc->tx_rto = DCCP_TIMEOUT_INIT; - hc->tx_rpdupack = -1; - hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32; - hc->tx_cwnd_used = 0; - hc->sk = sk; - timer_setup(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 0); - INIT_LIST_HEAD(&hc->tx_av_chunks); - return 0; -} - -static void ccid2_hc_tx_exit(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - int i; - - sk_stop_timer(sk, &hc->tx_rtotimer); - - for (i = 0; i < hc->tx_seqbufc; i++) - kfree(hc->tx_seqbuf[i]); - hc->tx_seqbufc = 0; - dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); -} - -static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk); - - if (!dccp_data_packet(skb)) - return; - - if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) { - dccp_send_ack(sk); - hc->rx_num_data_pkts = 0; - } -} - -struct ccid_operations ccid2_ops = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, -}; - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG -module_param(ccid2_debug, bool, 0644); -MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages"); -#endif diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h deleted file mode 100644 index 330c7b4ec0013..0000000000000 --- a/net/dccp/ccids/ccid2.h +++ /dev/null @@ -1,121 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2005 Andrea Bittau - */ -#ifndef _DCCP_CCID2_H_ -#define _DCCP_CCID2_H_ - -#include -#include -#include "../ccid.h" -#include "../dccp.h" - -/* - * CCID-2 timestamping faces the same issues as TCP timestamping. - * Hence we reuse/share as much of the code as possible. - */ -#define ccid2_jiffies32 ((u32)jiffies) - -/* NUMDUPACK parameter from RFC 4341, p. 6 */ -#define NUMDUPACK 3 - -struct ccid2_seq { - u64 ccid2s_seq; - u32 ccid2s_sent; - int ccid2s_acked; - struct ccid2_seq *ccid2s_prev; - struct ccid2_seq *ccid2s_next; -}; - -#define CCID2_SEQBUF_LEN 1024 -#define CCID2_SEQBUF_MAX 128 - -/* - * Multiple of congestion window to keep the sequence window at - * (RFC 4340 7.5.2) - */ -#define CCID2_WIN_CHANGE_FACTOR 5 - -/** - * struct ccid2_hc_tx_sock - CCID2 TX half connection - * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 - * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @tx_srtt: smoothed RTT estimate, scaled by 2^3 - * @tx_mdev: smoothed RTT variation, scaled by 2^2 - * @tx_mdev_max: maximum of @mdev during one flight - * @tx_rttvar: moving average/maximum of @mdev_max - * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) - * @tx_rtt_seq: to decay RTTVAR at most once per flight - * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861 - * @tx_expected_wnd: moving average of @tx_cwnd_used - * @tx_cwnd_stamp: to track idle periods in CWV - * @tx_lsndtime: last time (in jiffies) a data packet was sent - * @tx_rpseq: last consecutive seqno - * @tx_rpdupack: dupacks since rpseq - * @tx_av_chunks: list of Ack Vectors received on current skb - */ -struct ccid2_hc_tx_sock { - u32 tx_cwnd; - u32 tx_ssthresh; - u32 tx_pipe; - u32 tx_packets_acked; - struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX]; - int tx_seqbufc; - struct ccid2_seq *tx_seqh; - struct ccid2_seq *tx_seqt; - - /* RTT measurement: variables/principles are the same as in TCP */ - u32 tx_srtt, - tx_mdev, - tx_mdev_max, - tx_rttvar, - tx_rto; - u64 tx_rtt_seq:48; - struct timer_list tx_rtotimer; - struct sock *sk; - - /* Congestion Window validation (optional, RFC 2861) */ - u32 tx_cwnd_used, - tx_expected_wnd, - tx_cwnd_stamp, - tx_lsndtime; - - u64 tx_rpseq; - int tx_rpdupack; - u32 tx_last_cong; - u64 tx_high_ack; - struct list_head tx_av_chunks; -}; - -static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) -{ - return hc->tx_pipe >= hc->tx_cwnd; -} - -/* - * Convert RFC 3390 larger initial window into an equivalent number of packets. - * This is based on the numbers specified in RFC 5681, 3.1. - */ -static inline u32 rfc3390_bytes_to_packets(const u32 smss) -{ - return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3); -} - -/** - * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection - * @rx_num_data_pkts: number of data packets received since last feedback - */ -struct ccid2_hc_rx_sock { - u32 rx_num_data_pkts; -}; - -static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) -{ - return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); -} - -static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk) -{ - return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); -} -#endif /* _DCCP_CCID2_H_ */ diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c deleted file mode 100644 index f349d16dd8f65..0000000000000 --- a/net/dccp/ccids/ccid3.c +++ /dev/null @@ -1,866 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include "../dccp.h" -#include "ccid3.h" - -#include - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static bool ccid3_debug; -#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a) -#else -#define ccid3_pr_debug(format, a...) -#endif - -/* - * Transmitter Half-Connection Routines - */ -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) -{ - static const char *const ccid3_state_names[] = { - [TFRC_SSTATE_NO_SENT] = "NO_SENT", - [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", - [TFRC_SSTATE_FBACK] = "FBACK", - }; - - return ccid3_state_names[state]; -} -#endif - -static void ccid3_hc_tx_set_state(struct sock *sk, - enum ccid3_hc_tx_states state) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - enum ccid3_hc_tx_states oldstate = hc->tx_state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_tx_state_name(oldstate), - ccid3_tx_state_name(state)); - WARN_ON(state == oldstate); - hc->tx_state = state; -} - -/* - * Compute the initial sending rate X_init in the manner of RFC 3390: - * - * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT - * - * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis - * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. - * For consistency with other parts of the code, X_init is scaled by 2^6. - */ -static inline u64 rfc3390_initial_rate(struct sock *sk) -{ - const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s); - - return scaled_div(w_init << 6, hc->tx_rtt); -} - -/** - * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst - * @hc: socket to have the send interval updated - * - * This respects the granularity of X_inst (64 * bytes/second). - */ -static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) -{ - hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); - - DCCP_BUG_ON(hc->tx_t_ipi == 0); - ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi, - hc->tx_s, (unsigned int)(hc->tx_x >> 6)); -} - -static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) -{ - u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count); - - return delta / hc->tx_rtt; -} - -/** - * ccid3_hc_tx_update_x - Update allowed sending rate X - * @sk: socket to be updated - * @stamp: most recent time if available - can be left NULL. - * - * This function tracks draft rfc3448bis, check there for latest details. - * - * Note: X and X_recv are both stored in units of 64 * bytes/second, to support - * fine-grained resolution of sending rates. This requires scaling by 2^6 - * throughout the code. Only X_calc is unscaled (in bytes/second). - * - */ -static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - __u64 min_rate = 2 * hc->tx_x_recv; - const __u64 old_x = hc->tx_x; - ktime_t now = stamp ? *stamp : ktime_get_real(); - - /* - * Handle IDLE periods: do not reduce below RFC3390 initial sending rate - * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis: - * a sender is idle if it has not sent anything over a 2-RTT-period. - * For consistency with X and X_recv, min_rate is also scaled by 2^6. - */ - if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) { - min_rate = rfc3390_initial_rate(sk); - min_rate = max(min_rate, 2 * hc->tx_x_recv); - } - - if (hc->tx_p > 0) { - - hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate); - hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); - - } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) { - - hc->tx_x = min(2 * hc->tx_x, min_rate); - hc->tx_x = max(hc->tx_x, - scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt)); - hc->tx_t_ld = now; - } - - if (hc->tx_x != old_x) { - ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " - "X_recv=%u\n", (unsigned int)(old_x >> 6), - (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc, - (unsigned int)(hc->tx_x_recv >> 6)); - - ccid3_update_send_interval(hc); - } -} - -/** - * ccid3_hc_tx_update_s - Track the mean packet size `s' - * @hc: socket to be updated - * @len: DCCP packet payload size in bytes - * - * cf. RFC 4342, 5.3 and RFC 3448, 4.1 - */ -static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len) -{ - const u16 old_s = hc->tx_s; - - hc->tx_s = tfrc_ewma(hc->tx_s, len, 9); - - if (hc->tx_s != old_s) - ccid3_update_send_interval(hc); -} - -/* - * Update Window Counter using the algorithm from [RFC 4342, 8.1]. - * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt(). - */ -static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc, - ktime_t now) -{ - u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count), - quarter_rtts = (4 * delta) / hc->tx_rtt; - - if (quarter_rtts > 0) { - hc->tx_t_last_win_count = now; - hc->tx_last_win_count += min(quarter_rtts, 5U); - hc->tx_last_win_count &= 0xF; /* mod 16 */ - } -} - -static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t) -{ - struct ccid3_hc_tx_sock *hc = from_timer(hc, t, tx_no_feedback_timer); - struct sock *sk = hc->sk; - unsigned long t_nfb = USEC_PER_SEC / 5; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - /* XXX: set some sensible MIB */ - goto restart_timer; - } - - ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, - ccid3_tx_state_name(hc->tx_state)); - - /* Ignore and do not restart after leaving the established state */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) - goto out; - - /* Reset feedback state to "no feedback received" */ - if (hc->tx_state == TFRC_SSTATE_FBACK) - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - /* - * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 - * RTO is 0 if and only if no feedback has been received yet. - */ - if (hc->tx_t_rto == 0 || hc->tx_p == 0) { - - /* halve send rate directly */ - hc->tx_x = max(hc->tx_x / 2, - (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); - ccid3_update_send_interval(hc); - } else { - /* - * Modify the cached value of X_recv - * - * If (X_calc > 2 * X_recv) - * X_recv = max(X_recv / 2, s / (2 * t_mbi)); - * Else - * X_recv = X_calc / 4; - * - * Note that X_recv is scaled by 2^6 while X_calc is not - */ - if (hc->tx_x_calc > (hc->tx_x_recv >> 5)) - hc->tx_x_recv = - max(hc->tx_x_recv / 2, - (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI)); - else { - hc->tx_x_recv = hc->tx_x_calc; - hc->tx_x_recv <<= 4; - } - ccid3_hc_tx_update_x(sk, NULL); - } - ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", - (unsigned long long)hc->tx_x); - - /* - * Set new timeout for the nofeedback timer. - * See comments in packet_recv() regarding the value of t_RTO. - */ - if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ - t_nfb = TFRC_INITIAL_TIMEOUT; - else - t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); - -restart_timer: - sk_reset_timer(sk, &hc->tx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/** - * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets - * @sk: socket to send packet from - * @skb: next packet candidate to send on @sk - * - * This function uses the convention of ccid_packet_dequeue_eval() and - * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. - */ -static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - ktime_t now = ktime_get_real(); - s64 delay; - - /* - * This function is called only for Data and DataAck packets. Sending - * zero-sized Data(Ack)s is theoretically possible, but for congestion - * control this case is pathological - ignore it. - */ - if (unlikely(skb->len == 0)) - return -EBADMSG; - - if (hc->tx_state == TFRC_SSTATE_NO_SENT) { - sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + - usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); - hc->tx_last_win_count = 0; - hc->tx_t_last_win_count = now; - - /* Set t_0 for initial packet */ - hc->tx_t_nom = now; - - hc->tx_s = skb->len; - - /* - * Use initial RTT sample when available: recommended by erratum - * to RFC 4342. This implements the initialisation procedure of - * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6. - */ - if (dp->dccps_syn_rtt) { - ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); - hc->tx_rtt = dp->dccps_syn_rtt; - hc->tx_x = rfc3390_initial_rate(sk); - hc->tx_t_ld = now; - } else { - /* - * Sender does not have RTT sample: - * - set fallback RTT (RFC 4340, 3.4) since a RTT value - * is needed in several parts (e.g. window counter); - * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. - */ - hc->tx_rtt = DCCP_FALLBACK_RTT; - hc->tx_x = hc->tx_s; - hc->tx_x <<= 6; - } - ccid3_update_send_interval(hc); - - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - } else { - delay = ktime_us_delta(hc->tx_t_nom, now); - ccid3_pr_debug("delay=%ld\n", (long)delay); - /* - * Scheduling of packet transmissions (RFC 5348, 8.3) - * - * if (t_now > t_nom - delta) - * // send the packet now - * else - * // send the packet in (t_nom - t_now) milliseconds. - */ - if (delay >= TFRC_T_DELTA) - return (u32)delay / USEC_PER_MSEC; - - ccid3_hc_tx_update_win_count(hc, now); - } - - /* prepare to send now (add options etc.) */ - dp->dccps_hc_tx_insert_options = 1; - DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count; - - /* set the nominal send time for the next following packet */ - hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); - return CCID_PACKET_SEND_AT_ONCE; -} - -static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - - ccid3_hc_tx_update_s(hc, len); - - if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss)) - DCCP_CRIT("packet history - out of memory!"); -} - -static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct tfrc_tx_hist_entry *acked; - ktime_t now; - unsigned long t_nfb; - u32 r_sample; - - /* we are only interested in ACKs */ - if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || - DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) - return; - /* - * Locate the acknowledged packet in the TX history. - * - * Returning "entry not found" here can for instance happen when - * - the host has not sent out anything (e.g. a passive server), - * - the Ack is outdated (packet with higher Ack number was received), - * - it is a bogus Ack (for a packet not sent on this connection). - */ - acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb)); - if (acked == NULL) - return; - /* For the sake of RTT sampling, ignore/remove all older entries */ - tfrc_tx_hist_purge(&acked->next); - - /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ - now = ktime_get_real(); - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); - hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); - - /* - * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 - */ - if (hc->tx_state == TFRC_SSTATE_NO_FBACK) { - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - - if (hc->tx_t_rto == 0) { - /* - * Initial feedback packet: Larger Initial Windows (4.2) - */ - hc->tx_x = rfc3390_initial_rate(sk); - hc->tx_t_ld = now; - - ccid3_update_send_interval(hc); - - goto done_computing_x; - } else if (hc->tx_p == 0) { - /* - * First feedback after nofeedback timer expiry (4.3) - */ - goto done_computing_x; - } - } - - /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hc->tx_p > 0) - hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p); - ccid3_hc_tx_update_x(sk, &now); - -done_computing_x: - ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " - "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), sk, hc->tx_rtt, r_sample, - hc->tx_s, hc->tx_p, hc->tx_x_calc, - (unsigned int)(hc->tx_x_recv >> 6), - (unsigned int)(hc->tx_x >> 6)); - - /* unschedule no feedback timer */ - sk_stop_timer(sk, &hc->tx_no_feedback_timer); - - /* - * As we have calculated new ipi, delta, t_nom it is possible - * that we now can send a packet, so wake up dccp_wait_for_ccid - */ - sk->sk_write_space(sk); - - /* - * Update timeout interval for the nofeedback timer. In order to control - * rate halving on networks with very low RTTs (<= 1 ms), use per-route - * tunable RTAX_RTO_MIN value as the lower bound. - */ - hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, - USEC_PER_SEC/HZ * tcp_rto_min(sk)); - /* - * Schedule no feedback timer to expire in - * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) - */ - t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); - - ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " - "expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); - - sk_reset_timer(sk, &hc->tx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); -} - -static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - __be32 opt_val; - - switch (option) { - case TFRC_OPT_RECEIVE_RATE: - case TFRC_OPT_LOSS_EVENT_RATE: - /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ - if (packet_type == DCCP_PKT_DATA) - break; - if (unlikely(optlen != 4)) { - DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, optlen, option); - return -EINVAL; - } - opt_val = ntohl(get_unaligned((__be32 *)optval)); - - if (option == TFRC_OPT_RECEIVE_RATE) { - /* Receive Rate is kept in units of 64 bytes/second */ - hc->tx_x_recv = opt_val; - hc->tx_x_recv <<= 6; - - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, opt_val); - } else { - /* Update the fixpoint Loss Event Rate fraction */ - hc->tx_p = tfrc_invert_loss_event_rate(opt_val); - - ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, opt_val); - } - } - return 0; -} - -static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid3_hc_tx_sock *hc = ccid_priv(ccid); - - hc->tx_state = TFRC_SSTATE_NO_SENT; - hc->tx_hist = NULL; - hc->sk = sk; - timer_setup(&hc->tx_no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, 0); - return 0; -} - -static void ccid3_hc_tx_exit(struct sock *sk) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - - sk_stop_timer(sk, &hc->tx_no_feedback_timer); - tfrc_tx_hist_purge(&hc->tx_hist); -} - -static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) -{ - info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; - info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; -} - -static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct tfrc_tx_info tfrc; - const void *val; - - switch (optname) { - case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(tfrc)) - return -EINVAL; - memset(&tfrc, 0, sizeof(tfrc)); - tfrc.tfrctx_x = hc->tx_x; - tfrc.tfrctx_x_recv = hc->tx_x_recv; - tfrc.tfrctx_x_calc = hc->tx_x_calc; - tfrc.tfrctx_rtt = hc->tx_rtt; - tfrc.tfrctx_p = hc->tx_p; - tfrc.tfrctx_rto = hc->tx_t_rto; - tfrc.tfrctx_ipi = hc->tx_t_ipi; - len = sizeof(tfrc); - val = &tfrc; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - -/* - * Receiver Half-Connection Routines - */ - -/* CCID3 feedback types */ -enum ccid3_fback_type { - CCID3_FBACK_NONE = 0, - CCID3_FBACK_INITIAL, - CCID3_FBACK_PERIODIC, - CCID3_FBACK_PARAM_CHANGE -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) -{ - static const char *const ccid3_rx_state_names[] = { - [TFRC_RSTATE_NO_DATA] = "NO_DATA", - [TFRC_RSTATE_DATA] = "DATA", - }; - - return ccid3_rx_state_names[state]; -} -#endif - -static void ccid3_hc_rx_set_state(struct sock *sk, - enum ccid3_hc_rx_states state) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - enum ccid3_hc_rx_states oldstate = hc->rx_state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_rx_state_name(oldstate), - ccid3_rx_state_name(state)); - WARN_ON(state == oldstate); - hc->rx_state = state; -} - -static void ccid3_hc_rx_send_feedback(struct sock *sk, - const struct sk_buff *skb, - enum ccid3_fback_type fbtype) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - ktime_t now = ktime_get(); - s64 delta = 0; - - switch (fbtype) { - case CCID3_FBACK_INITIAL: - hc->rx_x_recv = 0; - hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */ - break; - case CCID3_FBACK_PARAM_CHANGE: - /* - * When parameters change (new loss or p > p_prev), we do not - * have a reliable estimate for R_m of [RFC 3448, 6.2] and so - * need to reuse the previous value of X_recv. However, when - * X_recv was 0 (due to early loss), this would kill X down to - * s/t_mbi (i.e. one packet in 64 seconds). - * To avoid such drastic reduction, we approximate X_recv as - * the number of bytes since last feedback. - * This is a safe fallback, since X is bounded above by X_calc. - */ - if (hc->rx_x_recv > 0) - break; - fallthrough; - case CCID3_FBACK_PERIODIC: - delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback); - if (delta <= 0) - delta = 1; - hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); - break; - default: - return; - } - - ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta, - hc->rx_x_recv, hc->rx_pinv); - - hc->rx_tstamp_last_feedback = now; - hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval; - hc->rx_bytes_recv = 0; - - dp->dccps_hc_rx_insert_options = 1; - dccp_send_ack(sk); -} - -static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) -{ - const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - __be32 x_recv, pinv; - - if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) - return 0; - - if (dccp_packet_without_ack(skb)) - return 0; - - x_recv = htonl(hc->rx_x_recv); - pinv = htonl(hc->rx_pinv); - - if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE, - &pinv, sizeof(pinv)) || - dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE, - &x_recv, sizeof(x_recv))) - return -1; - - return 0; -} - -/** - * ccid3_first_li - Implements [RFC 5348, 6.3.1] - * @sk: socket to calculate loss interval for - * - * Determine the length of the first loss interval via inverse lookup. - * Assume that X_recv can be computed by the throughput equation - * s - * X_recv = -------- - * R * fval - * Find some p such that f(p) = fval; return 1/p (scaled). - */ -static u32 ccid3_first_li(struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - u32 x_recv, p; - s64 delta; - u64 fval; - - if (hc->rx_rtt == 0) { - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - hc->rx_rtt = DCCP_FALLBACK_RTT; - } - - delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback); - if (delta <= 0) - delta = 1; - x_recv = scaled_div32(hc->rx_bytes_recv, delta); - if (x_recv == 0) { /* would also trigger divide-by-zero */ - DCCP_WARN("X_recv==0\n"); - if (hc->rx_x_recv == 0) { - DCCP_BUG("stored value of X_recv is zero"); - return ~0U; - } - x_recv = hc->rx_x_recv; - } - - fval = scaled_div(hc->rx_s, hc->rx_rtt); - fval = scaled_div32(fval, x_recv); - p = tfrc_calc_x_reverse_lookup(fval); - - ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " - "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); - - return p == 0 ? ~0U : scaled_div(1, p); -} - -static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; - const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; - const bool is_data_packet = dccp_data_packet(skb); - - if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) { - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - do_feedback = CCID3_FBACK_INITIAL; - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); - hc->rx_s = payload; - /* - * Not necessary to update rx_bytes_recv here, - * since X_recv = 0 for the first feedback packet (cf. - * RFC 3448, 6.3) -- gerrit - */ - } - goto update_records; - } - - if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb)) - return; /* done receiving */ - - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - /* - * Update moving-average of s and the sum of received payload bytes - */ - hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9); - hc->rx_bytes_recv += payload; - } - - /* - * Perform loss detection and handle pending losses - */ - if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist, - skb, ndp, ccid3_first_li, sk)) { - do_feedback = CCID3_FBACK_PARAM_CHANGE; - goto done_receiving; - } - - if (tfrc_rx_hist_loss_pending(&hc->rx_hist)) - return; /* done receiving */ - - /* - * Handle data packets: RTT sampling and monitoring p - */ - if (unlikely(!is_data_packet)) - goto update_records; - - if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) { - const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb); - /* - * Empty loss history: no loss so far, hence p stays 0. - * Sample RTT values, since an RTT estimate is required for the - * computation of p when the first loss occurs; RFC 3448, 6.3.1. - */ - if (sample != 0) - hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9); - - } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) { - /* - * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean - * has decreased (resp. p has increased), send feedback now. - */ - do_feedback = CCID3_FBACK_PARAM_CHANGE; - } - - /* - * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 - */ - if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3) - do_feedback = CCID3_FBACK_PERIODIC; - -update_records: - tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp); - -done_receiving: - if (do_feedback) - ccid3_hc_rx_send_feedback(sk, skb, do_feedback); -} - -static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid_priv(ccid); - - hc->rx_state = TFRC_RSTATE_NO_DATA; - tfrc_lh_init(&hc->rx_li_hist); - return tfrc_rx_hist_alloc(&hc->rx_hist); -} - -static void ccid3_hc_rx_exit(struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - - tfrc_rx_hist_purge(&hc->rx_hist); - tfrc_lh_cleanup(&hc->rx_li_hist); -} - -static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) -{ - info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; - info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; -} - -static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - struct tfrc_rx_info rx_info; - const void *val; - - switch (optname) { - case DCCP_SOCKOPT_CCID_RX_INFO: - if (len < sizeof(rx_info)) - return -EINVAL; - rx_info.tfrcrx_x_recv = hc->rx_x_recv; - rx_info.tfrcrx_rtt = hc->rx_rtt; - rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv); - len = sizeof(rx_info); - val = &rx_info; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - -struct ccid_operations ccid3_ops = { - .ccid_id = DCCPC_CCID3, - .ccid_name = "TCP-Friendly Rate Control", - .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), - .ccid_hc_tx_init = ccid3_hc_tx_init, - .ccid_hc_tx_exit = ccid3_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, - .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, - .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, - .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock), - .ccid_hc_rx_init = ccid3_hc_rx_init, - .ccid_hc_rx_exit = ccid3_hc_rx_exit, - .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, - .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, - .ccid_hc_rx_get_info = ccid3_hc_rx_get_info, - .ccid_hc_tx_get_info = ccid3_hc_tx_get_info, - .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt, - .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -module_param(ccid3_debug, bool, 0644); -MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages"); -#endif diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h deleted file mode 100644 index 02e0fc9f6334b..0000000000000 --- a/net/dccp/ccids/ccid3.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#ifndef _DCCP_CCID3_H_ -#define _DCCP_CCID3_H_ - -#include -#include -#include -#include -#include "lib/tfrc.h" -#include "../ccid.h" - -/* Two seconds as per RFC 5348, 4.2 */ -#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) - -/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ -#define TFRC_T_MBI 64 - -/* - * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are - * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. - * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse - * resolution of HZ < 500 means that the error is below one timer tick (t_gran) - * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). - */ -#if (HZ >= 500) -# define TFRC_T_DELTA USEC_PER_MSEC -#else -# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) -#endif - -enum ccid3_options { - TFRC_OPT_LOSS_EVENT_RATE = 192, - TFRC_OPT_LOSS_INTERVALS = 193, - TFRC_OPT_RECEIVE_RATE = 194, -}; - -/* TFRC sender states */ -enum ccid3_hc_tx_states { - TFRC_SSTATE_NO_SENT = 1, - TFRC_SSTATE_NO_FBACK, - TFRC_SSTATE_FBACK, -}; - -/** - * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket - * @tx_x: Current sending rate in 64 * bytes per second - * @tx_x_recv: Receive rate in 64 * bytes per second - * @tx_x_calc: Calculated rate in bytes per second - * @tx_rtt: Estimate of current round trip time in usecs - * @tx_p: Current loss event rate (0-1) scaled by 1000000 - * @tx_s: Packet size in bytes - * @tx_t_rto: Nofeedback Timer setting in usecs - * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @tx_state: Sender state, one of %ccid3_hc_tx_states - * @tx_last_win_count: Last window counter sent - * @tx_t_last_win_count: Timestamp of earliest packet - * with last_win_count value sent - * @tx_no_feedback_timer: Handle to no feedback timer - * @tx_t_ld: Time last doubled during slow start - * @tx_t_nom: Nominal send time of next packet - * @tx_hist: Packet history - */ -struct ccid3_hc_tx_sock { - u64 tx_x; - u64 tx_x_recv; - u32 tx_x_calc; - u32 tx_rtt; - u32 tx_p; - u32 tx_t_rto; - u32 tx_t_ipi; - u16 tx_s; - enum ccid3_hc_tx_states tx_state:8; - u8 tx_last_win_count; - ktime_t tx_t_last_win_count; - struct timer_list tx_no_feedback_timer; - struct sock *sk; - ktime_t tx_t_ld; - ktime_t tx_t_nom; - struct tfrc_tx_hist_entry *tx_hist; -}; - -static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) -{ - struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); - BUG_ON(hctx == NULL); - return hctx; -} - -/* TFRC receiver states */ -enum ccid3_hc_rx_states { - TFRC_RSTATE_NO_DATA = 1, - TFRC_RSTATE_DATA, -}; - -/** - * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket - * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) - * @rx_state: Receiver state, one of %ccid3_hc_rx_states - * @rx_bytes_recv: Total sum of DCCP payload bytes - * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3) - * @rx_rtt: Receiver estimate of RTT - * @rx_tstamp_last_feedback: Time at which last feedback was sent - * @rx_hist: Packet history (loss detection + RTT sampling) - * @rx_li_hist: Loss Interval database - * @rx_s: Received packet size in bytes - * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5) - */ -struct ccid3_hc_rx_sock { - u8 rx_last_counter:4; - enum ccid3_hc_rx_states rx_state:8; - u32 rx_bytes_recv; - u32 rx_x_recv; - u32 rx_rtt; - ktime_t rx_tstamp_last_feedback; - struct tfrc_rx_hist rx_hist; - struct tfrc_loss_hist rx_li_hist; - u16 rx_s; -#define rx_pinv rx_li_hist.i_mean -}; - -static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) -{ - struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); - BUG_ON(hcrx == NULL); - return hcrx; -} - -#endif /* _DCCP_CCID3_H_ */ diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c deleted file mode 100644 index da95319842bbe..0000000000000 --- a/net/dccp/ccids/lib/loss_interval.c +++ /dev/null @@ -1,184 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include -#include "tfrc.h" - -static struct kmem_cache *tfrc_lh_slab __read_mostly; -/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */ -static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 }; - -/* implements LIFO semantics on the array */ -static inline u8 LIH_INDEX(const u8 ctr) -{ - return LIH_SIZE - 1 - (ctr % LIH_SIZE); -} - -/* the `counter' index always points at the next entry to be populated */ -static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh) -{ - return lh->counter ? lh->ring[LIH_INDEX(lh->counter - 1)] : NULL; -} - -/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */ -static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i) -{ - BUG_ON(i >= lh->counter); - return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length; -} - -/* - * On-demand allocation and de-allocation of entries - */ -static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh) -{ - if (lh->ring[LIH_INDEX(lh->counter)] == NULL) - lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab, - GFP_ATOMIC); - return lh->ring[LIH_INDEX(lh->counter)]; -} - -void tfrc_lh_cleanup(struct tfrc_loss_hist *lh) -{ - if (!tfrc_lh_is_initialised(lh)) - return; - - for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++) - if (lh->ring[LIH_INDEX(lh->counter)] != NULL) { - kmem_cache_free(tfrc_lh_slab, - lh->ring[LIH_INDEX(lh->counter)]); - lh->ring[LIH_INDEX(lh->counter)] = NULL; - } -} - -static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) -{ - u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0; - int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */ - - if (k <= 0) - return; - - for (i = 0; i <= k; i++) { - i_i = tfrc_lh_get_interval(lh, i); - - if (i < k) { - i_tot0 += i_i * tfrc_lh_weights[i]; - w_tot += tfrc_lh_weights[i]; - } - if (i > 0) - i_tot1 += i_i * tfrc_lh_weights[i-1]; - } - - lh->i_mean = max(i_tot0, i_tot1) / w_tot; -} - -/** - * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 - * @lh: histogram to update - * @skb: received socket triggering loss interval update - * - * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev - */ -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) -{ - struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); - u32 old_i_mean = lh->i_mean; - s64 len; - - if (cur == NULL) /* not initialised */ - return 0; - - len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; - - if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ - return 0; - - if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) - /* - * Implements RFC 4342, 10.2: - * If a packet S (skb) exists whose seqno comes `after' the one - * starting the current loss interval (cur) and if the modulo-16 - * distance from C(cur) to C(S) is greater than 4, consider all - * subsequent packets as belonging to a new loss interval. This - * test is necessary since CCVal may wrap between intervals. - */ - cur->li_is_closed = 1; - - if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ - return 0; - - cur->li_length = len; - tfrc_lh_calc_i_mean(lh); - - return lh->i_mean < old_i_mean; -} - -/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ -static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, - struct tfrc_rx_hist_entry *new_loss) -{ - return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 && - (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4); -} - -/** - * tfrc_lh_interval_add - Insert new record into the Loss Interval database - * @lh: Loss Interval database - * @rh: Receive history containing a fresh loss event - * @calc_first_li: Caller-dependent routine to compute length of first interval - * @sk: Used by @calc_first_li in caller-specific way (subtyping) - * - * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. - */ -int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, - u32 (*calc_first_li)(struct sock *), struct sock *sk) -{ - struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; - - if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) - return 0; - - new = tfrc_lh_demand_next(lh); - if (unlikely(new == NULL)) { - DCCP_CRIT("Cannot allocate/add loss record."); - return 0; - } - - new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; - new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval; - new->li_is_closed = 0; - - if (++lh->counter == 1) - lh->i_mean = new->li_length = (*calc_first_li)(sk); - else { - cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno); - new->li_length = dccp_delta_seqno(new->li_seqno, - tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1; - if (lh->counter > (2*LIH_SIZE)) - lh->counter -= LIH_SIZE; - - tfrc_lh_calc_i_mean(lh); - } - return 1; -} - -int __init tfrc_li_init(void) -{ - tfrc_lh_slab = kmem_cache_create("tfrc_li_hist", - sizeof(struct tfrc_loss_interval), 0, - SLAB_HWCACHE_ALIGN, NULL); - return tfrc_lh_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_li_exit(void) -{ - if (tfrc_lh_slab != NULL) { - kmem_cache_destroy(tfrc_lh_slab); - tfrc_lh_slab = NULL; - } -} diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h deleted file mode 100644 index c3d95f85e43b1..0000000000000 --- a/net/dccp/ccids/lib/loss_interval.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _DCCP_LI_HIST_ -#define _DCCP_LI_HIST_ -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include -#include -#include - -/* - * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than - * NINTERVAL, since the `open' interval I_0 is always stored as the first entry. - */ -#define NINTERVAL 8 -#define LIH_SIZE (NINTERVAL + 1) - -/** - * tfrc_loss_interval - Loss history record for TFRC-based protocols - * @li_seqno: Highest received seqno before the start of loss - * @li_ccval: The CCVal belonging to @li_seqno - * @li_is_closed: Whether @li_seqno is older than 1 RTT - * @li_length: Loss interval sequence length - */ -struct tfrc_loss_interval { - u64 li_seqno:48, - li_ccval:4, - li_is_closed:1; - u32 li_length; -}; - -/** - * tfrc_loss_hist - Loss record database - * @ring: Circular queue managed in LIFO manner - * @counter: Current count of entries (can be more than %LIH_SIZE) - * @i_mean: Current Average Loss Interval [RFC 3448, 5.4] - */ -struct tfrc_loss_hist { - struct tfrc_loss_interval *ring[LIH_SIZE]; - u8 counter; - u32 i_mean; -}; - -static inline void tfrc_lh_init(struct tfrc_loss_hist *lh) -{ - memset(lh, 0, sizeof(struct tfrc_loss_hist)); -} - -static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh) -{ - return lh->counter > 0; -} - -static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) -{ - return min(lh->counter, (u8)LIH_SIZE); -} - -struct tfrc_rx_hist; - -int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, - u32 (*first_li)(struct sock *), struct sock *); -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); -void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); - -#endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c deleted file mode 100644 index 0cdda3c66fb5e..0000000000000 --- a/net/dccp/ccids/lib/packet_history.c +++ /dev/null @@ -1,439 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#include -#include -#include "packet_history.h" -#include "../../dccp.h" - -/* - * Transmitter History Routines - */ -static struct kmem_cache *tfrc_tx_hist_slab; - -int __init tfrc_tx_packet_history_init(void) -{ - tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist", - sizeof(struct tfrc_tx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return tfrc_tx_hist_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_tx_packet_history_exit(void) -{ - if (tfrc_tx_hist_slab != NULL) { - kmem_cache_destroy(tfrc_tx_hist_slab); - tfrc_tx_hist_slab = NULL; - } -} - -int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) -{ - struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); - - if (entry == NULL) - return -ENOBUFS; - entry->seqno = seqno; - entry->stamp = ktime_get_real(); - entry->next = *headp; - *headp = entry; - return 0; -} - -void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) -{ - struct tfrc_tx_hist_entry *head = *headp; - - while (head != NULL) { - struct tfrc_tx_hist_entry *next = head->next; - - kmem_cache_free(tfrc_tx_hist_slab, head); - head = next; - } - - *headp = NULL; -} - -/* - * Receiver History Routines - */ -static struct kmem_cache *tfrc_rx_hist_slab; - -int __init tfrc_rx_packet_history_init(void) -{ - tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache", - sizeof(struct tfrc_rx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_rx_packet_history_exit(void) -{ - if (tfrc_rx_hist_slab != NULL) { - kmem_cache_destroy(tfrc_rx_hist_slab); - tfrc_rx_hist_slab = NULL; - } -} - -static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry, - const struct sk_buff *skb, - const u64 ndp) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - - entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - entry->tfrchrx_ccval = dh->dccph_ccval; - entry->tfrchrx_type = dh->dccph_type; - entry->tfrchrx_ndp = ndp; - entry->tfrchrx_tstamp = ktime_get_real(); -} - -void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, - const struct sk_buff *skb, - const u64 ndp) -{ - struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h); - - tfrc_rx_hist_entry_from_skb(entry, skb, ndp); -} - -/* has the packet contained in skb been seen before? */ -int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) -{ - const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq; - int i; - - if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0) - return 1; - - for (i = 1; i <= h->loss_count; i++) - if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq) - return 1; - - return 0; -} - -static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) -{ - const u8 idx_a = tfrc_rx_hist_index(h, a), - idx_b = tfrc_rx_hist_index(h, b); - - swap(h->ring[idx_a], h->ring[idx_b]); -} - -/* - * Private helper functions for loss detection. - * - * In the descriptions, `Si' refers to the sequence number of entry number i, - * whose NDP count is `Ni' (lower case is used for variables). - * Note: All __xxx_loss functions expect that a test against duplicates has been - * performed already: the seqno of the skb must not be less than the seqno - * of loss_prev; and it must not equal that of any valid history entry. - */ -static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ - h->loss_count = 1; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); - } -} - -static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */ - h->loss_count = 2; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2); - return; - } - - /* S0 < S2 < S1 */ - - if (dccp_loss_free(s0, s2, n2)) { - u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; - - if (dccp_loss_free(s2, s1, n1)) { - /* hole is filled: S0, S2, and S1 are consecutive */ - h->loss_count = 0; - h->loss_start = tfrc_rx_hist_index(h, 1); - } else - /* gap between S2 and S1: just update loss_prev */ - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); - - } else { /* gap between S0 and S2 */ - /* - * Reorder history to insert S2 between S0 and S1 - */ - tfrc_rx_hist_swap(h, 0, 3); - h->loss_start = tfrc_rx_hist_index(h, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2); - h->loss_count = 2; - } -} - -/* return 1 if a new loss event has been identified */ -static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, - s3 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */ - h->loss_count = 3; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3); - return 1; - } - - /* S3 < S2 */ - - if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */ - /* - * Reorder history to insert S3 between S1 and S2 - */ - tfrc_rx_hist_swap(h, 2, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3); - h->loss_count = 3; - return 1; - } - - /* S0 < S3 < S1 */ - - if (dccp_loss_free(s0, s3, n3)) { - u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; - - if (dccp_loss_free(s3, s1, n1)) { - /* hole between S0 and S1 filled by S3 */ - u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp; - - if (dccp_loss_free(s1, s2, n2)) { - /* entire hole filled by S0, S3, S1, S2 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 0; - } else { - /* gap remains between S1 and S2 */ - h->loss_start = tfrc_rx_hist_index(h, 1); - h->loss_count = 1; - } - - } else /* gap exists between S3 and S1, loss_count stays at 2 */ - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3); - - return 0; - } - - /* - * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3 - * Reorder history to insert S3 between S0 and S1. - */ - tfrc_rx_hist_swap(h, 0, 3); - h->loss_start = tfrc_rx_hist_index(h, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3); - h->loss_count = 3; - - return 1; -} - -/* recycle RX history records to continue loss detection if necessary */ -static void __three_after_loss(struct tfrc_rx_hist *h) -{ - /* - * At this stage we know already that there is a gap between S0 and S1 - * (since S0 was the highest sequence number received before detecting - * the loss). To recycle the loss record, it is thus only necessary to - * check for other possible gaps between S1/S2 and between S2/S3. - */ - u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, - s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno; - u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp, - n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp; - - if (dccp_loss_free(s1, s2, n2)) { - - if (dccp_loss_free(s2, s3, n3)) { - /* no gap between S2 and S3: entire hole is filled */ - h->loss_start = tfrc_rx_hist_index(h, 3); - h->loss_count = 0; - } else { - /* gap between S2 and S3 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 1; - } - - } else { /* gap between S1 and S2 */ - h->loss_start = tfrc_rx_hist_index(h, 1); - h->loss_count = 2; - } -} - -/** - * tfrc_rx_handle_loss - Loss detection and further processing - * @h: The non-empty RX history object - * @lh: Loss Intervals database to update - * @skb: Currently received packet - * @ndp: The NDP count belonging to @skb - * @calc_first_li: Caller-dependent computation of first loss interval in @lh - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) - * - * Chooses action according to pending loss, updates LI database when a new - * loss was detected, and does required post-processing. Returns 1 when caller - * should send feedback, 0 otherwise. - * Since it also takes care of reordering during loss detection and updates the - * records accordingly, the caller should not perform any more RX history - * operations when loss_count is greater than 0 after calling this function. - */ -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*calc_first_li)(struct sock *), struct sock *sk) -{ - int is_new_loss = 0; - - if (h->loss_count == 0) { - __do_track_loss(h, skb, ndp); - } else if (h->loss_count == 1) { - __one_after_loss(h, skb, ndp); - } else if (h->loss_count != 2) { - DCCP_BUG("invalid loss_count %d", h->loss_count); - } else if (__two_after_loss(h, skb, ndp)) { - /* - * Update Loss Interval database and recycle RX records - */ - is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); - __three_after_loss(h); - } - return is_new_loss; -} - -int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; i++) { - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); - if (h->ring[i] == NULL) - goto out_free; - } - - h->loss_count = h->loss_start = 0; - return 0; - -out_free: - while (i-- != 0) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } - return -ENOBUFS; -} - -void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; ++i) - if (h->ring[i] != NULL) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } -} - -/** - * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against - * @h: The non-empty RX history object - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) -{ - return h->ring[0]; -} - -/** - * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry - * @h: The non-empty RX history object - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) -{ - return h->ring[h->rtt_sample_prev]; -} - -/** - * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal - * @h: receive histogram - * @skb: packet containing timestamp. - * - * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able - * to compute a sample with given data - calling function should check this. - */ -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) -{ - u32 sample = 0, - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - - if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ - if (h->rtt_sample_prev == 2) { /* previous candidate stored */ - sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - if (sample) - sample = 4 / sample * - ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); - else /* - * FIXME: This condition is in principle not - * possible but occurs when CCID is used for - * two-way data traffic. I have tried to trace - * it, but the cause does not seem to be here. - */ - DCCP_BUG("please report to dccp@vger.kernel.org" - " => prev = %u, last = %u", - tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - } else if (delta_v < 1) { - h->rtt_sample_prev = 1; - goto keep_ref_for_next_time; - } - - } else if (delta_v == 4) /* optimal match */ - sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); - else { /* suboptimal match */ - h->rtt_sample_prev = 2; - goto keep_ref_for_next_time; - } - - if (unlikely(sample > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %u too large, using max\n", sample); - sample = DCCP_SANE_RTT_MAX; - } - - h->rtt_sample_prev = 0; /* use current entry as next reference */ -keep_ref_for_next_time: - - return sample; -} diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h deleted file mode 100644 index 159cc9326eabc..0000000000000 --- a/net/dccp/ccids/lib/packet_history.h +++ /dev/null @@ -1,142 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Packet RX/TX history data structures and routines for TFRC-based protocols. - * - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#ifndef _DCCP_PKT_HIST_ -#define _DCCP_PKT_HIST_ - -#include -#include -#include "tfrc.h" - -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - -static inline struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - return head; -} - -int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); -void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); - -/* Subtraction a-b modulo-16, respects circular wrap-around */ -#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) - -/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */ -#define TFRC_NDUPACK 3 - -/** - * tfrc_rx_hist_entry - Store information about a single received packet - * @tfrchrx_seqno: DCCP packet sequence number - * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1) - * @tfrchrx_ndp: the NDP count (if any) of the packet - * @tfrchrx_tstamp: actual receive time of packet - */ -struct tfrc_rx_hist_entry { - u64 tfrchrx_seqno:48, - tfrchrx_ccval:4, - tfrchrx_type:4; - u64 tfrchrx_ndp:48; - ktime_t tfrchrx_tstamp; -}; - -/** - * tfrc_rx_hist - RX history structure for TFRC-based protocols - * @ring: Packet history for RTT sampling and loss detection - * @loss_count: Number of entries in circular history - * @loss_start: Movable index (for loss detection) - * @rtt_sample_prev: Used during RTT sampling, points to candidate entry - */ -struct tfrc_rx_hist { - struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; - u8 loss_count:2, - loss_start:2; -#define rtt_sample_prev loss_start -}; - -/** - * tfrc_rx_hist_index - index to reach n-th entry after loss_start - */ -static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n) -{ - return (h->loss_start + n) & TFRC_NDUPACK; -} - -/** - * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h) -{ - return h->ring[tfrc_rx_hist_index(h, h->loss_count)]; -} - -/** - * tfrc_rx_hist_entry - return the n-th history entry after loss_start - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n) -{ - return h->ring[tfrc_rx_hist_index(h, n)]; -} - -/** - * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h) -{ - return h->ring[h->loss_start]; -} - -/* indicate whether previously a packet was detected missing */ -static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) -{ - return h->loss_count > 0; -} - -void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, - const u64 ndp); - -int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); - -struct tfrc_loss_hist; -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *sk), struct sock *sk); -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb); -int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); -void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); - -#endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c deleted file mode 100644 index d7f265e1f50c8..0000000000000 --- a/net/dccp/ccids/lib/tfrc.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * TFRC library initialisation - * - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2007 Arnaldo Carvalho de Melo - */ -#include -#include "tfrc.h" - -#ifdef CONFIG_IP_DCCP_TFRC_DEBUG -bool tfrc_debug; -module_param(tfrc_debug, bool, 0644); -MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages"); -#endif - -int __init tfrc_lib_init(void) -{ - int rc = tfrc_li_init(); - - if (rc) - goto out; - - rc = tfrc_tx_packet_history_init(); - if (rc) - goto out_free_loss_intervals; - - rc = tfrc_rx_packet_history_init(); - if (rc) - goto out_free_tx_history; - return 0; - -out_free_tx_history: - tfrc_tx_packet_history_exit(); -out_free_loss_intervals: - tfrc_li_exit(); -out: - return rc; -} - -void tfrc_lib_exit(void) -{ - tfrc_rx_packet_history_exit(); - tfrc_tx_packet_history_exit(); - tfrc_li_exit(); -} diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h deleted file mode 100644 index 0a63e8750cc5b..0000000000000 --- a/net/dccp/ccids/lib/tfrc.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _TFRC_H_ -#define _TFRC_H_ -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-6 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ -#include -#include -#include "../../dccp.h" - -/* internal includes that this library exports: */ -#include "loss_interval.h" -#include "packet_history.h" - -#ifdef CONFIG_IP_DCCP_TFRC_DEBUG -extern bool tfrc_debug; -#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a) -#else -#define tfrc_pr_debug(format, a...) -#endif - -/* integer-arithmetic divisions of type (a * 1000000)/b */ -static inline u64 scaled_div(u64 a, u64 b) -{ - BUG_ON(b == 0); - return div64_u64(a * 1000000, b); -} - -static inline u32 scaled_div32(u64 a, u64 b) -{ - u64 result = scaled_div(a, b); - - if (result > UINT_MAX) { - DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX", - (unsigned long long)a, (unsigned long long)b); - return UINT_MAX; - } - return result; -} - -/** - * tfrc_ewma - Exponentially weighted moving average - * @weight: Weight to be used as damping factor, in units of 1/10 - */ -static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) -{ - return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval; -} - -u32 tfrc_calc_x(u16 s, u32 R, u32 p); -u32 tfrc_calc_x_reverse_lookup(u32 fvalue); -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); - -int tfrc_tx_packet_history_init(void); -void tfrc_tx_packet_history_exit(void); -int tfrc_rx_packet_history_init(void); -void tfrc_rx_packet_history_exit(void); - -int tfrc_li_init(void); -void tfrc_li_exit(void); - -#ifdef CONFIG_IP_DCCP_TFRC_LIB -int tfrc_lib_init(void); -void tfrc_lib_exit(void); -#else -#define tfrc_lib_init() (0) -#define tfrc_lib_exit() -#endif -#endif /* _TFRC_H_ */ diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c deleted file mode 100644 index 92a8c6bea3167..0000000000000 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ /dev/null @@ -1,702 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ - -#include -#include "../../dccp.h" -#include "tfrc.h" - -#define TFRC_CALC_X_ARRSIZE 500 -#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ -#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) - -/* - TFRC TCP Reno Throughput Equation Lookup Table for f(p) - - The following two-column lookup table implements a part of the TCP throughput - equation from [RFC 3448, sec. 3.1]: - - s - X_calc = -------------------------------------------------------------- - R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3)) - - Where: - X is the transmit rate in bytes/second - s is the packet size in bytes - R is the round trip time in seconds - p is the loss event rate, between 0 and 1.0, of the number of loss - events as a fraction of the number of packets transmitted - t_RTO is the TCP retransmission timeout value in seconds - b is the number of packets acknowledged by a single TCP ACK - - We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes: - - s - X_calc = ------------------------------------------------------- - R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3)) - - which we can break down into: - - s - X_calc = --------- - R * f(p) - - where f(p) is given for 0 < p <= 1 by: - - f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3) - - Since this is kernel code, floating-point arithmetic is avoided in favour of - integer arithmetic. This means that nearly all fractional parameters are - scaled by 1000000: - * the parameters p and R - * the return result f(p) - The lookup table therefore actually tabulates the following function g(q): - - g(q) = 1000000 * f(q/1000000) - - Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer - granularity for the practically more relevant case of small values of p (up to - 5%), the second column is used; the first one ranges up to 100%. This split - corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also - determines the smallest resolution possible with this lookup table: - - TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE - - The entire table is generated by: - for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { - lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE); - lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); - } - - With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, - lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) - lookup[M][0] = g(1000000) = 1000000 * f(100%) - lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%) - lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) - - In summary, the two columns represent f(p) for the following ranges: - * The first column is for 0.002 <= p <= 1.0 - * The second column is for 0.0001 <= p <= 0.05 - Where the columns overlap, the second (finer-grained) is given preference, - i.e. the first column is used only for p >= 0.05. - */ -static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { - { 37172, 8172 }, - { 53499, 11567 }, - { 66664, 14180 }, - { 78298, 16388 }, - { 89021, 18339 }, - { 99147, 20108 }, - { 108858, 21738 }, - { 118273, 23260 }, - { 127474, 24693 }, - { 136520, 26052 }, - { 145456, 27348 }, - { 154316, 28589 }, - { 163130, 29783 }, - { 171919, 30935 }, - { 180704, 32049 }, - { 189502, 33130 }, - { 198328, 34180 }, - { 207194, 35202 }, - { 216114, 36198 }, - { 225097, 37172 }, - { 234153, 38123 }, - { 243294, 39055 }, - { 252527, 39968 }, - { 261861, 40864 }, - { 271305, 41743 }, - { 280866, 42607 }, - { 290553, 43457 }, - { 300372, 44293 }, - { 310333, 45117 }, - { 320441, 45929 }, - { 330705, 46729 }, - { 341131, 47518 }, - { 351728, 48297 }, - { 362501, 49066 }, - { 373460, 49826 }, - { 384609, 50577 }, - { 395958, 51320 }, - { 407513, 52054 }, - { 419281, 52780 }, - { 431270, 53499 }, - { 443487, 54211 }, - { 455940, 54916 }, - { 468635, 55614 }, - { 481581, 56306 }, - { 494785, 56991 }, - { 508254, 57671 }, - { 521996, 58345 }, - { 536019, 59014 }, - { 550331, 59677 }, - { 564939, 60335 }, - { 579851, 60988 }, - { 595075, 61636 }, - { 610619, 62279 }, - { 626491, 62918 }, - { 642700, 63553 }, - { 659253, 64183 }, - { 676158, 64809 }, - { 693424, 65431 }, - { 711060, 66050 }, - { 729073, 66664 }, - { 747472, 67275 }, - { 766266, 67882 }, - { 785464, 68486 }, - { 805073, 69087 }, - { 825103, 69684 }, - { 845562, 70278 }, - { 866460, 70868 }, - { 887805, 71456 }, - { 909606, 72041 }, - { 931873, 72623 }, - { 954614, 73202 }, - { 977839, 73778 }, - { 1001557, 74352 }, - { 1025777, 74923 }, - { 1050508, 75492 }, - { 1075761, 76058 }, - { 1101544, 76621 }, - { 1127867, 77183 }, - { 1154739, 77741 }, - { 1182172, 78298 }, - { 1210173, 78852 }, - { 1238753, 79405 }, - { 1267922, 79955 }, - { 1297689, 80503 }, - { 1328066, 81049 }, - { 1359060, 81593 }, - { 1390684, 82135 }, - { 1422947, 82675 }, - { 1455859, 83213 }, - { 1489430, 83750 }, - { 1523671, 84284 }, - { 1558593, 84817 }, - { 1594205, 85348 }, - { 1630518, 85878 }, - { 1667543, 86406 }, - { 1705290, 86932 }, - { 1743770, 87457 }, - { 1782994, 87980 }, - { 1822973, 88501 }, - { 1863717, 89021 }, - { 1905237, 89540 }, - { 1947545, 90057 }, - { 1990650, 90573 }, - { 2034566, 91087 }, - { 2079301, 91600 }, - { 2124869, 92111 }, - { 2171279, 92622 }, - { 2218543, 93131 }, - { 2266673, 93639 }, - { 2315680, 94145 }, - { 2365575, 94650 }, - { 2416371, 95154 }, - { 2468077, 95657 }, - { 2520707, 96159 }, - { 2574271, 96660 }, - { 2628782, 97159 }, - { 2684250, 97658 }, - { 2740689, 98155 }, - { 2798110, 98651 }, - { 2856524, 99147 }, - { 2915944, 99641 }, - { 2976382, 100134 }, - { 3037850, 100626 }, - { 3100360, 101117 }, - { 3163924, 101608 }, - { 3228554, 102097 }, - { 3294263, 102586 }, - { 3361063, 103073 }, - { 3428966, 103560 }, - { 3497984, 104045 }, - { 3568131, 104530 }, - { 3639419, 105014 }, - { 3711860, 105498 }, - { 3785467, 105980 }, - { 3860253, 106462 }, - { 3936229, 106942 }, - { 4013410, 107422 }, - { 4091808, 107902 }, - { 4171435, 108380 }, - { 4252306, 108858 }, - { 4334431, 109335 }, - { 4417825, 109811 }, - { 4502501, 110287 }, - { 4588472, 110762 }, - { 4675750, 111236 }, - { 4764349, 111709 }, - { 4854283, 112182 }, - { 4945564, 112654 }, - { 5038206, 113126 }, - { 5132223, 113597 }, - { 5227627, 114067 }, - { 5324432, 114537 }, - { 5422652, 115006 }, - { 5522299, 115474 }, - { 5623389, 115942 }, - { 5725934, 116409 }, - { 5829948, 116876 }, - { 5935446, 117342 }, - { 6042439, 117808 }, - { 6150943, 118273 }, - { 6260972, 118738 }, - { 6372538, 119202 }, - { 6485657, 119665 }, - { 6600342, 120128 }, - { 6716607, 120591 }, - { 6834467, 121053 }, - { 6953935, 121514 }, - { 7075025, 121976 }, - { 7197752, 122436 }, - { 7322131, 122896 }, - { 7448175, 123356 }, - { 7575898, 123815 }, - { 7705316, 124274 }, - { 7836442, 124733 }, - { 7969291, 125191 }, - { 8103877, 125648 }, - { 8240216, 126105 }, - { 8378321, 126562 }, - { 8518208, 127018 }, - { 8659890, 127474 }, - { 8803384, 127930 }, - { 8948702, 128385 }, - { 9095861, 128840 }, - { 9244875, 129294 }, - { 9395760, 129748 }, - { 9548529, 130202 }, - { 9703198, 130655 }, - { 9859782, 131108 }, - { 10018296, 131561 }, - { 10178755, 132014 }, - { 10341174, 132466 }, - { 10505569, 132917 }, - { 10671954, 133369 }, - { 10840345, 133820 }, - { 11010757, 134271 }, - { 11183206, 134721 }, - { 11357706, 135171 }, - { 11534274, 135621 }, - { 11712924, 136071 }, - { 11893673, 136520 }, - { 12076536, 136969 }, - { 12261527, 137418 }, - { 12448664, 137867 }, - { 12637961, 138315 }, - { 12829435, 138763 }, - { 13023101, 139211 }, - { 13218974, 139658 }, - { 13417071, 140106 }, - { 13617407, 140553 }, - { 13819999, 140999 }, - { 14024862, 141446 }, - { 14232012, 141892 }, - { 14441465, 142339 }, - { 14653238, 142785 }, - { 14867346, 143230 }, - { 15083805, 143676 }, - { 15302632, 144121 }, - { 15523842, 144566 }, - { 15747453, 145011 }, - { 15973479, 145456 }, - { 16201939, 145900 }, - { 16432847, 146345 }, - { 16666221, 146789 }, - { 16902076, 147233 }, - { 17140429, 147677 }, - { 17381297, 148121 }, - { 17624696, 148564 }, - { 17870643, 149007 }, - { 18119154, 149451 }, - { 18370247, 149894 }, - { 18623936, 150336 }, - { 18880241, 150779 }, - { 19139176, 151222 }, - { 19400759, 151664 }, - { 19665007, 152107 }, - { 19931936, 152549 }, - { 20201564, 152991 }, - { 20473907, 153433 }, - { 20748982, 153875 }, - { 21026807, 154316 }, - { 21307399, 154758 }, - { 21590773, 155199 }, - { 21876949, 155641 }, - { 22165941, 156082 }, - { 22457769, 156523 }, - { 22752449, 156964 }, - { 23049999, 157405 }, - { 23350435, 157846 }, - { 23653774, 158287 }, - { 23960036, 158727 }, - { 24269236, 159168 }, - { 24581392, 159608 }, - { 24896521, 160049 }, - { 25214642, 160489 }, - { 25535772, 160929 }, - { 25859927, 161370 }, - { 26187127, 161810 }, - { 26517388, 162250 }, - { 26850728, 162690 }, - { 27187165, 163130 }, - { 27526716, 163569 }, - { 27869400, 164009 }, - { 28215234, 164449 }, - { 28564236, 164889 }, - { 28916423, 165328 }, - { 29271815, 165768 }, - { 29630428, 166208 }, - { 29992281, 166647 }, - { 30357392, 167087 }, - { 30725779, 167526 }, - { 31097459, 167965 }, - { 31472452, 168405 }, - { 31850774, 168844 }, - { 32232445, 169283 }, - { 32617482, 169723 }, - { 33005904, 170162 }, - { 33397730, 170601 }, - { 33792976, 171041 }, - { 34191663, 171480 }, - { 34593807, 171919 }, - { 34999428, 172358 }, - { 35408544, 172797 }, - { 35821174, 173237 }, - { 36237335, 173676 }, - { 36657047, 174115 }, - { 37080329, 174554 }, - { 37507197, 174993 }, - { 37937673, 175433 }, - { 38371773, 175872 }, - { 38809517, 176311 }, - { 39250924, 176750 }, - { 39696012, 177190 }, - { 40144800, 177629 }, - { 40597308, 178068 }, - { 41053553, 178507 }, - { 41513554, 178947 }, - { 41977332, 179386 }, - { 42444904, 179825 }, - { 42916290, 180265 }, - { 43391509, 180704 }, - { 43870579, 181144 }, - { 44353520, 181583 }, - { 44840352, 182023 }, - { 45331092, 182462 }, - { 45825761, 182902 }, - { 46324378, 183342 }, - { 46826961, 183781 }, - { 47333531, 184221 }, - { 47844106, 184661 }, - { 48358706, 185101 }, - { 48877350, 185541 }, - { 49400058, 185981 }, - { 49926849, 186421 }, - { 50457743, 186861 }, - { 50992759, 187301 }, - { 51531916, 187741 }, - { 52075235, 188181 }, - { 52622735, 188622 }, - { 53174435, 189062 }, - { 53730355, 189502 }, - { 54290515, 189943 }, - { 54854935, 190383 }, - { 55423634, 190824 }, - { 55996633, 191265 }, - { 56573950, 191706 }, - { 57155606, 192146 }, - { 57741621, 192587 }, - { 58332014, 193028 }, - { 58926806, 193470 }, - { 59526017, 193911 }, - { 60129666, 194352 }, - { 60737774, 194793 }, - { 61350361, 195235 }, - { 61967446, 195677 }, - { 62589050, 196118 }, - { 63215194, 196560 }, - { 63845897, 197002 }, - { 64481179, 197444 }, - { 65121061, 197886 }, - { 65765563, 198328 }, - { 66414705, 198770 }, - { 67068508, 199213 }, - { 67726992, 199655 }, - { 68390177, 200098 }, - { 69058085, 200540 }, - { 69730735, 200983 }, - { 70408147, 201426 }, - { 71090343, 201869 }, - { 71777343, 202312 }, - { 72469168, 202755 }, - { 73165837, 203199 }, - { 73867373, 203642 }, - { 74573795, 204086 }, - { 75285124, 204529 }, - { 76001380, 204973 }, - { 76722586, 205417 }, - { 77448761, 205861 }, - { 78179926, 206306 }, - { 78916102, 206750 }, - { 79657310, 207194 }, - { 80403571, 207639 }, - { 81154906, 208084 }, - { 81911335, 208529 }, - { 82672880, 208974 }, - { 83439562, 209419 }, - { 84211402, 209864 }, - { 84988421, 210309 }, - { 85770640, 210755 }, - { 86558080, 211201 }, - { 87350762, 211647 }, - { 88148708, 212093 }, - { 88951938, 212539 }, - { 89760475, 212985 }, - { 90574339, 213432 }, - { 91393551, 213878 }, - { 92218133, 214325 }, - { 93048107, 214772 }, - { 93883493, 215219 }, - { 94724314, 215666 }, - { 95570590, 216114 }, - { 96422343, 216561 }, - { 97279594, 217009 }, - { 98142366, 217457 }, - { 99010679, 217905 }, - { 99884556, 218353 }, - { 100764018, 218801 }, - { 101649086, 219250 }, - { 102539782, 219698 }, - { 103436128, 220147 }, - { 104338146, 220596 }, - { 105245857, 221046 }, - { 106159284, 221495 }, - { 107078448, 221945 }, - { 108003370, 222394 }, - { 108934074, 222844 }, - { 109870580, 223294 }, - { 110812910, 223745 }, - { 111761087, 224195 }, - { 112715133, 224646 }, - { 113675069, 225097 }, - { 114640918, 225548 }, - { 115612702, 225999 }, - { 116590442, 226450 }, - { 117574162, 226902 }, - { 118563882, 227353 }, - { 119559626, 227805 }, - { 120561415, 228258 }, - { 121569272, 228710 }, - { 122583219, 229162 }, - { 123603278, 229615 }, - { 124629471, 230068 }, - { 125661822, 230521 }, - { 126700352, 230974 }, - { 127745083, 231428 }, - { 128796039, 231882 }, - { 129853241, 232336 }, - { 130916713, 232790 }, - { 131986475, 233244 }, - { 133062553, 233699 }, - { 134144966, 234153 }, - { 135233739, 234608 }, - { 136328894, 235064 }, - { 137430453, 235519 }, - { 138538440, 235975 }, - { 139652876, 236430 }, - { 140773786, 236886 }, - { 141901190, 237343 }, - { 143035113, 237799 }, - { 144175576, 238256 }, - { 145322604, 238713 }, - { 146476218, 239170 }, - { 147636442, 239627 }, - { 148803298, 240085 }, - { 149976809, 240542 }, - { 151156999, 241000 }, - { 152343890, 241459 }, - { 153537506, 241917 }, - { 154737869, 242376 }, - { 155945002, 242835 }, - { 157158929, 243294 }, - { 158379673, 243753 }, - { 159607257, 244213 }, - { 160841704, 244673 }, - { 162083037, 245133 }, - { 163331279, 245593 }, - { 164586455, 246054 }, - { 165848586, 246514 }, - { 167117696, 246975 }, - { 168393810, 247437 }, - { 169676949, 247898 }, - { 170967138, 248360 }, - { 172264399, 248822 }, - { 173568757, 249284 }, - { 174880235, 249747 }, - { 176198856, 250209 }, - { 177524643, 250672 }, - { 178857621, 251136 }, - { 180197813, 251599 }, - { 181545242, 252063 }, - { 182899933, 252527 }, - { 184261908, 252991 }, - { 185631191, 253456 }, - { 187007807, 253920 }, - { 188391778, 254385 }, - { 189783129, 254851 }, - { 191181884, 255316 }, - { 192588065, 255782 }, - { 194001698, 256248 }, - { 195422805, 256714 }, - { 196851411, 257181 }, - { 198287540, 257648 }, - { 199731215, 258115 }, - { 201182461, 258582 }, - { 202641302, 259050 }, - { 204107760, 259518 }, - { 205581862, 259986 }, - { 207063630, 260454 }, - { 208553088, 260923 }, - { 210050262, 261392 }, - { 211555174, 261861 }, - { 213067849, 262331 }, - { 214588312, 262800 }, - { 216116586, 263270 }, - { 217652696, 263741 }, - { 219196666, 264211 }, - { 220748520, 264682 }, - { 222308282, 265153 }, - { 223875978, 265625 }, - { 225451630, 266097 }, - { 227035265, 266569 }, - { 228626905, 267041 }, - { 230226576, 267514 }, - { 231834302, 267986 }, - { 233450107, 268460 }, - { 235074016, 268933 }, - { 236706054, 269407 }, - { 238346244, 269881 }, - { 239994613, 270355 }, - { 241651183, 270830 }, - { 243315981, 271305 } -}; - -/* return largest index i such that fval <= lookup[i][small] */ -static inline u32 tfrc_binsearch(u32 fval, u8 small) -{ - u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; - - while (low < high) { - try = (low + high) / 2; - if (fval <= tfrc_calc_x_lookup[try][small]) - high = try; - else - low = try + 1; - } - return high; -} - -/** - * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 - * @s: packet size in bytes - * @R: RTT scaled by 1000000 (i.e., microseconds) - * @p: loss ratio estimate scaled by 1000000 - * - * Returns X_calc in bytes per second (not scaled). - */ -u32 tfrc_calc_x(u16 s, u32 R, u32 p) -{ - u16 index; - u32 f; - u64 result; - - /* check against invalid parameters and divide-by-zero */ - BUG_ON(p > 1000000); /* p must not exceed 100% */ - BUG_ON(p == 0); /* f(0) = 0, divide by zero */ - if (R == 0) { /* possible divide by zero */ - DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); - return ~0U; - } - - if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ - if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ - DCCP_WARN("Value of p (%d) below resolution. " - "Substituting %d\n", p, TFRC_SMALLEST_P); - index = 0; - } else /* 0.0001 <= p <= 0.05 */ - index = p/TFRC_SMALLEST_P - 1; - - f = tfrc_calc_x_lookup[index][1]; - - } else { /* 0.05 < p <= 1.00 */ - index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; - - f = tfrc_calc_x_lookup[index][0]; - } - - /* - * Compute X = s/(R*f(p)) in bytes per second. - * Since f(p) and R are both scaled by 1000000, we need to multiply by - * 1000000^2. To avoid overflow, the result is computed in two stages. - * This works under almost all reasonable operational conditions, for a - * wide range of parameters. Yet, should some strange combination of - * parameters result in overflow, the use of scaled_div32 will catch - * this and return UINT_MAX - which is a logically adequate consequence. - */ - result = scaled_div(s, R); - return scaled_div32(result, f); -} - -/** - * tfrc_calc_x_reverse_lookup - try to find p given f(p) - * @fvalue: function value to match, scaled by 1000000 - * - * Returns closest match for p, also scaled by 1000000 - */ -u32 tfrc_calc_x_reverse_lookup(u32 fvalue) -{ - int index; - - if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ - return 0; - - /* Error cases. */ - if (fvalue < tfrc_calc_x_lookup[0][1]) { - DCCP_WARN("fvalue %u smaller than resolution\n", fvalue); - return TFRC_SMALLEST_P; - } - if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { - DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue); - return 1000000; - } - - if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { - index = tfrc_binsearch(fvalue, 1); - return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; - } - - /* else ... it must be in the coarse-grained column */ - index = tfrc_binsearch(fvalue, 0); - return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; -} - -/** - * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% - * @loss_event_rate: loss event rate to invert - * When @loss_event_rate is large, there is a chance that p is truncated to 0. - * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. - */ -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) -{ - if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ - return 0; - if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ - return 1000000; - return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); -} diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h deleted file mode 100644 index 1f748ed1279d3..0000000000000 --- a/net/dccp/dccp.h +++ /dev/null @@ -1,483 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_H -#define _DCCP_H -/* - * net/dccp/dccp.h - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2005-6 Ian McDonald - */ - -#include -#include -#include -#include -#include -#include "ackvec.h" - -/* - * DCCP - specific warning and debugging macros. - */ -#define DCCP_WARN(fmt, ...) \ - net_warn_ratelimited("%s: " fmt, __func__, ##__VA_ARGS__) -#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \ - __FILE__, __LINE__, __func__) -#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0) -#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \ - DCCP_BUG("\"%s\" holds (exception!)", \ - __stringify(cond)); \ - } while (0) - -#define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \ - printk(fmt, ##args); \ - } while(0) -#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \ - "%s: " fmt, __func__, ##a) - -#ifdef CONFIG_IP_DCCP_DEBUG -extern bool dccp_debug; -#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) -#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) -#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) -#else -#define dccp_pr_debug(format, a...) do {} while (0) -#define dccp_pr_debug_cat(format, a...) do {} while (0) -#define dccp_debug(format, a...) do {} while (0) -#endif - -extern struct inet_hashinfo dccp_hashinfo; - -DECLARE_PER_CPU(unsigned int, dccp_orphan_count); - -void dccp_time_wait(struct sock *sk, int state, int timeo); - -/* - * Set safe upper bounds for header and option length. Since Data Offset is 8 - * bits (RFC 4340, sec. 5.1), the total header length can never be more than - * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1): - * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR - * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields - * Hence a safe upper bound for the maximum option length is 1020-28 = 992 - */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) -#define DCCP_MAX_PACKET_HDR 28 -#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) -#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) - -/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ -#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) - -#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT - * state, about 60 seconds */ - -/* RFC 1122, 4.2.3.1 initial RTO value */ -#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ)) - -/* - * The maximum back-off value for retransmissions. This is needed for - * - retransmitting client-Requests (sec. 8.1.1), - * - retransmitting Close/CloseReq when closing (sec. 8.3), - * - feature-negotiation retransmission (sec. 6.6.3), - * - Acks in client-PARTOPEN state (sec. 8.1.5). - */ -#define DCCP_RTO_MAX ((unsigned int)(64 * HZ)) - -/* - * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 - */ -#define DCCP_SANE_RTT_MIN 100 -#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) -#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) - -/* sysctl variables for DCCP */ -extern int sysctl_dccp_request_retries; -extern int sysctl_dccp_retries1; -extern int sysctl_dccp_retries2; -extern int sysctl_dccp_tx_qlen; -extern int sysctl_dccp_sync_ratelimit; - -/* - * 48-bit sequence number arithmetic (signed and unsigned) - */ -#define INT48_MIN 0x800000000000LL /* 2^47 */ -#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */ -#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */ -#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x))) -#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x))) -#define ADD48(a, b) (((a) + (b)) & UINT48_MAX) -#define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) - -static inline void dccp_inc_seqno(u64 *seqno) -{ - *seqno = ADD48(*seqno, 1); -} - -/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */ -static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2) -{ - u64 delta = SUB48(seqno2, seqno1); - - return TO_SIGNED48(delta); -} - -/* is seq1 < seq2 ? */ -static inline int before48(const u64 seq1, const u64 seq2) -{ - return (s64)((seq2 << 16) - (seq1 << 16)) > 0; -} - -/* is seq1 > seq2 ? */ -#define after48(seq1, seq2) before48(seq2, seq1) - -/* is seq2 <= seq1 <= seq3 ? */ -static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) -{ - return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); -} - -/** - * dccp_loss_count - Approximate the number of lost data packets in a burst loss - * @s1: last known sequence number before the loss ('hole') - * @s2: first sequence number seen after the 'hole' - * @ndp: NDP count on packet with sequence number @s2 - */ -static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp) -{ - s64 delta = dccp_delta_seqno(s1, s2); - - WARN_ON(delta < 0); - delta -= ndp + 1; - - return delta > 0 ? delta : 0; -} - -/** - * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1 - */ -static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) -{ - return dccp_loss_count(s1, s2, ndp) == 0; -} - -enum { - DCCP_MIB_NUM = 0, - DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ - DCCP_MIB_ESTABRESETS, /* EstabResets */ - DCCP_MIB_CURRESTAB, /* CurrEstab */ - DCCP_MIB_OUTSEGS, /* OutSegs */ - DCCP_MIB_OUTRSTS, - DCCP_MIB_ABORTONTIMEOUT, - DCCP_MIB_TIMEOUTS, - DCCP_MIB_ABORTFAILED, - DCCP_MIB_PASSIVEOPENS, - DCCP_MIB_ATTEMPTFAILS, - DCCP_MIB_OUTDATAGRAMS, - DCCP_MIB_INERRS, - DCCP_MIB_OPTMANDATORYERROR, - DCCP_MIB_INVALIDOPT, - __DCCP_MIB_MAX -}; - -#define DCCP_MIB_MAX __DCCP_MIB_MAX -struct dccp_mib { - unsigned long mibs[DCCP_MIB_MAX]; -}; - -DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); -#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) -#define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field) -#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) - -/* - * Checksumming routines - */ -static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb) -{ - const struct dccp_hdr* dh = dccp_hdr(skb); - - if (dh->dccph_cscov == 0) - return skb->len; - return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32); -} - -static inline void dccp_csum_outgoing(struct sk_buff *skb) -{ - unsigned int cov = dccp_csum_coverage(skb); - - if (cov >= skb->len) - dccp_hdr(skb)->dccph_cscov = 0; - - skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0); -} - -void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); - -int dccp_retransmit_skb(struct sock *sk); - -void dccp_send_ack(struct sock *sk); -void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *rsk); - -void dccp_send_sync(struct sock *sk, const u64 seq, - const enum dccp_pkt_type pkt_type); - -/* - * TX Packet Dequeueing Interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); -bool dccp_qpolicy_full(struct sock *sk); -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); -struct sk_buff *dccp_qpolicy_top(struct sock *sk); -struct sk_buff *dccp_qpolicy_pop(struct sock *sk); -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); - -/* - * TX Packet Output and TX Timers - */ -void dccp_write_xmit(struct sock *sk); -void dccp_write_space(struct sock *sk); -void dccp_flush_write_queue(struct sock *sk, long *time_budget); - -void dccp_init_xmit_timers(struct sock *sk); -static inline void dccp_clear_xmit_timers(struct sock *sk) -{ - inet_csk_clear_xmit_timers(sk); -} - -unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); - -const char *dccp_packet_name(const int type); - -void dccp_set_state(struct sock *sk, const int state); -void dccp_done(struct sock *sk); - -int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, - struct sk_buff const *skb); - -int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); - -struct sock *dccp_create_openreq_child(const struct sock *sk, - const struct request_sock *req, - const struct sk_buff *skb); - -int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); - -struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req); -struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req); - -int dccp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb); -int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct dccp_hdr *dh, unsigned int len); -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len); - -void dccp_destruct_common(struct sock *sk); -int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); -void dccp_destroy_sock(struct sock *sk); - -void dccp_close(struct sock *sk, long timeout); -struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, - struct request_sock *req); - -int dccp_connect(struct sock *sk); -int dccp_disconnect(struct sock *sk, int flags); -int dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -int dccp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); -int dccp_ioctl(struct sock *sk, int cmd, int *karg); -int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len); -void dccp_shutdown(struct sock *sk, int how); -int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait); -int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); -void dccp_req_err(struct sock *sk, u64 seq); - -struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb); -int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); -void dccp_send_close(struct sock *sk, const int active); -int dccp_invalid_packet(struct sk_buff *skb); -u32 dccp_sample_rtt(struct sock *sk, long delta); - -static inline bool dccp_bad_service_code(const struct sock *sk, - const __be32 service) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - if (dp->dccps_service == service) - return false; - return !dccp_list_has_service(dp->dccps_service_list, service); -} - -/** - * dccp_skb_cb - DCCP per-packet control information - * @dccpd_type: one of %dccp_pkt_type (or unknown) - * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1 - * @dccpd_reset_code: one of %dccp_reset_codes - * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code) - * @dccpd_opt_len: total length of all options (5.8) in the packet - * @dccpd_seq: sequence number - * @dccpd_ack_seq: acknowledgment number subheader field value - * - * This is used for transmission as well as for reception. - */ -struct dccp_skb_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; - __u8 dccpd_type:4; - __u8 dccpd_ccval:4; - __u8 dccpd_reset_code, - dccpd_reset_data[3]; - __u16 dccpd_opt_len; - __u64 dccpd_seq; - __u64 dccpd_ack_seq; -}; - -#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) - -/* RFC 4340, sec. 7.7 */ -static inline int dccp_non_data_packet(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_ACK || - type == DCCP_PKT_CLOSE || - type == DCCP_PKT_CLOSEREQ || - type == DCCP_PKT_RESET || - type == DCCP_PKT_SYNC || - type == DCCP_PKT_SYNCACK; -} - -/* RFC 4340, sec. 7.7 */ -static inline int dccp_data_packet(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_DATA || - type == DCCP_PKT_DATAACK || - type == DCCP_PKT_REQUEST || - type == DCCP_PKT_RESPONSE; -} - -static inline int dccp_packet_without_ack(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; -} - -#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2) - -static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) -{ - struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + - sizeof(*dh)); - dh->dccph_seq2 = 0; - dh->dccph_seq = htons((gss >> 32) & 0xfffff); - dhx->dccph_seq_low = htonl(gss & 0xffffffff); -} - -static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, - const u64 gsr) -{ - dhack->dccph_reserved1 = 0; - dhack->dccph_ack_nr_high = htons(gsr >> 32); - dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); -} - -static inline void dccp_update_gsr(struct sock *sk, u64 seq) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (after48(seq, dp->dccps_gsr)) - dp->dccps_gsr = seq; - /* Sequence validity window depends on remote Sequence Window (7.5.1) */ - dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); - /* - * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, - * 7.5.1 we perform this check beyond the initial handshake: W/W' are - * always > 32, so for the first W/W' packets in the lifetime of a - * connection we always have to adjust SWL. - * A second reason why we are doing this is that the window depends on - * the feature-remote value of Sequence Window: nothing stops the peer - * from updating this value while we are busy adjusting SWL for the - * first W packets (we would have to count from scratch again then). - * Therefore it is safer to always make sure that the Sequence Window - * is not artificially extended by a peer who grows SWL downwards by - * continually updating the feature-remote Sequence-Window. - * If sequence numbers wrap it is bad luck. But that will take a while - * (48 bit), and this measure prevents Sequence-number attacks. - */ - if (before48(dp->dccps_swl, dp->dccps_isr)) - dp->dccps_swl = dp->dccps_isr; - dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); -} - -static inline void dccp_update_gss(struct sock *sk, u64 seq) -{ - struct dccp_sock *dp = dccp_sk(sk); - - dp->dccps_gss = seq; - /* Ack validity window depends on local Sequence Window value (7.5.1) */ - dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); - /* Adjust AWL so that it is not below ISS - see comment above for SWL */ - if (before48(dp->dccps_awl, dp->dccps_iss)) - dp->dccps_awl = dp->dccps_iss; - dp->dccps_awh = dp->dccps_gss; -} - -static inline int dccp_ackvec_pending(const struct sock *sk) -{ - return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && - !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); -} - -static inline int dccp_ack_pending(const struct sock *sk) -{ - return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); -} - -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); -int dccp_feat_finalise_settings(struct dccp_sock *dp); -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); -int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, - struct sk_buff *skb); -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); -void dccp_feat_list_purge(struct list_head *fn_list); - -int dccp_insert_options(struct sock *sk, struct sk_buff *skb); -int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *); -u32 dccp_timestamp(void); -void dccp_timestamping_init(void); -int dccp_insert_option(struct sk_buff *skb, unsigned char option, - const void *value, unsigned char len); - -#ifdef CONFIG_SYSCTL -int dccp_sysctl_init(void); -void dccp_sysctl_exit(void); -#else -static inline int dccp_sysctl_init(void) -{ - return 0; -} - -static inline void dccp_sysctl_exit(void) -{ -} -#endif - -#endif /* _DCCP_H */ diff --git a/net/dccp/diag.c b/net/dccp/diag.c deleted file mode 100644 index f5019d95c3ae5..0000000000000 --- a/net/dccp/diag.c +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/diag.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - - -#include -#include - -#include "ccid.h" -#include "dccp.h" - -static void dccp_get_info(struct sock *sk, struct tcp_info *info) -{ - struct dccp_sock *dp = dccp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - - memset(info, 0, sizeof(*info)); - - info->tcpi_state = sk->sk_state; - info->tcpi_retransmits = icsk->icsk_retransmits; - info->tcpi_probes = icsk->icsk_probes_out; - info->tcpi_backoff = icsk->icsk_backoff; - info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - - if (dp->dccps_hc_rx_ackvec != NULL) - info->tcpi_options |= TCPI_OPT_SACK; - - if (dp->dccps_hc_rx_ccid != NULL) - ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); - - if (dp->dccps_hc_tx_ccid != NULL) - ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); -} - -static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *_info) -{ - r->idiag_rqueue = r->idiag_wqueue = 0; - - if (_info != NULL) - dccp_get_info(sk, _info); -} - -static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r); -} - -static int dccp_diag_dump_one(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req); -} - -static const struct inet_diag_handler dccp_diag_handler = { - .owner = THIS_MODULE, - .dump = dccp_diag_dump, - .dump_one = dccp_diag_dump_one, - .idiag_get_info = dccp_diag_get_info, - .idiag_type = IPPROTO_DCCP, - .idiag_info_size = sizeof(struct tcp_info), -}; - -static int __init dccp_diag_init(void) -{ - return inet_diag_register(&dccp_diag_handler); -} - -static void __exit dccp_diag_fini(void) -{ - inet_diag_unregister(&dccp_diag_handler); -} - -module_init(dccp_diag_init); -module_exit(dccp_diag_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo "); -MODULE_DESCRIPTION("DCCP inet_diag handler"); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */); diff --git a/net/dccp/feat.c b/net/dccp/feat.c deleted file mode 100644 index f7554dcdaaba9..0000000000000 --- a/net/dccp/feat.c +++ /dev/null @@ -1,1581 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/feat.c - * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * - * Copyright (c) 2008 Gerrit Renker - * Rewrote from scratch, some bits from earlier code by - * Copyright (c) 2005 Andrea Bittau - * - * ASSUMPTIONS - * ----------- - * o Feature negotiation is coordinated with connection setup (as in TCP), wild - * changes of parameters of an established connection are not supported. - * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN. - * o All currently known SP features have 1-byte quantities. If in the future - * extensions of RFCs 4340..42 define features with item lengths larger than - * one byte, a feature-specific extension of the code will be required. - */ -#include -#include -#include "ccid.h" -#include "feat.h" - -/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ -unsigned long sysctl_dccp_sequence_window __read_mostly = 100; -int sysctl_dccp_rx_ccid __read_mostly = 2, - sysctl_dccp_tx_ccid __read_mostly = 2; - -/* - * Feature activation handlers. - * - * These all use an u64 argument, to provide enough room for NN/SP features. At - * this stage the negotiated values have been checked to be within their range. - */ -static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid *new_ccid = ccid_new(ccid, sk, rx); - - if (new_ccid == NULL) - return -ENOMEM; - - if (rx) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = new_ccid; - } else { - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = new_ccid; - } - return 0; -} - -static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) { - dp->dccps_r_seq_win = seq_win; - /* propagate changes to update SWL/SWH */ - dccp_update_gsr(sk, dp->dccps_gsr); - } else { - dp->dccps_l_seq_win = seq_win; - /* propagate changes to update AWL */ - dccp_update_gss(sk, dp->dccps_gss); - } - return 0; -} - -static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) -{ - if (rx) - dccp_sk(sk)->dccps_r_ack_ratio = ratio; - else - dccp_sk(sk)->dccps_l_ack_ratio = ratio; - return 0; -} - -static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) { - if (enable && dp->dccps_hc_rx_ackvec == NULL) { - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); - if (dp->dccps_hc_rx_ackvec == NULL) - return -ENOMEM; - } else if (!enable) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - } - return 0; -} - -static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) -{ - if (!rx) - dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); - return 0; -} - -/* - * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that - * `rx' holds when the sending peer informs about his partial coverage via a - * ChangeR() option. In the other case, we are the sender and the receiver - * announces its coverage via ChangeL() options. The policy here is to honour - * such communication by enabling the corresponding partial coverage - but only - * if it has not been set manually before; the warning here means that all - * packets will be dropped. - */ -static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) - dp->dccps_pcrlen = cscov; - else { - if (dp->dccps_pcslen == 0) - dp->dccps_pcslen = cscov; - else if (cscov > dp->dccps_pcslen) - DCCP_WARN("CsCov %u too small, peer requires >= %u\n", - dp->dccps_pcslen, (u8)cscov); - } - return 0; -} - -static const struct { - u8 feat_num; /* DCCPF_xxx */ - enum dccp_feat_type rxtx; /* RX or TX */ - enum dccp_feat_type reconciliation; /* SP or NN */ - u8 default_value; /* as in 6.4 */ - int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); -/* - * Lookup table for location and type of features (from RFC 4340/4342) - * +--------------------------+----+-----+----+----+---------+-----------+ - * | Feature | Location | Reconc. | Initial | Section | - * | | RX | TX | SP | NN | Value | Reference | - * +--------------------------+----+-----+----+----+---------+-----------+ - * | DCCPF_CCID | | X | X | | 2 | 10 | - * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | - * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | - * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | - * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | - * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | - * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | - * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | - * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | - * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | - * +--------------------------+----+-----+----+----+---------+-----------+ - */ -} dccp_feat_table[] = { - { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, - { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, - { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, - { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, - { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, - { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, - { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, - { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, -}; -#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) - -/** - * dccp_feat_index - Hash function to map feature number into array position - * @feat_num: feature to hash, one of %dccp_feature_numbers - * - * Returns consecutive array index or -1 if the feature is not understood. - */ -static int dccp_feat_index(u8 feat_num) -{ - /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ - if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) - return feat_num - 1; - - /* - * Other features: add cases for new feature types here after adding - * them to the above table. - */ - switch (feat_num) { - case DCCPF_SEND_LEV_RATE: - return DCCP_FEAT_SUPPORTED_MAX - 1; - } - return -1; -} - -static u8 dccp_feat_type(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - - if (idx < 0) - return FEAT_UNKNOWN; - return dccp_feat_table[idx].reconciliation; -} - -static int dccp_feat_default_value(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - /* - * There are no default values for unknown features, so encountering a - * negative index here indicates a serious problem somewhere else. - */ - DCCP_BUG_ON(idx < 0); - - return idx < 0 ? 0 : dccp_feat_table[idx].default_value; -} - -/* - * Debugging and verbose-printing section - */ -static const char *dccp_feat_fname(const u8 feat) -{ - static const char *const feature_names[] = { - [DCCPF_RESERVED] = "Reserved", - [DCCPF_CCID] = "CCID", - [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", - [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", - [DCCPF_ECN_INCAPABLE] = "ECN Incapable", - [DCCPF_ACK_RATIO] = "Ack Ratio", - [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", - [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", - [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", - [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", - }; - if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) - return feature_names[DCCPF_RESERVED]; - - if (feat == DCCPF_SEND_LEV_RATE) - return "Send Loss Event Rate"; - if (feat >= DCCPF_MIN_CCID_SPECIFIC) - return "CCID-specific"; - - return feature_names[feat]; -} - -static const char *const dccp_feat_sname[] = { - "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE", -}; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_feat_oname(const u8 opt) -{ - switch (opt) { - case DCCPO_CHANGE_L: return "Change_L"; - case DCCPO_CONFIRM_L: return "Confirm_L"; - case DCCPO_CHANGE_R: return "Change_R"; - case DCCPO_CONFIRM_R: return "Confirm_R"; - } - return NULL; -} - -static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) -{ - u8 i, type = dccp_feat_type(feat_num); - - if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) - dccp_pr_debug_cat("(NULL)"); - else if (type == FEAT_SP) - for (i = 0; i < val->sp.len; i++) - dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); - else if (type == FEAT_NN) - dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); - else - dccp_pr_debug_cat("unknown type %u", type); -} - -static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) -{ - u8 type = dccp_feat_type(feat_num); - dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; - - if (type == FEAT_NN) - fval.nn = dccp_decode_value_var(list, len); - dccp_feat_printval(feat_num, &fval); -} - -static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) -{ - dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", - dccp_feat_fname(entry->feat_num)); - dccp_feat_printval(entry->feat_num, &entry->val); - dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], - entry->needs_confirm ? "(Confirm pending)" : ""); -} - -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ - dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ - dccp_feat_printvals(feat, val, len); \ - dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) - -#define dccp_feat_print_fnlist(fn_list) { \ - const struct dccp_feat_entry *___entry; \ - \ - dccp_pr_debug("List Dump:\n"); \ - list_for_each_entry(___entry, fn_list, node) \ - dccp_feat_print_entry(___entry); \ -} -#else /* ! CONFIG_IP_DCCP_DEBUG */ -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) -#define dccp_feat_print_fnlist(fn_list) -#endif - -static int __dccp_feat_activate(struct sock *sk, const int idx, - const bool is_local, dccp_feat_val const *fval) -{ - bool rx; - u64 val; - - if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) - return -1; - if (dccp_feat_table[idx].activation_hdlr == NULL) - return 0; - - if (fval == NULL) { - val = dccp_feat_table[idx].default_value; - } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { - if (fval->sp.vec == NULL) { - /* - * This can happen when an empty Confirm is sent - * for an SP (i.e. known) feature. In this case - * we would be using the default anyway. - */ - DCCP_CRIT("Feature #%d undefined: using default", idx); - val = dccp_feat_table[idx].default_value; - } else { - val = fval->sp.vec[0]; - } - } else { - val = fval->nn; - } - - /* Location is RX if this is a local-RX or remote-TX feature */ - rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); - - dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", - dccp_feat_fname(dccp_feat_table[idx].feat_num), - fval ? "" : "default ", (unsigned long long)val); - - return dccp_feat_table[idx].activation_hdlr(sk, val, rx); -} - -/** - * dccp_feat_activate - Activate feature value on socket - * @sk: fully connected DCCP socket (after handshake is complete) - * @feat_num: feature to activate, one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @fval: the value (SP or NN) to activate, or NULL to use the default value - * - * For general use this function is preferable over __dccp_feat_activate(). - */ -static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, - dccp_feat_val const *fval) -{ - return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); -} - -/* Test for "Req'd" feature (RFC 4340, 6.4) */ -static inline int dccp_feat_must_be_understood(u8 feat_num) -{ - return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || - feat_num == DCCPF_SEQUENCE_WINDOW; -} - -/* copy constructor, fval must not already contain allocated memory */ -static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) -{ - fval->sp.len = len; - if (fval->sp.len > 0) { - fval->sp.vec = kmemdup(val, len, gfp_any()); - if (fval->sp.vec == NULL) { - fval->sp.len = 0; - return -ENOMEM; - } - } - return 0; -} - -static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) -{ - if (unlikely(val == NULL)) - return; - if (dccp_feat_type(feat_num) == FEAT_SP) - kfree(val->sp.vec); - memset(val, 0, sizeof(*val)); -} - -static struct dccp_feat_entry * - dccp_feat_clone_entry(struct dccp_feat_entry const *original) -{ - struct dccp_feat_entry *new; - u8 type = dccp_feat_type(original->feat_num); - - if (type == FEAT_UNKNOWN) - return NULL; - - new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); - if (new == NULL) - return NULL; - - if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, - original->val.sp.vec, - original->val.sp.len)) { - kfree(new); - return NULL; - } - return new; -} - -static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) -{ - if (entry != NULL) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - kfree(entry); - } -} - -/* - * List management functions - * - * Feature negotiation lists rely on and maintain the following invariants: - * - each feat_num in the list is known, i.e. we know its type and default value - * - each feat_num/is_local combination is unique (old entries are overwritten) - * - SP values are always freshly allocated - * - list is sorted in increasing order of feature number (faster lookup) - */ -static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, - u8 feat_num, bool is_local) -{ - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, fn_list, node) { - if (entry->feat_num == feat_num && entry->is_local == is_local) - return entry; - else if (entry->feat_num > feat_num) - break; - } - return NULL; -} - -/** - * dccp_feat_entry_new - Central list update routine (called by all others) - * @head: list to add to - * @feat: feature number - * @local: whether the local (1) or remote feature with number @feat is meant - * - * This is the only constructor and serves to ensure the above invariants. - */ -static struct dccp_feat_entry * - dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) -{ - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, head, node) - if (entry->feat_num == feat && entry->is_local == local) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - return entry; - } else if (entry->feat_num > feat) { - head = &entry->node; - break; - } - - entry = kmalloc(sizeof(*entry), gfp_any()); - if (entry != NULL) { - entry->feat_num = feat; - entry->is_local = local; - list_add_tail(&entry->node, head); - } - return entry; -} - -/** - * dccp_feat_push_change - Add/overwrite a Change option in the list - * @fn_list: feature-negotiation list to update - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @mandatory: whether to use Mandatory feature negotiation options - * @fval: pointer to NN/SP value to be inserted (will be copied) - */ -static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, - u8 mandatory, dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - - if (new == NULL) - return -ENOMEM; - - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_INITIALISING; - new->needs_confirm = false; - new->empty_confirm = false; - new->val = *fval; - new->needs_mandatory = mandatory; - - return 0; -} - -/** - * dccp_feat_push_confirm - Add a Confirm entry to the FN list - * @fn_list: feature-negotiation list to add to - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is being confirmed - * @fval: pointer to NN/SP value to be inserted or NULL - * - * Returns 0 on success, a Reset code for further processing otherwise. - */ -static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, - dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - - if (new == NULL) - return DCCP_RESET_CODE_TOO_BUSY; - - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_STABLE; /* transition in 6.6.2 */ - new->needs_confirm = true; - new->empty_confirm = (fval == NULL); - new->val.nn = 0; /* zeroes the whole structure */ - if (!new->empty_confirm) - new->val = *fval; - new->needs_mandatory = false; - - return 0; -} - -static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) -{ - return dccp_feat_push_confirm(fn_list, feat, local, NULL); -} - -static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) -{ - list_del(&entry->node); - dccp_feat_entry_destructor(entry); -} - -void dccp_feat_list_purge(struct list_head *fn_list) -{ - struct dccp_feat_entry *entry, *next; - - list_for_each_entry_safe(entry, next, fn_list, node) - dccp_feat_entry_destructor(entry); - INIT_LIST_HEAD(fn_list); -} -EXPORT_SYMBOL_GPL(dccp_feat_list_purge); - -/* generate @to as full clone of @from - @to must not contain any nodes */ -int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) -{ - struct dccp_feat_entry *entry, *new; - - INIT_LIST_HEAD(to); - list_for_each_entry(entry, from, node) { - new = dccp_feat_clone_entry(entry); - if (new == NULL) - goto cloning_failed; - list_add_tail(&new->node, to); - } - return 0; - -cloning_failed: - dccp_feat_list_purge(to); - return -ENOMEM; -} - -/** - * dccp_feat_valid_nn_length - Enforce length constraints on NN options - * @feat_num: feature to return length of, one of %dccp_feature_numbers - * - * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, - * incoming options are accepted as long as their values are valid. - */ -static u8 dccp_feat_valid_nn_length(u8 feat_num) -{ - if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ - return 2; - if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ - return 6; - return 0; -} - -static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) -{ - switch (feat_num) { - case DCCPF_ACK_RATIO: - return val <= DCCPF_ACK_RATIO_MAX; - case DCCPF_SEQUENCE_WINDOW: - return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; - } - return 0; /* feature unknown - so we can't tell */ -} - -/* check that SP values are within the ranges defined in RFC 4340 */ -static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) -{ - switch (feat_num) { - case DCCPF_CCID: - return val == DCCPC_CCID2 || val == DCCPC_CCID3; - /* Type-check Boolean feature values: */ - case DCCPF_SHORT_SEQNOS: - case DCCPF_ECN_INCAPABLE: - case DCCPF_SEND_ACK_VECTOR: - case DCCPF_SEND_NDP_COUNT: - case DCCPF_DATA_CHECKSUM: - case DCCPF_SEND_LEV_RATE: - return val < 2; - case DCCPF_MIN_CSUM_COVER: - return val < 16; - } - return 0; /* feature unknown */ -} - -static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) -{ - if (sp_list == NULL || sp_len < 1) - return 0; - while (sp_len--) - if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) - return 0; - return 1; -} - -/** - * dccp_feat_insert_opts - Generate FN options from current list state - * @skb: next sk_buff to be sent to the peer - * @dp: for client during handshake and general negotiation - * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) - */ -int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - struct dccp_feat_entry *pos, *next; - u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; - bool rpt; - - /* put entries into @skb in the order they appear in the list */ - list_for_each_entry_safe_reverse(pos, next, fn, node) { - opt = dccp_feat_genopt(pos); - type = dccp_feat_type(pos->feat_num); - rpt = false; - - if (pos->empty_confirm) { - len = 0; - ptr = NULL; - } else { - if (type == FEAT_SP) { - len = pos->val.sp.len; - ptr = pos->val.sp.vec; - rpt = pos->needs_confirm; - } else if (type == FEAT_NN) { - len = dccp_feat_valid_nn_length(pos->feat_num); - ptr = nn_in_nbo; - dccp_encode_value_var(pos->val.nn, ptr, len); - } else { - DCCP_BUG("unknown feature %u", pos->feat_num); - return -1; - } - } - dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); - - if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) - return -1; - if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) - return -1; - - if (skb->sk->sk_state == DCCP_OPEN && - (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) { - /* - * Confirms don't get retransmitted (6.6.3) once the - * connection is in state OPEN - */ - dccp_feat_list_pop(pos); - } else { - /* - * Enter CHANGING after transmitting the Change - * option (6.6.2). - */ - if (pos->state == FEAT_INITIALISING) - pos->state = FEAT_CHANGING; - } - } - return 0; -} - -/** - * __feat_register_nn - Register new NN value on socket - * @fn: feature-negotiation list to register with - * @feat: an NN feature from %dccp_feature_numbers - * @mandatory: use Mandatory option if 1 - * @nn_val: value to register (restricted to 4 bytes) - * - * Note that NN features are local by definition (RFC 4340, 6.3.2). - */ -static int __feat_register_nn(struct list_head *fn, u8 feat, - u8 mandatory, u64 nn_val) -{ - dccp_feat_val fval = { .nn = nn_val }; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - /* Don't bother with default values, they will be activated anyway. */ - if (nn_val - (u64)dccp_feat_default_value(feat) == 0) - return 0; - - return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); -} - -/** - * __feat_register_sp - Register new SP value/list on socket - * @fn: feature-negotiation list to register with - * @feat: an SP feature from %dccp_feature_numbers - * @is_local: whether the local (1) or the remote (0) @feat is meant - * @mandatory: use Mandatory option if 1 - * @sp_val: SP value followed by optional preference list - * @sp_len: length of @sp_val in bytes - */ -static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, - u8 mandatory, u8 const *sp_val, u8 sp_len) -{ - dccp_feat_val fval; - - if (dccp_feat_type(feat) != FEAT_SP || - !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) - return -EINVAL; - - /* Avoid negotiating alien CCIDs by only advertising supported ones */ - if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) - return -EOPNOTSUPP; - - if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) - return -ENOMEM; - - if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) { - kfree(fval.sp.vec); - return -ENOMEM; - } - - return 0; -} - -/** - * dccp_feat_register_sp - Register requests to change SP feature values - * @sk: client or listening socket - * @feat: one of %dccp_feature_numbers - * @is_local: whether the local (1) or remote (0) @feat is meant - * @list: array of preferred values, in descending order of preference - * @len: length of @list in bytes - */ -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len) -{ /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_SP) - return -EINVAL; - return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, - 0, list, len); -} - -/** - * dccp_feat_nn_get - Query current/pending value of NN feature - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * - * For a known NN feature, returns value currently being negotiated, or - * current (confirmed) value if no negotiation is going on. - */ -u64 dccp_feat_nn_get(struct sock *sk, u8 feat) -{ - if (dccp_feat_type(feat) == FEAT_NN) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *entry; - - entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1); - if (entry != NULL) - return entry->val.nn; - - switch (feat) { - case DCCPF_ACK_RATIO: - return dp->dccps_l_ack_ratio; - case DCCPF_SEQUENCE_WINDOW: - return dp->dccps_l_seq_win; - } - } - DCCP_BUG("attempt to look up unsupported feature %u", feat); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_feat_nn_get); - -/** - * dccp_feat_signal_nn_change - Update NN values for an established connection - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * @nn_val: the new value to use - * - * This function is used to communicate NN updates out-of-band. - */ -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - dccp_feat_val fval = { .nn = nn_val }; - struct dccp_feat_entry *entry; - - if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) - return 0; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - if (nn_val == dccp_feat_nn_get(sk, feat)) - return 0; /* already set or negotiation under way */ - - entry = dccp_feat_list_lookup(fn, feat, 1); - if (entry != NULL) { - dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n", - (unsigned long long)entry->val.nn, - (unsigned long long)nn_val); - dccp_feat_list_pop(entry); - } - - inet_csk_schedule_ack(sk); - return dccp_feat_push_change(fn, feat, 1, 0, &fval); -} -EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); - -/* - * Tracking features whose value depend on the choice of CCID - * - * This is designed with an extension in mind so that a list walk could be done - * before activating any features. However, the existing framework was found to - * work satisfactorily up until now, the automatic verification is left open. - * When adding new CCIDs, add a corresponding dependency table here. - */ -static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) -{ - static const struct ccid_dependency ccid2_dependencies[2][2] = { - /* - * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX - * feature and Send Ack Vector is an RX feature, `is_local' - * needs to be reversed. - */ - { /* Dependencies of the receiver-side (remote) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - }, - { /* Dependencies of the sender-side (local) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - static const struct ccid_dependency ccid3_dependencies[2][5] = { - { /* - * Dependencies of the receiver-side CCID3 - */ - { /* locally disable Ack Vectors */ - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* see below why Send Loss Event Rate is on */ - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { /* NDP Count is needed as per RFC 4342, 6.1.1 */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 }, - }, - { /* - * CCID3 at the TX side: we request that the HC-receiver - * will not send Ack Vectors (they will be ignored, so - * Mandatory is not set); we enable Send Loss Event Rate - * (Mandatory since the implementation does not support - * the Loss Intervals option of RFC 4342, 8.6). - * The last two options are for peer's information only. - */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = false, - .val = 0 - }, - { - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { /* this CCID does not support Ack Ratio */ - .dependent_feat = DCCPF_ACK_RATIO, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* tell receiver we are sending NDP counts */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = true, - .is_mandatory = false, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - switch (ccid) { - case DCCPC_CCID2: - return ccid2_dependencies[is_local]; - case DCCPC_CCID3: - return ccid3_dependencies[is_local]; - default: - return NULL; - } -} - -/** - * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID - * @fn: feature-negotiation list to update - * @id: CCID number to track - * @is_local: whether TX CCID (1) or RX CCID (0) is meant - * - * This function needs to be called after registering all other features. - */ -static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) -{ - const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); - int i, rc = (table == NULL); - - for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) - if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) - rc = __feat_register_sp(fn, table[i].dependent_feat, - table[i].is_local, - table[i].is_mandatory, - &table[i].val, 1); - else - rc = __feat_register_nn(fn, table[i].dependent_feat, - table[i].is_mandatory, - table[i].val); - return rc; -} - -/** - * dccp_feat_finalise_settings - Finalise settings before starting negotiation - * @dp: client or listening socket (settings will be inherited) - * - * This is called after all registrations (socket initialisation, sysctls, and - * sockopt calls), and before sending the first packet containing Change options - * (ie. client-Request or server-Response), to ensure internal consistency. - */ -int dccp_feat_finalise_settings(struct dccp_sock *dp) -{ - struct list_head *fn = &dp->dccps_featneg; - struct dccp_feat_entry *entry; - int i = 2, ccids[2] = { -1, -1 }; - - /* - * Propagating CCIDs: - * 1) not useful to propagate CCID settings if this host advertises more - * than one CCID: the choice of CCID may still change - if this is - * the client, or if this is the server and the client sends - * singleton CCID values. - * 2) since is that propagate_ccid changes the list, we defer changing - * the sorted list until after the traversal. - */ - list_for_each_entry(entry, fn, node) - if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) - ccids[entry->is_local] = entry->val.sp.vec[0]; - while (i--) - if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) - return -1; - dccp_feat_print_fnlist(fn); - return 0; -} - -/** - * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features - * @dreq: server socket to resolve - * - * It is the server which resolves the dependencies once the CCID has been - * fully negotiated. If no CCID has been negotiated, it uses the default CCID. - */ -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) -{ - struct list_head *fn = &dreq->dreq_featneg; - struct dccp_feat_entry *entry; - u8 is_local, ccid; - - for (is_local = 0; is_local <= 1; is_local++) { - entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); - - if (entry != NULL && !entry->empty_confirm) - ccid = entry->val.sp.vec[0]; - else - ccid = dccp_feat_default_value(DCCPF_CCID); - - if (dccp_feat_propagate_ccid(fn, ccid, is_local)) - return -1; - } - return 0; -} - -/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ -static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) -{ - u8 c, s; - - for (s = 0; s < slen; s++) - for (c = 0; c < clen; c++) - if (servlist[s] == clilist[c]) - return servlist[s]; - return -1; -} - -/** - * dccp_feat_prefer - Move preferred entry to the start of array - * @preferred_value: entry to move to start of array - * @array: array of preferred entries - * @array_len: size of the array - * - * Reorder the @array_len elements in @array so that @preferred_value comes - * first. Returns >0 to indicate that @preferred_value does occur in @array. - */ -static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) -{ - u8 i, does_occur = 0; - - if (array != NULL) { - for (i = 0; i < array_len; i++) - if (array[i] == preferred_value) { - array[i] = array[0]; - does_occur++; - } - if (does_occur) - array[0] = preferred_value; - } - return does_occur; -} - -/** - * dccp_feat_reconcile - Reconcile SP preference lists - * @fv: SP list to reconcile into - * @arr: received SP preference list - * @len: length of @arr in bytes - * @is_server: whether this side is the server (and @fv is the server's list) - * @reorder: whether to reorder the list in @fv after reconciling with @arr - * When successful, > 0 is returned and the reconciled list is in @fval. - * A value of 0 means that negotiation failed (no shared entry). - */ -static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, - bool is_server, bool reorder) -{ - int rc; - - if (!fv->sp.vec || !arr) { - DCCP_CRIT("NULL feature value or array"); - return 0; - } - - if (is_server) - rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); - else - rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); - - if (!reorder) - return rc; - if (rc < 0) - return 0; - - /* - * Reorder list: used for activating features and in dccp_insert_fn_opt. - */ - return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); -} - -/** - * dccp_feat_change_recv - Process incoming ChangeL/R options - * @fn: feature-negotiation list to update - * @is_mandatory: whether the Change was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is the server (1) or the client (0) - */ -static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 defval, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CHANGE_R); - struct dccp_feat_entry *entry; - dccp_feat_val fval; - - if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ - goto unknown_feature_or_value; - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - /* - * Negotiation of NN features: Change R is invalid, so there is no - * simultaneous negotiation; hence we do not look up in the list. - */ - if (type == FEAT_NN) { - if (local || len > sizeof(fval.nn)) - goto unknown_feature_or_value; - - /* 6.3.2: "The feature remote MUST accept any valid value..." */ - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto unknown_feature_or_value; - - return dccp_feat_push_confirm(fn, feat, local, &fval); - } - - /* - * Unidirectional/simultaneous negotiation of SP features (6.3.1) - */ - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL) { - /* - * No particular preferences have been registered. We deal with - * this situation by assuming that all valid values are equally - * acceptable, and apply the following checks: - * - if the peer's list is a singleton, we accept a valid value; - * - if we are the server, we first try to see if the peer (the - * client) advertises the default value. If yes, we use it, - * otherwise we accept the preferred value; - * - else if we are the client, we use the first list element. - */ - if (dccp_feat_clone_sp_val(&fval, val, 1)) - return DCCP_RESET_CODE_TOO_BUSY; - - if (len > 1 && server) { - defval = dccp_feat_default_value(feat); - if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) - fval.sp.vec[0] = defval; - } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { - kfree(fval.sp.vec); - goto unknown_feature_or_value; - } - - /* Treat unsupported CCIDs like invalid values */ - if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { - kfree(fval.sp.vec); - goto not_valid_or_not_known; - } - - if (dccp_feat_push_confirm(fn, feat, local, &fval)) { - kfree(fval.sp.vec); - return DCCP_RESET_CODE_TOO_BUSY; - } - - return 0; - } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ - return 0; - } - - if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { - entry->empty_confirm = false; - } else if (is_mandatory) { - return DCCP_RESET_CODE_MANDATORY_ERROR; - } else if (entry->state == FEAT_INITIALISING) { - /* - * Failed simultaneous negotiation (server only): try to `save' - * the connection by checking whether entry contains the default - * value for @feat. If yes, send an empty Confirm to signal that - * the received Change was not understood - which implies using - * the default value. - * If this also fails, we use Reset as the last resort. - */ - WARN_ON(!server); - defval = dccp_feat_default_value(feat); - if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) - return DCCP_RESET_CODE_OPTION_ERROR; - entry->empty_confirm = true; - } - entry->needs_confirm = true; - entry->needs_mandatory = false; - entry->state = FEAT_STABLE; - return 0; - -unknown_feature_or_value: - if (!is_mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -not_valid_or_not_known: - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_confirm_recv - Process received Confirm options - * @fn: feature-negotiation list to update - * @is_mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is server (1) or client (0) - */ -static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 *plist, plen, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - if (entry == NULL) { /* nothing queued: ignore or handle error */ - if (is_mandatory && type == FEAT_UNKNOWN) - return DCCP_RESET_CODE_MANDATORY_ERROR; - - if (!local && type == FEAT_NN) /* 6.3.2 */ - goto confirmation_failed; - return 0; - } - - if (entry->state != FEAT_CHANGING) /* 6.6.2 */ - return 0; - - if (len == 0) { - if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ - goto confirmation_failed; - /* - * Empty Confirm during connection setup: this means reverting - * to the `old' value, which in this case is the default. Since - * we handle default values automatically when no other values - * have been set, we revert to the old value by removing this - * entry from the list. - */ - dccp_feat_list_pop(entry); - return 0; - } - - if (type == FEAT_NN) { - if (len > sizeof(entry->val.nn)) - goto confirmation_failed; - - if (entry->val.nn == dccp_decode_value_var(val, len)) - goto confirmation_succeeded; - - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto confirmation_failed; - } - - /* - * Parsing SP Confirms: the first element of @val is the preferred - * SP value which the peer confirms, the remainder depends on @len. - * Note that only the confirmed value need to be a valid SP value. - */ - if (!dccp_feat_is_valid_sp_val(feat, *val)) - goto confirmation_failed; - - if (len == 1) { /* peer didn't supply a preference list */ - plist = val; - plen = len; - } else { /* preferred value + preference list */ - plist = val + 1; - plen = len - 1; - } - - /* Check whether the peer got the reconciliation right (6.6.8) */ - if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { - DCCP_WARN("Confirm selected the wrong value %u\n", *val); - return DCCP_RESET_CODE_OPTION_ERROR; - } - entry->val.sp.vec[0] = *val; - -confirmation_succeeded: - entry->state = FEAT_STABLE; - return 0; - -confirmation_failed: - DCCP_WARN("Confirmation failed\n"); - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_handle_nn_established - Fast-path reception of NN options - * @sk: socket of an established DCCP connection - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) - * @feat: NN number, one of %dccp_feature_numbers - * @val: NN value - * @len: length of @val in bytes - * - * This function combines the functionality of change_recv/confirm_recv, with - * the following differences (reset codes are the same): - * - cleanup after receiving the Confirm; - * - values are directly activated after successful parsing; - * - deliberately restricted to NN features. - * The restriction to NN features is essential since SP features can have non- - * predictable outcomes (depending on the remote configuration), and are inter- - * dependent (CCIDs for instance cause further dependencies). - */ -static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, - u8 feat, u8 *val, u8 len) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry; - u8 type = dccp_feat_type(feat); - dccp_feat_val fval; - - dccp_feat_print_opt(opt, feat, val, len, mandatory); - - /* Ignore non-mandatory unknown and non-NN features */ - if (type == FEAT_UNKNOWN) { - if (local && !mandatory) - return 0; - goto fast_path_unknown; - } else if (type != FEAT_NN) { - return 0; - } - - /* - * We don't accept empty Confirms, since in fast-path feature - * negotiation the values are enabled immediately after sending - * the Change option. - * Empty Changes on the other hand are invalid (RFC 4340, 6.1). - */ - if (len == 0 || len > sizeof(fval.nn)) - goto fast_path_unknown; - - if (opt == DCCPO_CHANGE_L) { - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto fast_path_unknown; - - if (dccp_feat_push_confirm(fn, feat, local, &fval) || - dccp_feat_activate(sk, feat, local, &fval)) - return DCCP_RESET_CODE_TOO_BUSY; - - /* set the `Ack Pending' flag to piggyback a Confirm */ - inet_csk_schedule_ack(sk); - - } else if (opt == DCCPO_CONFIRM_R) { - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL || entry->state != FEAT_CHANGING) - return 0; - - fval.nn = dccp_decode_value_var(val, len); - /* - * Just ignore a value that doesn't match our current value. - * If the option changes twice within two RTTs, then at least - * one CONFIRM will be received for the old value after a - * new CHANGE was sent. - */ - if (fval.nn != entry->val.nn) - return 0; - - /* Only activate after receiving the Confirm option (6.6.1). */ - dccp_feat_activate(sk, feat, local, &fval); - - /* It has been confirmed - so remove the entry */ - dccp_feat_list_pop(entry); - - } else { - DCCP_WARN("Received illegal option %u\n", opt); - goto fast_path_failed; - } - return 0; - -fast_path_unknown: - if (!mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -fast_path_failed: - return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_parse_options - Process Feature-Negotiation Options - * @sk: for general use and used by the client during connection setup - * @dreq: used by the server during connection setup - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: value contents of @opt - * @len: length of @val in bytes - * - * Returns 0 on success, a Reset code for ending the connection otherwise. - */ -int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - bool server = false; - - switch (sk->sk_state) { - /* - * Negotiation during connection setup - */ - case DCCP_LISTEN: - server = true; - fallthrough; - case DCCP_REQUESTING: - switch (opt) { - case DCCPO_CHANGE_L: - case DCCPO_CHANGE_R: - return dccp_feat_change_recv(fn, mandatory, opt, feat, - val, len, server); - case DCCPO_CONFIRM_R: - case DCCPO_CONFIRM_L: - return dccp_feat_confirm_recv(fn, mandatory, opt, feat, - val, len, server); - } - break; - /* - * Support for exchanging NN options on an established connection. - */ - case DCCP_OPEN: - case DCCP_PARTOPEN: - return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, - val, len); - } - return 0; /* ignore FN options in all other states */ -} - -/** - * dccp_feat_init - Seed feature negotiation with host-specific defaults - * @sk: Socket to initialize. - * - * This initialises global defaults, depending on the value of the sysctls. - * These can later be overridden by registering changes via setsockopt calls. - * The last link in the chain is finalise_settings, to make sure that between - * here and the start of actual feature negotiation no inconsistencies enter. - * - * All features not appearing below use either defaults or are otherwise - * later adjusted through dccp_feat_finalise_settings(). - */ -int dccp_feat_init(struct sock *sk) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - u8 on = 1, off = 0; - int rc; - struct { - u8 *val; - u8 len; - } tx, rx; - - /* Non-negotiable (NN) features */ - rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_sequence_window); - if (rc) - return rc; - - /* Server-priority (SP) features */ - - /* Advertise that short seqnos are not supported (7.6.1) */ - rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); - if (rc) - return rc; - - /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ - rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); - if (rc) - return rc; - - /* - * We advertise the available list of CCIDs and reorder according to - * preferences, to avoid failure resulting from negotiating different - * singleton values (which always leads to failure). - * These settings can still (later) be overridden via sockopts. - */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len)) - return -ENOBUFS; - if (ccid_get_builtin_ccids(&rx.val, &rx.len)) { - kfree(tx.val); - return -ENOBUFS; - } - - if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) - goto free_ccid_lists; - - rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); - if (rc) - goto free_ccid_lists; - - rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); - -free_ccid_lists: - kfree(tx.val); - kfree(rx.val); - return rc; -} - -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *cur, *next; - int idx; - dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { - [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } - }; - - list_for_each_entry(cur, fn_list, node) { - /* - * An empty Confirm means that either an unknown feature type - * or an invalid value was present. In the first case there is - * nothing to activate, in the other the default value is used. - */ - if (cur->empty_confirm) - continue; - - idx = dccp_feat_index(cur->feat_num); - if (idx < 0) { - DCCP_BUG("Unknown feature %u", cur->feat_num); - goto activation_failed; - } - if (cur->state != FEAT_STABLE) { - DCCP_CRIT("Negotiation of %s %s failed in state %s", - cur->is_local ? "local" : "remote", - dccp_feat_fname(cur->feat_num), - dccp_feat_sname[cur->state]); - goto activation_failed; - } - fvals[idx][cur->is_local] = &cur->val; - } - - /* - * Activate in decreasing order of index, so that the CCIDs are always - * activated as the last feature. This avoids the case where a CCID - * relies on the initialisation of one or more features that it depends - * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). - */ - for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) - if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || - __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { - DCCP_CRIT("Could not activate %d", idx); - goto activation_failed; - } - - /* Clean up Change options which have been confirmed already */ - list_for_each_entry_safe(cur, next, fn_list, node) - if (!cur->needs_confirm) - dccp_feat_list_pop(cur); - - dccp_pr_debug("Activation OK\n"); - return 0; - -activation_failed: - /* - * We clean up everything that may have been allocated, since - * it is difficult to track at which stage negotiation failed. - * This is ok, since all allocation functions below are robust - * against NULL arguments. - */ - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - return -1; -} diff --git a/net/dccp/feat.h b/net/dccp/feat.h deleted file mode 100644 index 57d9c026aa3f5..0000000000000 --- a/net/dccp/feat.h +++ /dev/null @@ -1,133 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_FEAT_H -#define _DCCP_FEAT_H -/* - * net/dccp/feat.h - * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * Copyright (c) 2008 Gerrit Renker - * Copyright (c) 2005 Andrea Bittau - */ -#include -#include "dccp.h" - -/* - * Known limit values - */ -/* Ack Ratio takes 2-byte integer values (11.3) */ -#define DCCPF_ACK_RATIO_MAX 0xFFFF -/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ -#define DCCPF_SEQ_WMIN 32 -#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull -/* Maximum number of SP values that fit in a single (Confirm) option */ -#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) - -enum dccp_feat_type { - FEAT_AT_RX = 1, /* located at RX side of half-connection */ - FEAT_AT_TX = 2, /* located at TX side of half-connection */ - FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ - FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ - FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ -}; - -enum dccp_feat_state { - FEAT_DEFAULT = 0, /* using default values from 6.4 */ - FEAT_INITIALISING, /* feature is being initialised */ - FEAT_CHANGING, /* Change sent but not confirmed yet */ - FEAT_UNSTABLE, /* local modification in state CHANGING */ - FEAT_STABLE /* both ends (think they) agree */ -}; - -/** - * dccp_feat_val - Container for SP or NN feature values - * @nn: single NN value - * @sp.vec: single SP value plus optional preference list - * @sp.len: length of @sp.vec in bytes - */ -typedef union { - u64 nn; - struct { - u8 *vec; - u8 len; - } sp; -} dccp_feat_val; - -/** - * struct feat_entry - Data structure to perform feature negotiation - * @val: feature's current value (SP features may have preference list) - * @state: feature's current state - * @feat_num: one of %dccp_feature_numbers - * @needs_mandatory: whether Mandatory options should be sent - * @needs_confirm: whether to send a Confirm instead of a Change - * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) - * @is_local: feature location (1) or feature-remote (0) - * @node: list pointers, entries arranged in FIFO order - */ -struct dccp_feat_entry { - dccp_feat_val val; - enum dccp_feat_state state:8; - u8 feat_num; - - bool needs_mandatory, - needs_confirm, - empty_confirm, - is_local; - - struct list_head node; -}; - -static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) -{ - if (entry->needs_confirm) - return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; - return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; -} - -/** - * struct ccid_dependency - Track changes resulting from choosing a CCID - * @dependent_feat: one of %dccp_feature_numbers - * @is_local: local (1) or remote (0) @dependent_feat - * @is_mandatory: whether presence of @dependent_feat is mission-critical or not - * @val: corresponding default value for @dependent_feat (u8 is sufficient here) - */ -struct ccid_dependency { - u8 dependent_feat; - bool is_local:1, - is_mandatory:1; - u8 val; -}; - -/* - * Sysctls to seed defaults for feature negotiation - */ -extern unsigned long sysctl_dccp_sequence_window; -extern int sysctl_dccp_rx_ccid; -extern int sysctl_dccp_tx_ccid; - -int dccp_feat_init(struct sock *sk); -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len); -int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, - u8 mand, u8 opt, u8 feat, u8 *val, u8 len); -int dccp_feat_clone_list(struct list_head const *, struct list_head *); - -/* - * Encoding variable-length options and their maximum length. - * - * This affects NN options (SP options are all u8) and other variable-length - * options (see table 3 in RFC 4340). The limit is currently given the Sequence - * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other - * options consume less than 6 bytes (timestamps are 4 bytes). - * When updating this constant (e.g. due to new internet drafts / RFCs), make - * sure that you also update all code which refers to it. - */ -#define DCCP_OPTVAL_MAXLEN 6 - -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); -u64 dccp_decode_value_var(const u8 *bf, const u8 len); -u64 dccp_feat_nn_get(struct sock *sk, u8 feat); - -int dccp_insert_option_mandatory(struct sk_buff *skb); -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len, - bool repeat_first); -#endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c deleted file mode 100644 index 2cbb757a894f8..0000000000000 --- a/net/dccp/input.c +++ /dev/null @@ -1,739 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/input.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include - -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" - -/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ -int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; - -static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) -{ - __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - sk->sk_data_ready(sk); -} - -static void dccp_fin(struct sock *sk, struct sk_buff *skb) -{ - /* - * On receiving Close/CloseReq, both RD/WR shutdown are performed. - * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after - * receiving the closing segment, but there is no guarantee that such - * data will be processed at all. - */ - sk->sk_shutdown = SHUTDOWN_MASK; - sock_set_flag(sk, SOCK_DONE); - dccp_enqueue_skb(sk, skb); -} - -static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - - switch (sk->sk_state) { - /* - * We ignore Close when received in one of the following states: - * - CLOSED (may be a late or duplicate packet) - * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier) - * - RESPOND (already handled by dccp_check_req) - */ - case DCCP_CLOSING: - /* - * Simultaneous-close: receiving a Close after sending one. This - * can happen if both client and server perform active-close and - * will result in an endless ping-pong of crossing and retrans- - * mitted Close packets, which only terminates when one of the - * nodes times out (min. 64 seconds). Quicker convergence can be - * achieved when one of the nodes acts as tie-breaker. - * This is ok as both ends are done with data transfer and each - * end is just waiting for the other to acknowledge termination. - */ - if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) - break; - fallthrough; - case DCCP_REQUESTING: - case DCCP_ACTIVE_CLOSEREQ: - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_done(sk); - break; - case DCCP_OPEN: - case DCCP_PARTOPEN: - /* Give waiting application a chance to read pending data */ - queued = 1; - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_PASSIVE_CLOSE); - fallthrough; - case DCCP_PASSIVE_CLOSE: - /* - * Retransmitted Close: we have already enqueued the first one. - */ - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - } - return queued; -} - -static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == CloseReq) - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); - return queued; - } - - /* Step 13: process relevant Client states < CLOSEREQ */ - switch (sk->sk_state) { - case DCCP_REQUESTING: - dccp_send_close(sk, 0); - dccp_set_state(sk, DCCP_CLOSING); - break; - case DCCP_OPEN: - case DCCP_PARTOPEN: - /* Give waiting application a chance to read pending data */ - queued = 1; - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ); - fallthrough; - case DCCP_PASSIVE_CLOSEREQ: - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - } - return queued; -} - -static u16 dccp_reset_code_convert(const u8 code) -{ - static const u16 error_code[] = { - [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */ - [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */ - [DCCP_RESET_CODE_ABORTED] = ECONNRESET, - - [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED, - [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED, - [DCCP_RESET_CODE_TOO_BUSY] = EUSERS, - [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT, - - [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG, - [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR, - [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC, - [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ, - [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP, - }; - - return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code]; -} - -static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) -{ - u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code); - - sk->sk_err = err; - - /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ - dccp_fin(sk, skb); - - if (err && !sock_flag(sk, SOCK_DEAD)) - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); - dccp_time_wait(sk, DCCP_TIME_WAIT, 0); -} - -static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; - - if (av == NULL) - return; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); - dccp_ackvec_input(av, skb); -} - -static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - /* Don't deliver to RX CCID when node has shut down read end. */ - if (!(sk->sk_shutdown & RCV_SHUTDOWN)) - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - /* - * Until the TX queue has been drained, we can not honour SHUT_WR, since - * we need received feedback as input to adjust congestion control. - */ - if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN)) - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); -} - -static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - struct dccp_sock *dp = dccp_sk(sk); - u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq, - ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - - /* - * Step 5: Prepare sequence numbers for Sync - * If P.type == Sync or P.type == SyncAck, - * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, - * / * P is valid, so update sequence number variables - * accordingly. After this update, P will pass the tests - * in Step 6. A SyncAck is generated if necessary in - * Step 15 * / - * Update S.GSR, S.SWL, S.SWH - * Otherwise, - * Drop packet and return - */ - if (dh->dccph_type == DCCP_PKT_SYNC || - dh->dccph_type == DCCP_PKT_SYNCACK) { - if (between48(ackno, dp->dccps_awl, dp->dccps_awh) && - dccp_delta_seqno(dp->dccps_swl, seqno) >= 0) - dccp_update_gsr(sk, seqno); - else - return -1; - } - - /* - * Step 6: Check sequence numbers - * Let LSWL = S.SWL and LAWL = S.AWL - * If P.type == CloseReq or P.type == Close or P.type == Reset, - * LSWL := S.GSR + 1, LAWL := S.GAR - * If LSWL <= P.seqno <= S.SWH - * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), - * Update S.GSR, S.SWL, S.SWH - * If P.type != Sync, - * Update S.GAR - */ - lswl = dp->dccps_swl; - lawl = dp->dccps_awl; - - if (dh->dccph_type == DCCP_PKT_CLOSEREQ || - dh->dccph_type == DCCP_PKT_CLOSE || - dh->dccph_type == DCCP_PKT_RESET) { - lswl = ADD48(dp->dccps_gsr, 1); - lawl = dp->dccps_gar; - } - - if (between48(seqno, lswl, dp->dccps_swh) && - (ackno == DCCP_PKT_WITHOUT_ACK_SEQ || - between48(ackno, lawl, dp->dccps_awh))) { - dccp_update_gsr(sk, seqno); - - if (dh->dccph_type != DCCP_PKT_SYNC && - ackno != DCCP_PKT_WITHOUT_ACK_SEQ && - after48(ackno, dp->dccps_gar)) - dp->dccps_gar = ackno; - } else { - unsigned long now = jiffies; - /* - * Step 6: Check sequence numbers - * Otherwise, - * If P.type == Reset, - * Send Sync packet acknowledging S.GSR - * Otherwise, - * Send Sync packet acknowledging P.seqno - * Drop packet and return - * - * These Syncs are rate-limited as per RFC 4340, 7.5.4: - * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second. - */ - if (time_before(now, (dp->dccps_rate_last + - sysctl_dccp_sync_ratelimit))) - return -1; - - DCCP_WARN("Step 6 failed for %s packet, " - "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " - "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " - "sending SYNC...\n", dccp_packet_name(dh->dccph_type), - (unsigned long long) lswl, (unsigned long long) seqno, - (unsigned long long) dp->dccps_swh, - (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" - : "exists", - (unsigned long long) lawl, (unsigned long long) ackno, - (unsigned long long) dp->dccps_awh); - - dp->dccps_rate_last = now; - - if (dh->dccph_type == DCCP_PKT_RESET) - seqno = dp->dccps_gsr; - dccp_send_sync(sk, seqno, DCCP_PKT_SYNC); - return -1; - } - - return 0; -} - -static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - - switch (dccp_hdr(skb)->dccph_type) { - case DCCP_PKT_DATAACK: - case DCCP_PKT_DATA: - /* - * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when - * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening" - * - sk_receive_queue is full, use Code 2, "Receive Buffer" - */ - dccp_enqueue_skb(sk, skb); - return 0; - case DCCP_PKT_ACK: - goto discard; - case DCCP_PKT_RESET: - /* - * Step 9: Process Reset - * If P.type == Reset, - * Tear down connection - * S.state := TIMEWAIT - * Set TIMEWAIT timer - * Drop packet and return - */ - dccp_rcv_reset(sk, skb); - return 0; - case DCCP_PKT_CLOSEREQ: - if (dccp_rcv_closereq(sk, skb)) - return 0; - goto discard; - case DCCP_PKT_CLOSE: - if (dccp_rcv_close(sk, skb)) - return 0; - goto discard; - case DCCP_PKT_REQUEST: - /* Step 7 - * or (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state >= OPEN and P.type == Request - * and P.seqno >= S.OSR) - * or (S.state >= OPEN and P.type == Response - * and P.seqno >= S.OSR) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if (dp->dccps_role != DCCP_ROLE_LISTEN) - goto send_sync; - goto check_seq; - case DCCP_PKT_RESPONSE: - if (dp->dccps_role != DCCP_ROLE_CLIENT) - goto send_sync; -check_seq: - if (dccp_delta_seqno(dp->dccps_osr, - DCCP_SKB_CB(skb)->dccpd_seq) >= 0) { -send_sync: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_PKT_SYNC); - } - break; - case DCCP_PKT_SYNC: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_PKT_SYNCACK); - /* - * From RFC 4340, sec. 5.7 - * - * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets - * MAY have non-zero-length application data areas, whose - * contents receivers MUST ignore. - */ - goto discard; - } - - DCCP_INC_STATS(DCCP_MIB_INERRS); -discard: - __kfree_skb(skb); - return 0; -} - -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len) -{ - if (dccp_check_seqno(sk, skb)) - goto discard; - - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - - return __dccp_rcv_established(sk, skb, dh, len); -discard: - __kfree_skb(skb); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_rcv_established); - -static int dccp_rcv_request_sent_state_process(struct sock *sk, - struct sk_buff *skb, - const struct dccp_hdr *dh, - const unsigned int len) -{ - /* - * Step 4: Prepare sequence numbers in REQUEST - * If S.state == REQUEST, - * If (P.type == Response or P.type == Reset) - * and S.AWL <= P.ackno <= S.AWH, - * / * Set sequence number variables corresponding to the - * other endpoint, so P will pass the tests in Step 6 * / - * Set S.GSR, S.ISR, S.SWL, S.SWH - * / * Response processing continues in Step 10; Reset - * processing continues in Step 9 * / - */ - if (dh->dccph_type == DCCP_PKT_RESPONSE) { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - long tstamp = dccp_timestamp(); - - if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, - dp->dccps_awl, dp->dccps_awh)) { - dccp_pr_debug("invalid ackno: S.AWL=%llu, " - "P.ackno=%llu, S.AWH=%llu\n", - (unsigned long long)dp->dccps_awl, - (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long)dp->dccps_awh); - goto out_invalid_packet; - } - - /* - * If option processing (Step 8) failed, return 1 here so that - * dccp_v4_do_rcv() sends a Reset. The Reset code depends on - * the option type and is set in dccp_parse_options(). - */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ - if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) - dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - - dp->dccps_options_received.dccpor_timestamp_echo)); - - /* Stop the REQUEST timer */ - inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); - WARN_ON(sk->sk_send_head == NULL); - kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - - /* - * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect - * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH - * is done as part of activating the feature values below, since - * these settings depend on the local/remote Sequence Window - * features, which were undefined or not confirmed until now. - */ - dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; - - dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); - - /* - * Step 10: Process REQUEST state (second part) - * If S.state == REQUEST, - * / * If we get here, P is a valid Response from the - * server (see Step 4), and we should move to - * PARTOPEN state. PARTOPEN means send an Ack, - * don't send Data packets, retransmit Acks - * periodically, and always include any Init Cookie - * from the Response * / - * S.state := PARTOPEN - * Set PARTOPEN timer - * Continue with S.state == PARTOPEN - * / * Step 12 will send the Ack completing the - * three-way handshake * / - */ - dccp_set_state(sk, DCCP_PARTOPEN); - - /* - * If feature negotiation was successful, activate features now; - * an activation failure means that this host could not activate - * one ore more features (e.g. insufficient memory), which would - * leave at least one feature in an undefined state. - */ - if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) - goto unable_to_proceed; - - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - } - - if (sk->sk_write_pending || inet_csk_in_pingpong_mode(sk) || - icsk->icsk_accept_queue.rskq_defer_accept) { - /* Save one ACK. Data will be ready after - * several ticks, if write_pending is set. - * - * It may be deleted, but with this feature tcpdumps - * look so _wonderfully_ clever, that I was not able - * to stand against the temptation 8) --ANK - */ - /* - * OK, in DCCP we can as well do a similar trick, its - * even in the draft, but there is no need for us to - * schedule an ack here, as dccp_sendmsg does this for - * us, also stated in the draft. -acme - */ - __kfree_skb(skb); - return 0; - } - dccp_send_ack(sk); - return -1; - } - -out_invalid_packet: - /* dccp_v4_do_rcv will send a reset */ - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; - return 1; - -unable_to_proceed: - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; - /* - * We mark this socket as no longer usable, so that the loop in - * dccp_sendmsg() terminates and the application gets notified. - */ - dccp_set_state(sk, DCCP_CLOSED); - sk->sk_err = ECOMM; - return 1; -} - -static int dccp_rcv_respond_partopen_state_process(struct sock *sk, - struct sk_buff *skb, - const struct dccp_hdr *dh, - const unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - u32 sample = dp->dccps_options_received.dccpor_timestamp_echo; - int queued = 0; - - switch (dh->dccph_type) { - case DCCP_PKT_RESET: - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - break; - case DCCP_PKT_DATA: - if (sk->sk_state == DCCP_RESPOND) - break; - fallthrough; - case DCCP_PKT_DATAACK: - case DCCP_PKT_ACK: - /* - * FIXME: we should be resetting the PARTOPEN (DELACK) timer - * here but only if we haven't used the DELACK timer for - * something else, like sending a delayed ack for a TIMESTAMP - * echo, etc, for now were not clearing it, sending an extra - * ACK when there is nothing else to do in DELACK is not a big - * deal after all. - */ - - /* Stop the PARTOPEN timer */ - if (sk->sk_state == DCCP_PARTOPEN) - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - - /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ - if (likely(sample)) { - long delta = dccp_timestamp() - sample; - - dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta); - } - - dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; - dccp_set_state(sk, DCCP_OPEN); - - if (dh->dccph_type == DCCP_PKT_DATAACK || - dh->dccph_type == DCCP_PKT_DATA) { - __dccp_rcv_established(sk, skb, dh, len); - queued = 1; /* packet was queued - (by __dccp_rcv_established) */ - } - break; - } - - return queued; -} - -int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct dccp_hdr *dh, unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const int old_state = sk->sk_state; - bool acceptable; - int queued = 0; - - /* - * Step 3: Process LISTEN state - * - * If S.state == LISTEN, - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init - * Cookies Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_LISTEN) { - if (dh->dccph_type == DCCP_PKT_REQUEST) { - /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH and RCU right there. - */ - rcu_read_lock(); - local_bh_disable(); - acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; - local_bh_enable(); - rcu_read_unlock(); - if (!acceptable) - return 1; - consume_skb(skb); - return 0; - } - if (dh->dccph_type == DCCP_PKT_RESET) - goto discard; - - /* Caller (dccp_v4_do_rcv) will send Reset */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - } else if (sk->sk_state == DCCP_CLOSED) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - } - - /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ - if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) - goto discard; - - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if ((dp->dccps_role != DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_RESPONSE) || - (dp->dccps_role == DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_REQUEST) || - (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); - goto discard; - } - - /* Step 8: Process options */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - /* - * Step 9: Process Reset - * If P.type == Reset, - * Tear down connection - * S.state := TIMEWAIT - * Set TIMEWAIT timer - * Drop packet and return - */ - if (dh->dccph_type == DCCP_PKT_RESET) { - dccp_rcv_reset(sk, skb); - return 0; - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ - if (dccp_rcv_closereq(sk, skb)) - return 0; - goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ - if (dccp_rcv_close(sk, skb)) - return 0; - goto discard; - } - - switch (sk->sk_state) { - case DCCP_REQUESTING: - queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); - if (queued >= 0) - return queued; - - __kfree_skb(skb); - return 0; - - case DCCP_PARTOPEN: - /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - fallthrough; - case DCCP_RESPOND: - queued = dccp_rcv_respond_partopen_state_process(sk, skb, - dh, len); - break; - } - - if (dh->dccph_type == DCCP_PKT_ACK || - dh->dccph_type == DCCP_PKT_DATAACK) { - switch (old_state) { - case DCCP_PARTOPEN: - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - break; - } - } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK); - goto discard; - } - - if (!queued) { -discard: - __kfree_skb(skb); - } - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_rcv_state_process); - -/** - * dccp_sample_rtt - Validate and finalise computation of RTT sample - * @sk: socket structure - * @delta: number of microseconds between packet and acknowledgment - * - * The routine is kept generic to work in different contexts. It should be - * called immediately when the ACK used for the RTT sample arrives. - */ -u32 dccp_sample_rtt(struct sock *sk, long delta) -{ - /* dccpor_elapsed_time is either zeroed out or set and > 0 */ - delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; - - if (unlikely(delta <= 0)) { - DCCP_WARN("unusable RTT sample %ld, using min\n", delta); - return DCCP_SANE_RTT_MIN; - } - if (unlikely(delta > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %ld too large, using max\n", delta); - return DCCP_SANE_RTT_MAX; - } - - return delta; -} diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c deleted file mode 100644 index 2045ddac0fe9f..0000000000000 --- a/net/dccp/ipv4.c +++ /dev/null @@ -1,1101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/ipv4.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -struct dccp_v4_pernet { - struct sock *v4_ctl_sk; -}; - -static unsigned int dccp_v4_pernet_id __read_mostly; - -/* - * The per-net v4_ctl_sk socket is used for responding to - * the Out-of-the-blue (OOTB) packets. A control sock will be created - * for this socket at the initialization time. - */ - -int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) -{ - const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; - struct inet_sock *inet = inet_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - __be16 orig_sport, orig_dport; - __be32 daddr, nexthop; - struct flowi4 *fl4; - struct rtable *rt; - int err; - struct ip_options_rcu *inet_opt; - - dp->dccps_role = DCCP_ROLE_CLIENT; - - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - if (usin->sin_family != AF_INET) - return -EAFNOSUPPORT; - - nexthop = daddr = usin->sin_addr.s_addr; - - inet_opt = rcu_dereference_protected(inet->inet_opt, - lockdep_sock_is_held(sk)); - if (inet_opt != NULL && inet_opt->opt.srr) { - if (daddr == 0) - return -EINVAL; - nexthop = inet_opt->opt.faddr; - } - - orig_sport = inet->inet_sport; - orig_dport = usin->sin_port; - fl4 = &inet->cork.fl.u.ip4; - rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, - sk->sk_bound_dev_if, IPPROTO_DCCP, orig_sport, - orig_dport, sk); - if (IS_ERR(rt)) - return PTR_ERR(rt); - - if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { - ip_rt_put(rt); - return -ENETUNREACH; - } - - if (inet_opt == NULL || !inet_opt->opt.srr) - daddr = fl4->daddr; - - if (inet->inet_saddr == 0) { - err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); - if (err) { - ip_rt_put(rt); - return err; - } - } else { - sk_rcv_saddr_set(sk, inet->inet_saddr); - } - - inet->inet_dport = usin->sin_port; - sk_daddr_set(sk, daddr); - - inet_csk(sk)->icsk_ext_hdr_len = 0; - if (inet_opt) - inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - /* - * Socket identity is still unknown (sport may be zero). - * However we set state to DCCP_REQUESTING and not releasing socket - * lock select source port, enter ourselves into the hash tables and - * complete initialization after this. - */ - dccp_set_state(sk, DCCP_REQUESTING); - err = inet_hash_connect(&dccp_death_row, sk); - if (err != 0) - goto failure; - - rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, - inet->inet_sport, inet->inet_dport, sk); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; - goto failure; - } - /* OK, now commit destination to socket. */ - sk_setup_caps(sk, &rt->dst); - - dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - inet->inet_dport); - atomic_set(&inet->inet_id, get_random_u16()); - - err = dccp_connect(sk); - rt = NULL; - if (err != 0) - goto failure; -out: - return err; -failure: - /* - * This unhashes the socket and releases the local port, if necessary. - */ - dccp_set_state(sk, DCCP_CLOSED); - inet_bhash2_reset_saddr(sk); - ip_rt_put(rt); - sk->sk_route_caps = 0; - inet->inet_dport = 0; - goto out; -} -EXPORT_SYMBOL_GPL(dccp_v4_connect); - -/* - * This routine does path mtu discovery as defined in RFC1191. - */ -static inline void dccp_do_pmtu_discovery(struct sock *sk, - const struct iphdr *iph, - u32 mtu) -{ - struct dst_entry *dst; - const struct inet_sock *inet = inet_sk(sk); - const struct dccp_sock *dp = dccp_sk(sk); - - /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs - * send out by Linux are always < 576bytes so they should go through - * unfragmented). - */ - if (sk->sk_state == DCCP_LISTEN) - return; - - dst = inet_csk_update_pmtu(sk, mtu); - if (!dst) - return; - - /* Something is about to be wrong... Remember soft error - * for the case, if this connection will not able to recover. - */ - if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) - WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); - - mtu = dst_mtu(dst); - - if (inet->pmtudisc != IP_PMTUDISC_DONT && - ip_sk_accept_pmtu(sk) && - inet_csk(sk)->icsk_pmtu_cookie > mtu) { - dccp_sync_mss(sk, mtu); - - /* - * From RFC 4340, sec. 14.1: - * - * DCCP-Sync packets are the best choice for upward - * probing, since DCCP-Sync probes do not risk application - * data loss. - */ - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); - } /* else let the usual retransmit timer handle it */ -} - -static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) -{ - struct dst_entry *dst = __sk_dst_check(sk, 0); - - if (dst) - dst->ops->redirect(dst, sk, skb); -} - -void dccp_req_err(struct sock *sk, u64 seq) - { - struct request_sock *req = inet_reqsk(sk); - struct net *net = sock_net(sk); - - /* - * ICMPs are not backlogged, hence we cannot get an established - * socket here. - */ - if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - } else { - /* - * Still in RESPOND, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - inet_csk_reqsk_queue_drop(req->rsk_listener, req); - } - reqsk_put(req); -} -EXPORT_SYMBOL(dccp_req_err); - -/* - * This routine is called by the ICMP module when it gets some sort of error - * condition. If err < 0 then the socket should be closed and the error - * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code. - * After adjustment header points to the first 8 bytes of the tcp header. We - * need to find the appropriate port. - * - * The locking strategy used here is very "optimistic". When someone else - * accesses the socket the ICMP is just dropped and for some paths there is no - * check at all. A more general error queue to queue errors for later handling - * is probably better. - */ -static int dccp_v4_err(struct sk_buff *skb, u32 info) -{ - const struct iphdr *iph = (struct iphdr *)skb->data; - const u8 offset = iph->ihl << 2; - const struct dccp_hdr *dh; - struct dccp_sock *dp; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - struct sock *sk; - __u64 seq; - int err; - struct net *net = dev_net(skb->dev); - - if (!pskb_may_pull(skb, offset + sizeof(*dh))) - return -EINVAL; - dh = (struct dccp_hdr *)(skb->data + offset); - if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) - return -EINVAL; - iph = (struct iphdr *)skb->data; - dh = (struct dccp_hdr *)(skb->data + offset); - - sk = __inet_lookup_established(net, &dccp_hashinfo, - iph->daddr, dh->dccph_dport, - iph->saddr, ntohs(dh->dccph_sport), - inet_iif(skb), 0); - if (!sk) { - __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return -ENOENT; - } - - if (sk->sk_state == DCCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return 0; - } - seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - dccp_req_err(sk, seq); - return 0; - } - - bh_lock_sock(sk); - /* If too many ICMPs get dropped on busy - * servers this needs to be solved differently. - */ - if (sock_owned_by_user(sk)) - __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - dp = dccp_sk(sk); - if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && - !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - switch (type) { - case ICMP_REDIRECT: - if (!sock_owned_by_user(sk)) - dccp_do_redirect(skb, sk); - goto out; - case ICMP_SOURCE_QUENCH: - /* Just silently ignore these. */ - goto out; - case ICMP_PARAMETERPROB: - err = EPROTO; - break; - case ICMP_DEST_UNREACH: - if (code > NR_ICMP_UNREACH) - goto out; - - if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ - if (!sock_owned_by_user(sk)) - dccp_do_pmtu_discovery(sk, iph, info); - goto out; - } - - err = icmp_err_convert[code].errno; - break; - case ICMP_TIME_EXCEEDED: - err = EHOSTUNREACH; - break; - default: - goto out; - } - - switch (sk->sk_state) { - case DCCP_REQUESTING: - case DCCP_RESPOND: - if (!sock_owned_by_user(sk)) { - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - sk->sk_err = err; - - sk_error_report(sk); - - dccp_done(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } - goto out; - } - - /* If we've already connected we will keep trying - * until we time out, or the user gives up. - * - * rfc1122 4.2.3.9 allows to consider as hard errors - * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, - * but it is obsoleted by pmtu discovery). - * - * Note, that in modern internet, where routing is unreliable - * and in each dark corner broken firewalls sit, sending random - * errors ordered by their masters even this two messages finally lose - * their original sense (even Linux sends invalid PORT_UNREACHs) - * - * Now we are in compliance with RFCs. - * --ANK (980905) - */ - - if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { - sk->sk_err = err; - sk_error_report(sk); - } else { /* Only an error on timeout */ - WRITE_ONCE(sk->sk_err_soft, err); - } -out: - bh_unlock_sock(sk); - sock_put(sk); - return 0; -} - -static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb, - __be32 src, __be32 dst) -{ - return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum); -} - -void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb) -{ - const struct inet_sock *inet = inet_sk(sk); - struct dccp_hdr *dh = dccp_hdr(skb); - - dccp_csum_outgoing(skb); - dh->dccph_checksum = dccp_v4_csum_finish(skb, - inet->inet_saddr, - inet->inet_daddr); -} -EXPORT_SYMBOL_GPL(dccp_v4_send_check); - -static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) -{ - return secure_dccp_sequence_number(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - dccp_hdr(skb)->dccph_dport, - dccp_hdr(skb)->dccph_sport); -} - -/* - * The three way handshake has completed - we got a valid ACK or DATAACK - - * now create the new socket. - * - * This is the equivalent of TCP's tcp_v4_syn_recv_sock - */ -struct sock *dccp_v4_request_recv_sock(const struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req) -{ - struct inet_request_sock *ireq; - struct inet_sock *newinet; - struct sock *newsk; - - if (sk_acceptq_is_full(sk)) - goto exit_overflow; - - newsk = dccp_create_openreq_child(sk, req, skb); - if (newsk == NULL) - goto exit_nonewsk; - - newinet = inet_sk(newsk); - ireq = inet_rsk(req); - RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); - newinet->mc_index = inet_iif(skb); - newinet->mc_ttl = ip_hdr(skb)->ttl; - atomic_set(&newinet->inet_id, get_random_u16()); - - if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) - goto put_and_exit; - - sk_setup_caps(newsk, dst); - - dccp_sync_mss(newsk, dst_mtu(dst)); - - if (__inet_inherit_port(sk, newsk) < 0) - goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); - if (*own_req) - ireq->ireq_opt = NULL; - else - newinet->inet_opt = NULL; - return newsk; - -exit_overflow: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -exit_nonewsk: - dst_release(dst); -exit: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); - return NULL; -put_and_exit: - newinet->inet_opt = NULL; - inet_csk_prepare_forced_close(newsk); - dccp_done(newsk); - goto exit; -} -EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); - -static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, - struct sk_buff *skb) -{ - struct rtable *rt; - const struct iphdr *iph = ip_hdr(skb); - struct flowi4 fl4 = { - .flowi4_oif = inet_iif(skb), - .daddr = iph->saddr, - .saddr = iph->daddr, - .flowi4_tos = inet_dscp_to_dsfield(inet_sk_dscp(inet_sk(sk))), - .flowi4_scope = ip_sock_rt_scope(sk), - .flowi4_proto = sk->sk_protocol, - .fl4_sport = dccp_hdr(skb)->dccph_dport, - .fl4_dport = dccp_hdr(skb)->dccph_sport, - }; - - security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); - rt = ip_route_output_flow(net, &fl4, sk); - if (IS_ERR(rt)) { - IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - - return &rt->dst; -} - -static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req) -{ - int err = -1; - struct sk_buff *skb; - struct dst_entry *dst; - struct flowi4 fl4; - - dst = inet_csk_route_req(sk, &fl4, req); - if (dst == NULL) - goto out; - - skb = dccp_make_response(sk, dst, req); - if (skb != NULL) { - const struct inet_request_sock *ireq = inet_rsk(req); - struct dccp_hdr *dh = dccp_hdr(skb); - - dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr, - ireq->ir_rmt_addr); - rcu_read_lock(); - err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt), - READ_ONCE(inet_sk(sk)->tos)); - rcu_read_unlock(); - err = net_xmit_eval(err); - } - -out: - dst_release(dst); - return err; -} - -static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, - enum sk_rst_reason reason) -{ - int err; - const struct iphdr *rxiph; - struct sk_buff *skb; - struct dst_entry *dst; - struct net *net = dev_net(skb_dst(rxskb)->dev); - struct dccp_v4_pernet *pn; - struct sock *ctl_sk; - - /* Never send a reset in response to a reset. */ - if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) - return; - - if (skb_rtable(rxskb)->rt_type != RTN_LOCAL) - return; - - pn = net_generic(net, dccp_v4_pernet_id); - ctl_sk = pn->v4_ctl_sk; - dst = dccp_v4_route_skb(net, ctl_sk, rxskb); - if (dst == NULL) - return; - - skb = dccp_ctl_make_reset(ctl_sk, rxskb); - if (skb == NULL) - goto out; - - rxiph = ip_hdr(rxskb); - dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr, - rxiph->daddr); - skb_dst_set(skb, dst_clone(dst)); - - local_bh_disable(); - bh_lock_sock(ctl_sk); - err = ip_build_and_send_pkt(skb, ctl_sk, - rxiph->daddr, rxiph->saddr, NULL, - inet_sk(ctl_sk)->tos); - bh_unlock_sock(ctl_sk); - - if (net_xmit_eval(err) == 0) { - __DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - __DCCP_INC_STATS(DCCP_MIB_OUTRSTS); - } - local_bh_enable(); -out: - dst_release(dst); -} - -static void dccp_v4_reqsk_destructor(struct request_sock *req) -{ - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); - kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); -} - -void dccp_syn_ack_timeout(const struct request_sock *req) -{ -} -EXPORT_SYMBOL(dccp_syn_ack_timeout); - -static struct request_sock_ops dccp_request_sock_ops __read_mostly = { - .family = PF_INET, - .obj_size = sizeof(struct dccp_request_sock), - .rtx_syn_ack = dccp_v4_send_response, - .send_ack = dccp_reqsk_send_ack, - .destructor = dccp_v4_reqsk_destructor, - .send_reset = dccp_v4_ctl_send_reset, - .syn_ack_timeout = dccp_syn_ack_timeout, -}; - -int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) -{ - struct inet_request_sock *ireq; - struct request_sock *req; - struct dccp_request_sock *dreq; - const __be32 service = dccp_hdr_request(skb)->dccph_req_service; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - - /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ - if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return 0; /* discard, don't send a reset here */ - - if (dccp_bad_service_code(sk, service)) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; - goto drop; - } - /* - * TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; - if (inet_csk_reqsk_queue_is_full(sk)) - goto drop; - - if (sk_acceptq_is_full(sk)) - goto drop; - - req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true); - if (req == NULL) - goto drop; - - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; - - dreq = dccp_rsk(req); - if (dccp_parse_options(sk, dreq, skb)) - goto drop_and_free; - - ireq = inet_rsk(req); - sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); - sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->ir_mark = inet_request_mark(sk, skb); - ireq->ireq_family = AF_INET; - ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - - /* - * Step 3: Process LISTEN state - * - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie - * - * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child(). - */ - dreq->dreq_isr = dcb->dccpd_seq; - dreq->dreq_gsr = dreq->dreq_isr; - dreq->dreq_iss = dccp_v4_init_sequence(skb); - dreq->dreq_gss = dreq->dreq_iss; - dreq->dreq_service = service; - - if (dccp_v4_send_response(sk, req)) - goto drop_and_free; - - if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) - reqsk_free(req); - else - reqsk_put(req); - - return 0; - -drop_and_free: - reqsk_free(req); -drop: - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - return -1; -} -EXPORT_SYMBOL_GPL(dccp_v4_conn_request); - -int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_hdr *dh = dccp_hdr(skb); - - if (sk->sk_state == DCCP_OPEN) { /* Fast path */ - if (dccp_rcv_established(sk, skb, dh, skb->len)) - goto reset; - return 0; - } - - /* - * Step 3: Process LISTEN state - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - * Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - * - * NOTE: the check for the packet types is done in - * dccp_rcv_state_process - */ - - if (dccp_rcv_state_process(sk, skb, dh, skb->len)) - goto reset; - return 0; - -reset: - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - kfree_skb(skb); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); - -/** - * dccp_invalid_packet - check for malformed packets - * @skb: Packet to validate - * - * Implements RFC 4340, 8.5: Step 1: Check header basics - * Packets that fail these checks are ignored and do not receive Resets. - */ -int dccp_invalid_packet(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - unsigned int cscov; - u8 dccph_doff; - - if (skb->pkt_type != PACKET_HOST) - return 1; - - /* If the packet is shorter than 12 bytes, drop packet and return */ - if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { - DCCP_WARN("pskb_may_pull failed\n"); - return 1; - } - - dh = dccp_hdr(skb); - - /* If P.type is not understood, drop packet and return */ - if (dh->dccph_type >= DCCP_PKT_INVALID) { - DCCP_WARN("invalid packet type\n"); - return 1; - } - - /* - * If P.Data Offset is too small for packet type, drop packet and return - */ - dccph_doff = dh->dccph_doff; - if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { - DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff); - return 1; - } - /* - * If P.Data Offset is too large for packet, drop packet and return - */ - if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) { - DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff); - return 1; - } - dh = dccp_hdr(skb); - /* - * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet - * has short sequence numbers), drop packet and return - */ - if ((dh->dccph_type < DCCP_PKT_DATA || - dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) { - DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n", - dccp_packet_name(dh->dccph_type)); - return 1; - } - - /* - * If P.CsCov is too large for the packet size, drop packet and return. - * This must come _before_ checksumming (not as RFC 4340 suggests). - */ - cscov = dccp_csum_coverage(skb); - if (cscov > skb->len) { - DCCP_WARN("P.CsCov %u exceeds packet length %d\n", - dh->dccph_cscov, skb->len); - return 1; - } - - /* If header checksum is incorrect, drop packet and return. - * (This step is completed in the AF-dependent functions.) */ - skb->csum = skb_checksum(skb, 0, cscov, 0); - - return 0; -} -EXPORT_SYMBOL_GPL(dccp_invalid_packet); - -/* this is called when real data arrives */ -static int dccp_v4_rcv(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - const struct iphdr *iph; - bool refcounted; - struct sock *sk; - int min_cov; - - /* Step 1: Check header basics */ - - if (dccp_invalid_packet(skb)) - goto discard_it; - - iph = ip_hdr(skb); - /* Step 1: If header checksum is incorrect, drop packet and return */ - if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) { - DCCP_WARN("dropped packet with invalid checksum\n"); - goto discard_it; - } - - dh = dccp_hdr(skb); - - DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); - DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; - - dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu", - dccp_packet_name(dh->dccph_type), - &iph->saddr, ntohs(dh->dccph_sport), - &iph->daddr, ntohs(dh->dccph_dport), - (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); - - if (dccp_packet_without_ack(skb)) { - DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; - dccp_pr_debug_cat("\n"); - } else { - DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - } - -lookup: - sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport, 0, &refcounted); - if (!sk) { - dccp_pr_debug("failed to look up flow ID in table and " - "get corresponding socket\n"); - goto no_dccp_socket; - } - - /* - * Step 2: - * ... or S.state == TIMEWAIT, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_TIME_WAIT) { - dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); - inet_twsk_put(inet_twsk(sk)); - goto no_dccp_socket; - } - - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); - struct sock *nsk; - - sk = req->rsk_listener; - if (unlikely(sk->sk_state != DCCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; - } - sock_hold(sk); - refcounted = true; - nsk = dccp_check_req(sk, skb, req); - if (!nsk) { - reqsk_put(req); - goto discard_and_relse; - } - if (nsk == sk) { - reqsk_put(req); - } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - goto discard_and_relse; - } else { - sock_put(sk); - return 0; - } - } - /* - * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage - * o if MinCsCov = 0, only packets with CsCov = 0 are accepted - * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov - */ - min_cov = dccp_sk(sk)->dccps_pcrlen; - if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { - dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", - dh->dccph_cscov, min_cov); - /* FIXME: "Such packets SHOULD be reported using Data Dropped - * options (Section 11.7) with Drop Code 0, Protocol - * Constraints." */ - goto discard_and_relse; - } - - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) - goto discard_and_relse; - nf_reset_ct(skb); - - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); - -no_dccp_socket: - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard_it; - /* - * Step 2: - * If no socket ... - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (dh->dccph_type != DCCP_PKT_RESET) { - DCCP_SKB_CB(skb)->dccpd_reset_code = - DCCP_RESET_CODE_NO_CONNECTION; - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - } - -discard_it: - kfree_skb(skb); - return 0; - -discard_and_relse: - if (refcounted) - sock_put(sk); - goto discard_it; -} - -static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { - .queue_xmit = ip_queue_xmit, - .send_check = dccp_v4_send_check, - .rebuild_header = inet_sk_rebuild_header, - .conn_request = dccp_v4_conn_request, - .syn_recv_sock = dccp_v4_request_recv_sock, - .net_header_len = sizeof(struct iphdr), - .setsockopt = ip_setsockopt, - .getsockopt = ip_getsockopt, -}; - -static int dccp_v4_init_sock(struct sock *sk) -{ - static __u8 dccp_v4_ctl_sock_initialized; - int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized); - - if (err == 0) { - if (unlikely(!dccp_v4_ctl_sock_initialized)) - dccp_v4_ctl_sock_initialized = 1; - inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; - } - - return err; -} - -static struct timewait_sock_ops dccp_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct inet_timewait_sock), -}; - -static struct proto dccp_v4_prot = { - .name = "DCCP", - .owner = THIS_MODULE, - .close = dccp_close, - .connect = dccp_v4_connect, - .disconnect = dccp_disconnect, - .ioctl = dccp_ioctl, - .init = dccp_v4_init_sock, - .setsockopt = dccp_setsockopt, - .getsockopt = dccp_getsockopt, - .sendmsg = dccp_sendmsg, - .recvmsg = dccp_recvmsg, - .backlog_rcv = dccp_v4_do_rcv, - .hash = inet_hash, - .unhash = inet_unhash, - .accept = inet_csk_accept, - .get_port = inet_csk_get_port, - .shutdown = dccp_shutdown, - .destroy = dccp_destroy_sock, - .orphan_count = &dccp_orphan_count, - .max_header = MAX_DCCP_HEADER, - .obj_size = sizeof(struct dccp_sock), - .slab_flags = SLAB_TYPESAFE_BY_RCU, - .rsk_prot = &dccp_request_sock_ops, - .twsk_prot = &dccp_timewait_sock_ops, - .h.hashinfo = &dccp_hashinfo, -}; - -static const struct net_protocol dccp_v4_protocol = { - .handler = dccp_v4_rcv, - .err_handler = dccp_v4_err, - .no_policy = 1, - .icmp_strict_tag_validation = 1, -}; - -static const struct proto_ops inet_dccp_ops = { - .family = PF_INET, - .owner = THIS_MODULE, - .release = inet_release, - .bind = inet_bind, - .connect = inet_stream_connect, - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet_getname, - /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ - .poll = dccp_poll, - .ioctl = inet_ioctl, - .gettstamp = sock_gettstamp, - /* FIXME: work on inet_listen to rename it to sock_common_listen */ - .listen = inet_dccp_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, -}; - -static struct inet_protosw dccp_v4_protosw = { - .type = SOCK_DCCP, - .protocol = IPPROTO_DCCP, - .prot = &dccp_v4_prot, - .ops = &inet_dccp_ops, - .flags = INET_PROTOSW_ICSK, -}; - -static int __net_init dccp_v4_init_net(struct net *net) -{ - struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); - - if (dccp_hashinfo.bhash == NULL) - return -ESOCKTNOSUPPORT; - - return inet_ctl_sock_create(&pn->v4_ctl_sk, PF_INET, - SOCK_DCCP, IPPROTO_DCCP, net); -} - -static void __net_exit dccp_v4_exit_net(struct net *net) -{ - struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); - - inet_ctl_sock_destroy(pn->v4_ctl_sk); -} - -static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&dccp_hashinfo); -} - -static struct pernet_operations dccp_v4_ops = { - .init = dccp_v4_init_net, - .exit = dccp_v4_exit_net, - .exit_batch = dccp_v4_exit_batch, - .id = &dccp_v4_pernet_id, - .size = sizeof(struct dccp_v4_pernet), -}; - -static int __init dccp_v4_init(void) -{ - int err = proto_register(&dccp_v4_prot, 1); - - if (err) - goto out; - - inet_register_protosw(&dccp_v4_protosw); - - err = register_pernet_subsys(&dccp_v4_ops); - if (err) - goto out_destroy_ctl_sock; - - err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP); - if (err) - goto out_proto_unregister; - -out: - return err; -out_proto_unregister: - unregister_pernet_subsys(&dccp_v4_ops); -out_destroy_ctl_sock: - inet_unregister_protosw(&dccp_v4_protosw); - proto_unregister(&dccp_v4_prot); - goto out; -} - -static void __exit dccp_v4_exit(void) -{ - inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP); - unregister_pernet_subsys(&dccp_v4_ops); - inet_unregister_protosw(&dccp_v4_protosw); - proto_unregister(&dccp_v4_prot); -} - -module_init(dccp_v4_init); -module_exit(dccp_v4_exit); - -/* - * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) - * values directly, Also cover the case where the protocol is not specified, - * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP - */ -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo "); -MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c deleted file mode 100644 index e24dbffabfc19..0000000000000 --- a/net/dccp/ipv6.c +++ /dev/null @@ -1,1174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DCCP over IPv6 - * Linux INET6 implementation - * - * Based on net/dccp6/ipv6.c - * - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "dccp.h" -#include "ipv6.h" -#include "feat.h" - -struct dccp_v6_pernet { - struct sock *v6_ctl_sk; -}; - -static unsigned int dccp_v6_pernet_id __read_mostly; - -/* The per-net v6_ctl_sk is used for sending RSTs and ACKs */ - -static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; -static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; - -/* add pseudo-header to DCCP checksum stored in skb->csum */ -static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum); -} - -static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct dccp_hdr *dh = dccp_hdr(skb); - - dccp_csum_outgoing(skb); - dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr); -} - -static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) -{ - return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32, - dccp_hdr(skb)->dccph_dport, - dccp_hdr(skb)->dccph_sport ); - -} - -static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) -{ - const struct ipv6hdr *hdr; - const struct dccp_hdr *dh; - struct dccp_sock *dp; - struct ipv6_pinfo *np; - struct sock *sk; - int err; - __u64 seq; - struct net *net = dev_net(skb->dev); - - if (!pskb_may_pull(skb, offset + sizeof(*dh))) - return -EINVAL; - dh = (struct dccp_hdr *)(skb->data + offset); - if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) - return -EINVAL; - hdr = (const struct ipv6hdr *)skb->data; - dh = (struct dccp_hdr *)(skb->data + offset); - - sk = __inet6_lookup_established(net, &dccp_hashinfo, - &hdr->daddr, dh->dccph_dport, - &hdr->saddr, ntohs(dh->dccph_sport), - inet6_iif(skb), 0); - - if (!sk) { - __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); - return -ENOENT; - } - - if (sk->sk_state == DCCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return 0; - } - seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - dccp_req_err(sk, seq); - return 0; - } - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) - __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - dp = dccp_sk(sk); - if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && - !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - np = inet6_sk(sk); - - if (type == NDISC_REDIRECT) { - if (!sock_owned_by_user(sk)) { - struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst) - dst->ops->redirect(dst, sk, skb); - } - goto out; - } - - if (type == ICMPV6_PKT_TOOBIG) { - struct dst_entry *dst = NULL; - - if (!ip6_sk_accept_pmtu(sk)) - goto out; - - if (sock_owned_by_user(sk)) - goto out; - if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) - goto out; - - dst = inet6_csk_update_pmtu(sk, ntohl(info)); - if (!dst) - goto out; - - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) - dccp_sync_mss(sk, dst_mtu(dst)); - goto out; - } - - icmpv6_err_convert(type, code, &err); - - /* Might be for an request_sock */ - switch (sk->sk_state) { - case DCCP_REQUESTING: - case DCCP_RESPOND: /* Cannot happen. - It can, it SYNs are crossed. --ANK */ - if (!sock_owned_by_user(sk)) { - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - sk->sk_err = err; - /* - * Wake people up to see the error - * (see connect in sock.c) - */ - sk_error_report(sk); - dccp_done(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } - goto out; - } - - if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { - sk->sk_err = err; - sk_error_report(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } -out: - bh_unlock_sock(sk); - sock_put(sk); - return 0; -} - - -static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req) -{ - struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff *skb; - struct in6_addr *final_p, final; - struct flowi6 fl6; - int err = -1; - struct dst_entry *dst; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.daddr = ireq->ir_v6_rmt_addr; - fl6.saddr = ireq->ir_v6_loc_addr; - fl6.flowlabel = 0; - fl6.flowi6_oif = ireq->ir_iif; - fl6.fl6_dport = ireq->ir_rmt_port; - fl6.fl6_sport = htons(ireq->ir_num); - security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); - - - rcu_read_lock(); - final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); - rcu_read_unlock(); - - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - dst = NULL; - goto done; - } - - skb = dccp_make_response(sk, dst, req); - if (skb != NULL) { - struct dccp_hdr *dh = dccp_hdr(skb); - struct ipv6_txoptions *opt; - - dh->dccph_checksum = dccp_v6_csum_finish(skb, - &ireq->ir_v6_loc_addr, - &ireq->ir_v6_rmt_addr); - fl6.daddr = ireq->ir_v6_rmt_addr; - rcu_read_lock(); - opt = ireq->ipv6_opt; - if (!opt) - opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt, - np->tclass, READ_ONCE(sk->sk_priority)); - rcu_read_unlock(); - err = net_xmit_eval(err); - } - -done: - dst_release(dst); - return err; -} - -static void dccp_v6_reqsk_destructor(struct request_sock *req) -{ - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); - kfree(inet_rsk(req)->ipv6_opt); - kfree_skb(inet_rsk(req)->pktopts); -} - -static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, - enum sk_rst_reason reason) -{ - const struct ipv6hdr *rxip6h; - struct sk_buff *skb; - struct flowi6 fl6; - struct net *net = dev_net(skb_dst(rxskb)->dev); - struct dccp_v6_pernet *pn; - struct sock *ctl_sk; - struct dst_entry *dst; - - if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) - return; - - if (!ipv6_unicast_destination(rxskb)) - return; - - pn = net_generic(net, dccp_v6_pernet_id); - ctl_sk = pn->v6_ctl_sk; - skb = dccp_ctl_make_reset(ctl_sk, rxskb); - if (skb == NULL) - return; - - rxip6h = ipv6_hdr(rxskb); - dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr, - &rxip6h->daddr); - - memset(&fl6, 0, sizeof(fl6)); - fl6.daddr = rxip6h->saddr; - fl6.saddr = rxip6h->daddr; - - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.flowi6_oif = inet6_iif(rxskb); - fl6.fl6_dport = dccp_hdr(skb)->dccph_dport; - fl6.fl6_sport = dccp_hdr(skb)->dccph_sport; - security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6)); - - /* sk = NULL, but it is safe for now. RST socket required. */ - dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); - if (!IS_ERR(dst)) { - skb_dst_set(skb, dst); - ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - DCCP_INC_STATS(DCCP_MIB_OUTRSTS); - return; - } - - kfree_skb(skb); -} - -static struct request_sock_ops dccp6_request_sock_ops = { - .family = AF_INET6, - .obj_size = sizeof(struct dccp6_request_sock), - .rtx_syn_ack = dccp_v6_send_response, - .send_ack = dccp_reqsk_send_ack, - .destructor = dccp_v6_reqsk_destructor, - .send_reset = dccp_v6_ctl_send_reset, - .syn_ack_timeout = dccp_syn_ack_timeout, -}; - -static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) -{ - struct request_sock *req; - struct dccp_request_sock *dreq; - struct inet_request_sock *ireq; - struct ipv6_pinfo *np = inet6_sk(sk); - const __be32 service = dccp_hdr_request(skb)->dccph_req_service; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - - if (skb->protocol == htons(ETH_P_IP)) - return dccp_v4_conn_request(sk, skb); - - if (!ipv6_unicast_destination(skb)) - return 0; /* discard, don't send a reset here */ - - if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { - __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); - return 0; - } - - if (dccp_bad_service_code(sk, service)) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; - goto drop; - } - /* - * There are no SYN attacks on IPv6, yet... - */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; - if (inet_csk_reqsk_queue_is_full(sk)) - goto drop; - - if (sk_acceptq_is_full(sk)) - goto drop; - - req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true); - if (req == NULL) - goto drop; - - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; - - dreq = dccp_rsk(req); - if (dccp_parse_options(sk, dreq, skb)) - goto drop_and_free; - - ireq = inet_rsk(req); - ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; - ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - ireq->ir_rmt_addr = LOOPBACK4_IPV6; - ireq->ir_loc_addr = LOOPBACK4_IPV6; - - ireq->ireq_family = AF_INET6; - ireq->ir_mark = inet_request_mark(sk, skb); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - - if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || - np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || - np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { - refcount_inc(&skb->users); - ireq->pktopts = skb; - } - ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); - - /* So that link locals have meaning */ - if (!ireq->ir_iif && - ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); - - /* - * Step 3: Process LISTEN state - * - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie - * - * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child(). - */ - dreq->dreq_isr = dcb->dccpd_seq; - dreq->dreq_gsr = dreq->dreq_isr; - dreq->dreq_iss = dccp_v6_init_sequence(skb); - dreq->dreq_gss = dreq->dreq_iss; - dreq->dreq_service = service; - - if (dccp_v6_send_response(sk, req)) - goto drop_and_free; - - if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) - reqsk_free(req); - else - reqsk_put(req); - - return 0; - -drop_and_free: - reqsk_free(req); -drop: - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - return -1; -} - -static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req) -{ - struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *newnp; - const struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_txoptions *opt; - struct inet_sock *newinet; - struct dccp6_sock *newdp6; - struct sock *newsk; - - if (skb->protocol == htons(ETH_P_IP)) { - /* - * v6 mapped - */ - newsk = dccp_v4_request_recv_sock(sk, skb, req, dst, - req_unhash, own_req); - if (newsk == NULL) - return NULL; - - newdp6 = (struct dccp6_sock *)newsk; - newinet = inet_sk(newsk); - newinet->pinet6 = &newdp6->inet6; - newnp = inet6_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - - newnp->saddr = newsk->sk_v6_rcv_saddr; - - inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; - newsk->sk_backlog_rcv = dccp_v4_do_rcv; - newnp->pktoptions = NULL; - newnp->opt = NULL; - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - newnp->mcast_oif = inet_iif(skb); - newnp->mcast_hops = ip_hdr(skb)->ttl; - - /* - * No need to charge this sock to the relevant IPv6 refcnt debug socks count - * here, dccp_create_openreq_child now does this for us, see the comment in - * that function for the gory details. -acme - */ - - /* It is tricky place. Until this moment IPv4 tcp - worked with IPv6 icsk.icsk_af_ops. - Sync it now. - */ - dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); - - return newsk; - } - - - if (sk_acceptq_is_full(sk)) - goto out_overflow; - - if (!dst) { - struct flowi6 fl6; - - dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP); - if (!dst) - goto out; - } - - newsk = dccp_create_openreq_child(sk, req, skb); - if (newsk == NULL) - goto out_nonewsk; - - /* - * No need to charge this sock to the relevant IPv6 refcnt debug socks - * count here, dccp_create_openreq_child now does this for us, see the - * comment in that function for the gory details. -acme - */ - - ip6_dst_store(newsk, dst, NULL, NULL); - newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | - NETIF_F_TSO); - newdp6 = (struct dccp6_sock *)newsk; - newinet = inet_sk(newsk); - newinet->pinet6 = &newdp6->inet6; - newnp = inet6_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - - newnp->saddr = ireq->ir_v6_loc_addr; - - /* Now IPv6 options... - - First: no IPv4 options. - */ - newinet->inet_opt = NULL; - - /* Clone RX bits */ - newnp->rxopt.all = np->rxopt.all; - - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - newnp->pktoptions = NULL; - newnp->opt = NULL; - newnp->mcast_oif = inet6_iif(skb); - newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; - - /* - * Clone native IPv6 options from listening socket (if any) - * - * Yes, keeping reference count would be much more clever, but we make - * one more one thing there: reattach optmem to newsk. - */ - opt = ireq->ipv6_opt; - if (!opt) - opt = rcu_dereference(np->opt); - if (opt) { - opt = ipv6_dup_options(newsk, opt); - RCU_INIT_POINTER(newnp->opt, opt); - } - inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (opt) - inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + - opt->opt_flen; - - dccp_sync_mss(newsk, dst_mtu(dst)); - - if (__inet_inherit_port(sk, newsk) < 0) { - inet_csk_prepare_forced_close(newsk); - dccp_done(newsk); - goto out; - } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); - /* Clone pktoptions received with SYN, if we own the req */ - if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); - consume_skb(ireq->pktopts); - ireq->pktopts = NULL; - } - - return newsk; - -out_overflow: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -out_nonewsk: - dst_release(dst); -out: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); - return NULL; -} - -/* The socket must have it's spinlock held when we get - * here. - * - * We have a potential double-lock case here, so even when - * doing backlog processing we use the BH locking scheme. - * This is because we cannot sleep with the original spinlock - * held. - */ -static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff *opt_skb = NULL; - - /* Imagine: socket is IPv6. IPv4 packet arrives, - goes to IPv4 receive handler and backlogged. - From backlog it always goes here. Kerboom... - Fortunately, dccp_rcv_established and rcv_established - handle them correctly, but it is not case with - dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK - */ - - if (skb->protocol == htons(ETH_P_IP)) - return dccp_v4_do_rcv(sk, skb); - - if (sk_filter(sk, skb)) - goto discard; - - /* - * socket locking is here for SMP purposes as backlog rcv is currently - * called with bh processing disabled. - */ - - /* Do Stevens' IPV6_PKTOPTIONS. - - Yes, guys, it is the only place in our code, where we - may make it not affecting IPv4. - The rest of code is protocol independent, - and I do not like idea to uglify IPv4. - - Actually, all the idea behind IPV6_PKTOPTIONS - looks not very well thought. For now we latch - options, received in the last packet, enqueued - by tcp. Feel free to propose better solution. - --ANK (980728) - */ - if (np->rxopt.all && sk->sk_state != DCCP_LISTEN) - opt_skb = skb_clone_and_charge_r(skb, sk); - - if (sk->sk_state == DCCP_OPEN) { /* Fast path */ - if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) - goto reset; - if (opt_skb) - goto ipv6_pktoptions; - return 0; - } - - /* - * Step 3: Process LISTEN state - * If S.state == LISTEN, - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - * Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - * - * NOTE: the check for the packet types is done in - * dccp_rcv_state_process - */ - - if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) - goto reset; - if (opt_skb) - goto ipv6_pktoptions; - return 0; - -reset: - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); -discard: - if (opt_skb != NULL) - __kfree_skb(opt_skb); - kfree_skb(skb); - return 0; - -/* Handling IPV6_PKTOPTIONS skb the similar - * way it's done for net/ipv6/tcp_ipv6.c - */ -ipv6_pktoptions: - if (!((1 << sk->sk_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) { - if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) - WRITE_ONCE(np->mcast_oif, inet6_iif(opt_skb)); - if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) - WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); - if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) - np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); - if (inet6_test_bit(REPFLOW, sk)) - np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); - if (ipv6_opt_accepted(sk, opt_skb, - &DCCP_SKB_CB(opt_skb)->header.h6)) { - memmove(IP6CB(opt_skb), - &DCCP_SKB_CB(opt_skb)->header.h6, - sizeof(struct inet6_skb_parm)); - opt_skb = xchg(&np->pktoptions, opt_skb); - } else { - __kfree_skb(opt_skb); - opt_skb = xchg(&np->pktoptions, NULL); - } - } - - kfree_skb(opt_skb); - return 0; -} - -static int dccp_v6_rcv(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - bool refcounted; - struct sock *sk; - int min_cov; - - /* Step 1: Check header basics */ - - if (dccp_invalid_packet(skb)) - goto discard_it; - - /* Step 1: If header checksum is incorrect, drop packet and return. */ - if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr)) { - DCCP_WARN("dropped packet with invalid checksum\n"); - goto discard_it; - } - - dh = dccp_hdr(skb); - - DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); - DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; - - if (dccp_packet_without_ack(skb)) - DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; - else - DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - -lookup: - sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport, - inet6_iif(skb), 0, &refcounted); - if (!sk) { - dccp_pr_debug("failed to look up flow ID in table and " - "get corresponding socket\n"); - goto no_dccp_socket; - } - - /* - * Step 2: - * ... or S.state == TIMEWAIT, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_TIME_WAIT) { - dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); - inet_twsk_put(inet_twsk(sk)); - goto no_dccp_socket; - } - - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); - struct sock *nsk; - - sk = req->rsk_listener; - if (unlikely(sk->sk_state != DCCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; - } - sock_hold(sk); - refcounted = true; - nsk = dccp_check_req(sk, skb, req); - if (!nsk) { - reqsk_put(req); - goto discard_and_relse; - } - if (nsk == sk) { - reqsk_put(req); - } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - goto discard_and_relse; - } else { - sock_put(sk); - return 0; - } - } - /* - * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage - * o if MinCsCov = 0, only packets with CsCov = 0 are accepted - * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov - */ - min_cov = dccp_sk(sk)->dccps_pcrlen; - if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { - dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", - dh->dccph_cscov, min_cov); - /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */ - goto discard_and_relse; - } - - if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) - goto discard_and_relse; - nf_reset_ct(skb); - - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, - refcounted) ? -1 : 0; - -no_dccp_socket: - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard_it; - /* - * Step 2: - * If no socket ... - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (dh->dccph_type != DCCP_PKT_RESET) { - DCCP_SKB_CB(skb)->dccpd_reset_code = - DCCP_RESET_CODE_NO_CONNECTION; - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - } - -discard_it: - kfree_skb(skb); - return 0; - -discard_and_relse: - if (refcounted) - sock_put(sk); - goto discard_it; -} - -static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) -{ - struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - struct in6_addr *saddr = NULL, *final_p, final; - struct ipv6_txoptions *opt; - struct flowi6 fl6; - struct dst_entry *dst; - int addr_type; - int err; - - dp->dccps_role = DCCP_ROLE_CLIENT; - - if (addr_len < SIN6_LEN_RFC2133) - return -EINVAL; - - if (usin->sin6_family != AF_INET6) - return -EAFNOSUPPORT; - - memset(&fl6, 0, sizeof(fl6)); - - if (inet6_test_bit(SNDFLOW, sk)) { - fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; - IP6_ECN_flow_init(fl6.flowlabel); - if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { - struct ip6_flowlabel *flowlabel; - flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (IS_ERR(flowlabel)) - return -EINVAL; - fl6_sock_release(flowlabel); - } - } - /* - * connect() to INADDR_ANY means loopback (BSD'ism). - */ - if (ipv6_addr_any(&usin->sin6_addr)) - usin->sin6_addr.s6_addr[15] = 1; - - addr_type = ipv6_addr_type(&usin->sin6_addr); - - if (addr_type & IPV6_ADDR_MULTICAST) - return -ENETUNREACH; - - if (addr_type & IPV6_ADDR_LINKLOCAL) { - if (addr_len >= sizeof(struct sockaddr_in6) && - usin->sin6_scope_id) { - /* If interface is set while binding, indices - * must coincide. - */ - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != usin->sin6_scope_id) - return -EINVAL; - - sk->sk_bound_dev_if = usin->sin6_scope_id; - } - - /* Connect to link-local address requires an interface */ - if (!sk->sk_bound_dev_if) - return -EINVAL; - } - - sk->sk_v6_daddr = usin->sin6_addr; - np->flow_label = fl6.flowlabel; - - /* - * DCCP over IPv4 - */ - if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = icsk->icsk_ext_hdr_len; - struct sockaddr_in sin; - - net_dbg_ratelimited("connect: ipv4 mapped\n"); - - if (ipv6_only_sock(sk)) - return -ENETUNREACH; - - sin.sin_family = AF_INET; - sin.sin_port = usin->sin6_port; - sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - - icsk->icsk_af_ops = &dccp_ipv6_mapped; - sk->sk_backlog_rcv = dccp_v4_do_rcv; - - err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); - if (err) { - icsk->icsk_ext_hdr_len = exthdrlen; - icsk->icsk_af_ops = &dccp_ipv6_af_ops; - sk->sk_backlog_rcv = dccp_v6_do_rcv; - goto failure; - } - np->saddr = sk->sk_v6_rcv_saddr; - return err; - } - - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) - saddr = &sk->sk_v6_rcv_saddr; - - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.daddr = sk->sk_v6_daddr; - fl6.saddr = saddr ? *saddr : np->saddr; - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.fl6_dport = usin->sin6_port; - fl6.fl6_sport = inet->inet_sport; - security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - - opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); - final_p = fl6_update_dst(&fl6, opt, &final); - - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - goto failure; - } - - if (saddr == NULL) { - saddr = &fl6.saddr; - - err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); - if (err) - goto failure; - } - - /* set the source address */ - np->saddr = *saddr; - inet->inet_rcv_saddr = LOOPBACK4_IPV6; - - ip6_dst_store(sk, dst, NULL, NULL); - - icsk->icsk_ext_hdr_len = 0; - if (opt) - icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; - - inet->inet_dport = usin->sin6_port; - - dccp_set_state(sk, DCCP_REQUESTING); - err = inet6_hash_connect(&dccp_death_row, sk); - if (err) - goto late_failure; - - dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport); - err = dccp_connect(sk); - if (err) - goto late_failure; - - return 0; - -late_failure: - dccp_set_state(sk, DCCP_CLOSED); - inet_bhash2_reset_saddr(sk); - __sk_dst_reset(sk); -failure: - inet->inet_dport = 0; - sk->sk_route_caps = 0; - return err; -} - -static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { - .queue_xmit = inet6_csk_xmit, - .send_check = dccp_v6_send_check, - .rebuild_header = inet6_sk_rebuild_header, - .conn_request = dccp_v6_conn_request, - .syn_recv_sock = dccp_v6_request_recv_sock, - .net_header_len = sizeof(struct ipv6hdr), - .setsockopt = ipv6_setsockopt, - .getsockopt = ipv6_getsockopt, -}; - -/* - * DCCP over IPv4 via INET6 API - */ -static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { - .queue_xmit = ip_queue_xmit, - .send_check = dccp_v4_send_check, - .rebuild_header = inet_sk_rebuild_header, - .conn_request = dccp_v6_conn_request, - .syn_recv_sock = dccp_v6_request_recv_sock, - .net_header_len = sizeof(struct iphdr), - .setsockopt = ipv6_setsockopt, - .getsockopt = ipv6_getsockopt, -}; - -static void dccp_v6_sk_destruct(struct sock *sk) -{ - dccp_destruct_common(sk); - inet6_sock_destruct(sk); -} - -/* NOTE: A lot of things set to zero explicitly by call to - * sk_alloc() so need not be done here. - */ -static int dccp_v6_init_sock(struct sock *sk) -{ - static __u8 dccp_v6_ctl_sock_initialized; - int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized); - - if (err == 0) { - if (unlikely(!dccp_v6_ctl_sock_initialized)) - dccp_v6_ctl_sock_initialized = 1; - inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; - sk->sk_destruct = dccp_v6_sk_destruct; - } - - return err; -} - -static struct timewait_sock_ops dccp6_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct dccp6_timewait_sock), -}; - -static struct proto dccp_v6_prot = { - .name = "DCCPv6", - .owner = THIS_MODULE, - .close = dccp_close, - .connect = dccp_v6_connect, - .disconnect = dccp_disconnect, - .ioctl = dccp_ioctl, - .init = dccp_v6_init_sock, - .setsockopt = dccp_setsockopt, - .getsockopt = dccp_getsockopt, - .sendmsg = dccp_sendmsg, - .recvmsg = dccp_recvmsg, - .backlog_rcv = dccp_v6_do_rcv, - .hash = inet6_hash, - .unhash = inet_unhash, - .accept = inet_csk_accept, - .get_port = inet_csk_get_port, - .shutdown = dccp_shutdown, - .destroy = dccp_destroy_sock, - .orphan_count = &dccp_orphan_count, - .max_header = MAX_DCCP_HEADER, - .obj_size = sizeof(struct dccp6_sock), - .ipv6_pinfo_offset = offsetof(struct dccp6_sock, inet6), - .slab_flags = SLAB_TYPESAFE_BY_RCU, - .rsk_prot = &dccp6_request_sock_ops, - .twsk_prot = &dccp6_timewait_sock_ops, - .h.hashinfo = &dccp_hashinfo, -}; - -static const struct inet6_protocol dccp_v6_protocol = { - .handler = dccp_v6_rcv, - .err_handler = dccp_v6_err, - .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, -}; - -static const struct proto_ops inet6_dccp_ops = { - .family = PF_INET6, - .owner = THIS_MODULE, - .release = inet6_release, - .bind = inet6_bind, - .connect = inet_stream_connect, - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet6_getname, - .poll = dccp_poll, - .ioctl = inet6_ioctl, - .gettstamp = sock_gettstamp, - .listen = inet_dccp_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, -#ifdef CONFIG_COMPAT - .compat_ioctl = inet6_compat_ioctl, -#endif -}; - -static struct inet_protosw dccp_v6_protosw = { - .type = SOCK_DCCP, - .protocol = IPPROTO_DCCP, - .prot = &dccp_v6_prot, - .ops = &inet6_dccp_ops, - .flags = INET_PROTOSW_ICSK, -}; - -static int __net_init dccp_v6_init_net(struct net *net) -{ - struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); - - if (dccp_hashinfo.bhash == NULL) - return -ESOCKTNOSUPPORT; - - return inet_ctl_sock_create(&pn->v6_ctl_sk, PF_INET6, - SOCK_DCCP, IPPROTO_DCCP, net); -} - -static void __net_exit dccp_v6_exit_net(struct net *net) -{ - struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); - - inet_ctl_sock_destroy(pn->v6_ctl_sk); -} - -static struct pernet_operations dccp_v6_ops = { - .init = dccp_v6_init_net, - .exit = dccp_v6_exit_net, - .id = &dccp_v6_pernet_id, - .size = sizeof(struct dccp_v6_pernet), -}; - -static int __init dccp_v6_init(void) -{ - int err = proto_register(&dccp_v6_prot, 1); - - if (err) - goto out; - - inet6_register_protosw(&dccp_v6_protosw); - - err = register_pernet_subsys(&dccp_v6_ops); - if (err) - goto out_destroy_ctl_sock; - - err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP); - if (err) - goto out_unregister_proto; - -out: - return err; -out_unregister_proto: - unregister_pernet_subsys(&dccp_v6_ops); -out_destroy_ctl_sock: - inet6_unregister_protosw(&dccp_v6_protosw); - proto_unregister(&dccp_v6_prot); - goto out; -} - -static void __exit dccp_v6_exit(void) -{ - inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); - unregister_pernet_subsys(&dccp_v6_ops); - inet6_unregister_protosw(&dccp_v6_protosw); - proto_unregister(&dccp_v6_prot); -} - -module_init(dccp_v6_init); -module_exit(dccp_v6_exit); - -/* - * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) - * values directly, Also cover the case where the protocol is not specified, - * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP - */ -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo "); -MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h deleted file mode 100644 index c5d14c48def17..0000000000000 --- a/net/dccp/ipv6.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_IPV6_H -#define _DCCP_IPV6_H -/* - * net/dccp/ipv6.h - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#include -#include - -struct dccp6_sock { - struct dccp_sock dccp; - struct ipv6_pinfo inet6; -}; - -struct dccp6_request_sock { - struct dccp_request_sock dccp; -}; - -struct dccp6_timewait_sock { - struct inet_timewait_sock inet; -}; - -#endif /* _DCCP_IPV6_H */ diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c deleted file mode 100644 index fecc8190064f8..0000000000000 --- a/net/dccp/minisocks.c +++ /dev/null @@ -1,266 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/minisocks.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -struct inet_timewait_death_row dccp_death_row = { - .tw_refcount = REFCOUNT_INIT(1), - .sysctl_max_tw_buckets = NR_FILE * 2, - .hashinfo = &dccp_hashinfo, -}; - -EXPORT_SYMBOL_GPL(dccp_death_row); - -void dccp_time_wait(struct sock *sk, int state, int timeo) -{ - struct inet_timewait_sock *tw; - - tw = inet_twsk_alloc(sk, &dccp_death_row, state); - - if (tw != NULL) { - const struct inet_connection_sock *icsk = inet_csk(sk); - const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); -#if IS_ENABLED(CONFIG_IPV6) - if (tw->tw_family == PF_INET6) { - tw->tw_v6_daddr = sk->sk_v6_daddr; - tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - tw->tw_ipv6only = sk->sk_ipv6only; - } -#endif - - /* Get the TIME_WAIT timeout firing. */ - if (timeo < rto) - timeo = rto; - - if (state == DCCP_TIME_WAIT) - timeo = DCCP_TIMEWAIT_LEN; - - /* Linkage updates. - * Note that access to tw after this point is illegal. - */ - inet_twsk_hashdance_schedule(tw, sk, &dccp_hashinfo, timeo); - } else { - /* Sorry, if we're out of memory, just CLOSE this - * socket up. We've got bigger problems than - * non-graceful socket closings. - */ - DCCP_WARN("time wait bucket table overflow\n"); - } - - dccp_done(sk); -} - -struct sock *dccp_create_openreq_child(const struct sock *sk, - const struct request_sock *req, - const struct sk_buff *skb) -{ - /* - * Step 3: Process LISTEN state - * - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - */ - struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); - - if (newsk != NULL) { - struct dccp_request_sock *dreq = dccp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct dccp_sock *newdp = dccp_sk(newsk); - - newdp->dccps_role = DCCP_ROLE_SERVER; - newdp->dccps_hc_rx_ackvec = NULL; - newdp->dccps_service_list = NULL; - newdp->dccps_hc_rx_ccid = NULL; - newdp->dccps_hc_tx_ccid = NULL; - newdp->dccps_service = dreq->dreq_service; - newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo; - newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; - newicsk->icsk_rto = DCCP_TIMEOUT_INIT; - - INIT_LIST_HEAD(&newdp->dccps_featneg); - /* - * Step 3: Process LISTEN state - * - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR from packet (or Init Cookies) - * - * Setting AWL/AWH and SWL/SWH happens as part of the feature - * activation below, as these windows all depend on the local - * and remote Sequence Window feature values (7.5.2). - */ - newdp->dccps_iss = dreq->dreq_iss; - newdp->dccps_gss = dreq->dreq_gss; - newdp->dccps_gar = newdp->dccps_iss; - newdp->dccps_isr = dreq->dreq_isr; - newdp->dccps_gsr = dreq->dreq_gsr; - - /* - * Activate features: initialise CCIDs, sequence windows etc. - */ - if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - sk_free_unlock_clone(newsk); - return NULL; - } - dccp_init_xmit_timers(newsk); - - __DCCP_INC_STATS(DCCP_MIB_PASSIVEOPENS); - } - return newsk; -} - -EXPORT_SYMBOL_GPL(dccp_create_openreq_child); - -/* - * Process an incoming packet for RESPOND sockets represented - * as an request_sock. - */ -struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req) -{ - struct sock *child = NULL; - struct dccp_request_sock *dreq = dccp_rsk(req); - bool own_req; - - /* TCP/DCCP listeners became lockless. - * DCCP stores complex state in its request_sock, so we need - * a protection for them, now this code runs without being protected - * by the parent (listener) lock. - */ - spin_lock_bh(&dreq->dreq_lock); - - /* Check for retransmitted REQUEST */ - if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { - - if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) { - dccp_pr_debug("Retransmitted REQUEST\n"); - dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq; - /* - * Send another RESPONSE packet - * To protect against Request floods, increment retrans - * counter (backoff, monitored by dccp_response_timer). - */ - inet_rtx_syn_ack(sk, req); - } - /* Network Duplicate, discard packet */ - goto out; - } - - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; - - if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK && - dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK) - goto drop; - - /* Invalid ACK */ - if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, - dreq->dreq_iss, dreq->dreq_gss)) { - dccp_pr_debug("Invalid ACK number: ack_seq=%llu, " - "dreq_iss=%llu, dreq_gss=%llu\n", - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long) dreq->dreq_iss, - (unsigned long long) dreq->dreq_gss); - goto drop; - } - - if (dccp_parse_options(sk, dreq, skb)) - goto drop; - - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, - req, &own_req); - if (child) { - child = inet_csk_complete_hashdance(sk, child, req, own_req); - goto out; - } - - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; -drop: - if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) - req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - - inet_csk_reqsk_queue_drop(sk, req); -out: - spin_unlock_bh(&dreq->dreq_lock); - return child; -} - -EXPORT_SYMBOL_GPL(dccp_check_req); - -/* - * Queue segment on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket. - */ -int dccp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb) - __releases(child) -{ - int ret = 0; - const int state = child->sk_state; - - if (!sock_owned_by_user(child)) { - ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb), - skb->len); - - /* Wakeup parent, send SIGIO */ - if (state == DCCP_RESPOND && child->sk_state != state) - parent->sk_data_ready(parent); - } else { - /* Alas, it is possible again, because we do lookup - * in main socket hash table and lock on listening - * socket does not protect us more. - */ - __sk_add_backlog(child, skb); - } - - bh_unlock_sock(child); - sock_put(child); - return ret; -} - -EXPORT_SYMBOL_GPL(dccp_child_process); - -void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *rsk) -{ - DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state"); -} - -EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); - -int dccp_reqsk_init(struct request_sock *req, - struct dccp_sock const *dp, struct sk_buff const *skb) -{ - struct dccp_request_sock *dreq = dccp_rsk(req); - - spin_lock_init(&dreq->dreq_lock); - inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport; - inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport); - inet_rsk(req)->acked = 0; - dreq->dreq_timestamp_echo = 0; - - /* inherit feature negotiation options from listening socket */ - return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); -} - -EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c deleted file mode 100644 index db62d47670249..0000000000000 --- a/net/dccp/options.c +++ /dev/null @@ -1,609 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/options.c - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Aristeu Sergio Rozanski Filho - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2005 Ian McDonald - */ -#include -#include -#include -#include -#include -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -u64 dccp_decode_value_var(const u8 *bf, const u8 len) -{ - u64 value = 0; - - if (len >= DCCP_OPTVAL_MAXLEN) - value += ((u64)*bf++) << 40; - if (len > 4) - value += ((u64)*bf++) << 32; - if (len > 3) - value += ((u64)*bf++) << 24; - if (len > 2) - value += ((u64)*bf++) << 16; - if (len > 1) - value += ((u64)*bf++) << 8; - if (len > 0) - value += *bf; - - return value; -} - -/** - * dccp_parse_options - Parse DCCP options present in @skb - * @sk: client|server|listening dccp socket (when @dreq != NULL) - * @dreq: request socket to use during connection setup, or NULL - * @skb: frame to parse - */ -int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_hdr *dh = dccp_hdr(skb); - const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; - unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); - unsigned char *opt_ptr = options; - const unsigned char *opt_end = (unsigned char *)dh + - (dh->dccph_doff * 4); - struct dccp_options_received *opt_recv = &dp->dccps_options_received; - unsigned char opt, len; - unsigned char *value; - u32 elapsed_time; - __be32 opt_val; - int rc; - int mandatory = 0; - - memset(opt_recv, 0, sizeof(*opt_recv)); - - opt = len = 0; - while (opt_ptr != opt_end) { - opt = *opt_ptr++; - len = 0; - value = NULL; - - /* Check if this isn't a single byte option */ - if (opt > DCCPO_MAX_RESERVED) { - if (opt_ptr == opt_end) - goto out_nonsensical_length; - - len = *opt_ptr++; - if (len < 2) - goto out_nonsensical_length; - /* - * Remove the type and len fields, leaving - * just the value size - */ - len -= 2; - value = opt_ptr; - opt_ptr += len; - - if (opt_ptr > opt_end) - goto out_nonsensical_length; - } - - /* - * CCID-specific options are ignored during connection setup, as - * negotiation may still be in progress (see RFC 4340, 10.3). - * The same applies to Ack Vectors, as these depend on the CCID. - */ - if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || - opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) - goto ignore_option; - - switch (opt) { - case DCCPO_PADDING: - break; - case DCCPO_MANDATORY: - if (mandatory) - goto out_invalid_option; - if (pkt_type != DCCP_PKT_DATA) - mandatory = 1; - break; - case DCCPO_NDP_COUNT: - if (len > 6) - goto out_invalid_option; - - opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); - dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), - (unsigned long long)opt_recv->dccpor_ndp); - break; - case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: - if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ - break; - if (len == 0) - goto out_invalid_option; - rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, - *value, value + 1, len - 1); - if (rc) - goto out_featneg_failed; - break; - case DCCPO_TIMESTAMP: - if (len != 4) - goto out_invalid_option; - /* - * RFC 4340 13.1: "The precise time corresponding to - * Timestamp Value zero is not specified". We use - * zero to indicate absence of a meaningful timestamp. - */ - opt_val = get_unaligned((__be32 *)value); - if (unlikely(opt_val == 0)) { - DCCP_WARN("Timestamp with zero value\n"); - break; - } - - if (dreq != NULL) { - dreq->dreq_timestamp_echo = ntohl(opt_val); - dreq->dreq_timestamp_time = dccp_timestamp(); - } else { - opt_recv->dccpor_timestamp = - dp->dccps_timestamp_echo = ntohl(opt_val); - dp->dccps_timestamp_time = dccp_timestamp(); - } - dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n", - dccp_role(sk), ntohl(opt_val), - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - /* schedule an Ack in case this sender is quiescent */ - inet_csk_schedule_ack(sk); - break; - case DCCPO_TIMESTAMP_ECHO: - if (len != 4 && len != 6 && len != 8) - goto out_invalid_option; - - opt_val = get_unaligned((__be32 *)value); - opt_recv->dccpor_timestamp_echo = ntohl(opt_val); - - dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, " - "ackno=%llu", dccp_role(sk), - opt_recv->dccpor_timestamp_echo, - len + 2, - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - - value += 4; - - if (len == 4) { /* no elapsed time included */ - dccp_pr_debug_cat("\n"); - break; - } - - if (len == 6) { /* 2-byte elapsed time */ - __be16 opt_val2 = get_unaligned((__be16 *)value); - elapsed_time = ntohs(opt_val2); - } else { /* 4-byte elapsed time */ - opt_val = get_unaligned((__be32 *)value); - elapsed_time = ntohl(opt_val); - } - - dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time); - - /* Give precedence to the biggest ELAPSED_TIME */ - if (elapsed_time > opt_recv->dccpor_elapsed_time) - opt_recv->dccpor_elapsed_time = elapsed_time; - break; - case DCCPO_ELAPSED_TIME: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */ - break; - - if (len == 2) { - __be16 opt_val2 = get_unaligned((__be16 *)value); - elapsed_time = ntohs(opt_val2); - } else if (len == 4) { - opt_val = get_unaligned((__be32 *)value); - elapsed_time = ntohl(opt_val); - } else { - goto out_invalid_option; - } - - if (elapsed_time > opt_recv->dccpor_elapsed_time) - opt_recv->dccpor_elapsed_time = elapsed_time; - - dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", - dccp_role(sk), elapsed_time); - break; - case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: - if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - pkt_type, opt, value, len)) - goto out_invalid_option; - break; - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ - break; - /* - * Ack vectors are processed by the TX CCID if it is - * interested. The RX CCID need not parse Ack Vectors, - * since it is only interested in clearing old state. - */ - fallthrough; - case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: - if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - pkt_type, opt, value, len)) - goto out_invalid_option; - break; - default: - DCCP_CRIT("DCCP(%p): option %d(len=%d) not " - "implemented, ignoring", sk, opt, len); - break; - } -ignore_option: - if (opt != DCCPO_MANDATORY) - mandatory = 0; - } - - /* mandatory was the last byte in option list -> reset connection */ - if (mandatory) - goto out_invalid_option; - -out_nonsensical_length: - /* RFC 4340, 5.8: ignore option and all remaining option space */ - return 0; - -out_invalid_option: - DCCP_INC_STATS(DCCP_MIB_INVALIDOPT); - rc = DCCP_RESET_CODE_OPTION_ERROR; -out_featneg_failed: - DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); - DCCP_SKB_CB(skb)->dccpd_reset_code = rc; - DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; - DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; - DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; - return -1; -} - -EXPORT_SYMBOL_GPL(dccp_parse_options); - -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) -{ - if (len >= DCCP_OPTVAL_MAXLEN) - *to++ = (value & 0xFF0000000000ull) >> 40; - if (len > 4) - *to++ = (value & 0xFF00000000ull) >> 32; - if (len > 3) - *to++ = (value & 0xFF000000) >> 24; - if (len > 2) - *to++ = (value & 0xFF0000) >> 16; - if (len > 1) - *to++ = (value & 0xFF00) >> 8; - if (len > 0) - *to++ = (value & 0xFF); -} - -static inline u8 dccp_ndp_len(const u64 ndp) -{ - if (likely(ndp <= 0xFF)) - return 1; - return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6); -} - -int dccp_insert_option(struct sk_buff *skb, const unsigned char option, - const void *value, const unsigned char len) -{ - unsigned char *to; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; - - to = skb_push(skb, len + 2); - *to++ = option; - *to++ = len + 2; - - memcpy(to, value, len); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_insert_option); - -static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - u64 ndp = dp->dccps_ndp_count; - - if (dccp_non_data_packet(skb)) - ++dp->dccps_ndp_count; - else - dp->dccps_ndp_count = 0; - - if (ndp > 0) { - unsigned char *ptr; - const int ndp_len = dccp_ndp_len(ndp); - const int len = ndp_len + 2; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - ptr = skb_push(skb, len); - *ptr++ = DCCPO_NDP_COUNT; - *ptr++ = len; - dccp_encode_value_var(ndp, ptr, ndp_len); - } - - return 0; -} - -static inline int dccp_elapsed_time_len(const u32 elapsed_time) -{ - return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4; -} - -static int dccp_insert_option_timestamp(struct sk_buff *skb) -{ - __be32 now = htonl(dccp_timestamp()); - /* yes this will overflow but that is the point as we want a - * 10 usec 32 bit timer which mean it wraps every 11.9 hours */ - - return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now)); -} - -static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, - struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - __be32 tstamp_echo; - unsigned char *to; - u32 elapsed_time, elapsed_time_len, len; - - if (dreq != NULL) { - elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time; - tstamp_echo = htonl(dreq->dreq_timestamp_echo); - dreq->dreq_timestamp_echo = 0; - } else { - elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time; - tstamp_echo = htonl(dp->dccps_timestamp_echo); - dp->dccps_timestamp_echo = 0; - } - - elapsed_time_len = dccp_elapsed_time_len(elapsed_time); - len = 6 + elapsed_time_len; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - to = skb_push(skb, len); - *to++ = DCCPO_TIMESTAMP_ECHO; - *to++ = len; - - memcpy(to, &tstamp_echo, 4); - to += 4; - - if (elapsed_time_len == 2) { - const __be16 var16 = htons((u16)elapsed_time); - memcpy(to, &var16, 2); - } else if (elapsed_time_len == 4) { - const __be32 var32 = htonl(elapsed_time); - memcpy(to, &var32, 4); - } - - return 0; -} - -static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const u16 buflen = dccp_ackvec_buflen(av); - /* Figure out how many options do we need to represent the ackvec */ - const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); - u16 len = buflen + 2 * nr_opts; - u8 i, nonce = 0; - const unsigned char *tail, *from; - unsigned char *to; - - if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, - dccp_packet_name(dcb->dccpd_type)); - return -1; - } - /* - * Since Ack Vectors are variable-length, we can not always predict - * their size. To catch exception cases where the space is running out - * on the skb, a separate Sync is scheduled to carry the Ack Vector. - */ - if (len > DCCPAV_MIN_OPTLEN && - len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { - DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " - "MPS=%u ==> reduce payload size?\n", len, skb->len, - dcb->dccpd_opt_len, dp->dccps_mss_cache); - dp->dccps_sync_scheduled = 1; - return 0; - } - dcb->dccpd_opt_len += len; - - to = skb_push(skb, len); - len = buflen; - from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; - - for (i = 0; i < nr_opts; ++i) { - int copylen = len; - - if (len > DCCP_SINGLE_OPT_MAXLEN) - copylen = DCCP_SINGLE_OPT_MAXLEN; - - /* - * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via - * its type; ack_nonce is the sum of all individual buf_nonce's. - */ - nonce ^= av->av_buf_nonce[i]; - - *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; - *to++ = copylen + 2; - - /* Check if buf_head wraps */ - if (from + copylen > tail) { - const u16 tailsize = tail - from; - - memcpy(to, from, tailsize); - to += tailsize; - len -= tailsize; - copylen -= tailsize; - from = av->av_buf; - } - - memcpy(to, from, copylen); - from += copylen; - to += copylen; - len -= copylen; - } - /* - * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. - */ - if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) - return -ENOBUFS; - return 0; -} - -/** - * dccp_insert_option_mandatory - Mandatory option (5.8.2) - * @skb: frame into which to insert option - * - * Note that since we are using skb_push, this function needs to be called - * _after_ inserting the option it is supposed to influence (stack order). - */ -int dccp_insert_option_mandatory(struct sk_buff *skb) -{ - if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len++; - *(u8 *)skb_push(skb, 1) = DCCPO_MANDATORY; - return 0; -} - -/** - * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb - * @skb: frame to insert feature negotiation option into - * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R - * @feat: one out of %dccp_feature_numbers - * @val: NN value or SP array (preferred element first) to copy - * @len: true length of @val in bytes (excluding first element repetition) - * @repeat_first: whether to copy the first element of @val twice - * - * The last argument is used to construct Confirm options, where the preferred - * value and the preference list appear separately (RFC 4340, 6.3.1). Preference - * lists are kept such that the preferred entry is always first, so we only need - * to copy twice, and avoid the overhead of cloning into a bigger array. - */ -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first) -{ - u8 tot_len, *to; - - /* take the `Feature' field and possible repetition into account */ - if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { - DCCP_WARN("length %u for feature %u too large\n", len, feat); - return -1; - } - - if (unlikely(val == NULL || len == 0)) - len = repeat_first = false; - tot_len = 3 + repeat_first + len; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("packet too small for feature %d option!\n", feat); - return -1; - } - DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; - - to = skb_push(skb, tot_len); - *to++ = type; - *to++ = tot_len; - *to++ = feat; - - if (repeat_first) - *to++ = *val; - if (len) - memcpy(to, val, len); - return 0; -} - -/* The length of all options needs to be a multiple of 4 (5.8) */ -static void dccp_insert_option_padding(struct sk_buff *skb) -{ - int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; - - if (padding != 0) { - padding = 4 - padding; - memset(skb_push(skb, padding), 0, padding); - DCCP_SKB_CB(skb)->dccpd_opt_len += padding; - } -} - -int dccp_insert_options(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - - DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - - if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) - return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { - - /* Feature Negotiation */ - if (dccp_feat_insert_opts(dp, NULL, skb)) - return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { - /* - * Obtain RTT sample from Request/Response exchange. - * This is currently used for TFRC initialisation. - */ - if (dccp_insert_option_timestamp(skb)) - return -1; - - } else if (dccp_ackvec_pending(sk) && - dccp_insert_option_ackvec(sk, skb)) { - return -1; - } - } - - if (dp->dccps_hc_rx_insert_options) { - if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb)) - return -1; - dp->dccps_hc_rx_insert_options = 0; - } - - if (dp->dccps_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(dp, NULL, skb)) - return -1; - - dccp_insert_option_padding(skb); - return 0; -} - -int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) -{ - DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - - if (dccp_feat_insert_opts(NULL, dreq, skb)) - return -1; - - /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */ - if (dccp_insert_option_timestamp(skb)) - return -1; - - if (dreq->dreq_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(NULL, dreq, skb)) - return -1; - - dccp_insert_option_padding(skb); - return 0; -} diff --git a/net/dccp/output.c b/net/dccp/output.c deleted file mode 100644 index 39cf3430177ac..0000000000000 --- a/net/dccp/output.c +++ /dev/null @@ -1,708 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/output.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include - -#include -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" - -static inline void dccp_event_ack_sent(struct sock *sk) -{ - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); -} - -/* enqueue @skb on sk_send_head for retransmission, return clone to send now */ -static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) -{ - skb_set_owner_w(skb, sk); - WARN_ON(sk->sk_send_head); - sk->sk_send_head = skb; - return skb_clone(sk->sk_send_head, gfp_any()); -} - -/* - * All SKB's seen here are completely headerless. It is our - * job to build the DCCP header, and pass the packet down to - * IP so it can do the same plus pass the packet off to the - * device. - */ -static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) -{ - if (likely(skb != NULL)) { - struct inet_sock *inet = inet_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - struct dccp_hdr *dh; - /* XXX For now we're using only 48 bits sequence numbers */ - const u32 dccp_header_size = sizeof(*dh) + - sizeof(struct dccp_hdr_ext) + - dccp_packet_hdr_len(dcb->dccpd_type); - int err, set_ack = 1; - u64 ackno = dp->dccps_gsr; - /* - * Increment GSS here already in case the option code needs it. - * Update GSS for real only if option processing below succeeds. - */ - dcb->dccpd_seq = ADD48(dp->dccps_gss, 1); - - switch (dcb->dccpd_type) { - case DCCP_PKT_DATA: - set_ack = 0; - fallthrough; - case DCCP_PKT_DATAACK: - case DCCP_PKT_RESET: - break; - - case DCCP_PKT_REQUEST: - set_ack = 0; - /* Use ISS on the first (non-retransmitted) Request. */ - if (icsk->icsk_retransmits == 0) - dcb->dccpd_seq = dp->dccps_iss; - fallthrough; - - case DCCP_PKT_SYNC: - case DCCP_PKT_SYNCACK: - ackno = dcb->dccpd_ack_seq; - fallthrough; - default: - /* - * Set owner/destructor: some skbs are allocated via - * alloc_skb (e.g. when retransmission may happen). - * Only Data, DataAck, and Reset packets should come - * through here with skb->sk set. - */ - WARN_ON(skb->sk); - skb_set_owner_w(skb, sk); - break; - } - - if (dccp_insert_options(sk, skb)) { - kfree_skb(skb); - return -EPROTO; - } - - - /* Build DCCP header and checksum it. */ - dh = dccp_zeroed_hdr(skb, dccp_header_size); - dh->dccph_type = dcb->dccpd_type; - dh->dccph_sport = inet->inet_sport; - dh->dccph_dport = inet->inet_dport; - dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4; - dh->dccph_ccval = dcb->dccpd_ccval; - dh->dccph_cscov = dp->dccps_pcslen; - /* XXX For now we're using only 48 bits sequence numbers */ - dh->dccph_x = 1; - - dccp_update_gss(sk, dcb->dccpd_seq); - dccp_hdr_set_seq(dh, dp->dccps_gss); - if (set_ack) - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno); - - switch (dcb->dccpd_type) { - case DCCP_PKT_REQUEST: - dccp_hdr_request(skb)->dccph_req_service = - dp->dccps_service; - /* - * Limit Ack window to ISS <= P.ackno <= GSS, so that - * only Responses to Requests we sent are considered. - */ - dp->dccps_awl = dp->dccps_iss; - break; - case DCCP_PKT_RESET: - dccp_hdr_reset(skb)->dccph_reset_code = - dcb->dccpd_reset_code; - break; - } - - icsk->icsk_af_ops->send_check(sk, skb); - - if (set_ack) - dccp_event_ack_sent(sk); - - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); - return net_xmit_eval(err); - } - return -ENOBUFS; -} - -/** - * dccp_determine_ccmps - Find out about CCID-specific packet-size limits - * @dp: socket to find packet size limits of - * - * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), - * since the RX CCID is restricted to feedback packets (Acks), which are small - * in comparison with the data traffic. A value of 0 means "no current CCMPS". - */ -static u32 dccp_determine_ccmps(const struct dccp_sock *dp) -{ - const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid; - - if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL) - return 0; - return tx_ccid->ccid_ops->ccid_ccmps; -} - -unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - u32 ccmps = dccp_determine_ccmps(dp); - u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; - - /* Account for header lengths and IPv4/v6 option overhead */ - cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + - sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); - - /* - * Leave enough headroom for common DCCP header options. - * This only considers options which may appear on DCCP-Data packets, as - * per table 3 in RFC 4340, 5.8. When running out of space for other - * options (eg. Ack Vector which can take up to 255 bytes), it is better - * to schedule a separate Ack. Thus we leave headroom for the following: - * - 1 byte for Slow Receiver (11.6) - * - 6 bytes for Timestamp (13.1) - * - 10 bytes for Timestamp Echo (13.3) - * - 8 bytes for NDP count (7.7, when activated) - * - 6 bytes for Data Checksum (9.3) - * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) - */ - cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + - (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); - - /* And store cached results */ - icsk->icsk_pmtu_cookie = pmtu; - WRITE_ONCE(dp->dccps_mss_cache, cur_mps); - - return cur_mps; -} - -EXPORT_SYMBOL_GPL(dccp_sync_mss); - -void dccp_write_space(struct sock *sk) -{ - struct socket_wq *wq; - - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible(&wq->wait); - /* Should agree with poll, otherwise some programs break */ - if (sock_writeable(sk)) - sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); - - rcu_read_unlock(); -} - -/** - * dccp_wait_for_ccid - Await CCID send permission - * @sk: socket to wait for - * @delay: timeout in jiffies - * - * This is used by CCIDs which need to delay the send time in process context. - */ -static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) -{ - DEFINE_WAIT(wait); - long remaining; - - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - sk->sk_write_pending++; - release_sock(sk); - - remaining = schedule_timeout(delay); - - lock_sock(sk); - sk->sk_write_pending--; - finish_wait(sk_sleep(sk), &wait); - - if (signal_pending(current) || sk->sk_err) - return -1; - return remaining; -} - -/** - * dccp_xmit_packet - Send data packet under control of CCID - * @sk: socket to send data packet on - * - * Transmits next-queued payload and informs CCID to account for the packet. - */ -static void dccp_xmit_packet(struct sock *sk) -{ - int err, len; - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = dccp_qpolicy_pop(sk); - - if (unlikely(skb == NULL)) - return; - len = skb->len; - - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } - - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; - } - - err = dccp_transmit_skb(sk, skb); - if (err) - dccp_pr_debug("transmit_skb() returned err=%d\n", err); - /* - * Register this one as sent even if an error occurred. To the remote - * end a local packet drop is indistinguishable from network loss, i.e. - * any local drop will eventually be reported via receiver feedback. - */ - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - - /* - * If the CCID needs to transfer additional header options out-of-band - * (e.g. Ack Vectors or feature-negotiation options), it activates this - * flag to schedule a Sync. The Sync will automatically incorporate all - * currently pending header options, thus clearing the backlog. - */ - if (dp->dccps_sync_scheduled) - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); -} - -/** - * dccp_flush_write_queue - Drain queue at end of connection - * @sk: socket to be drained - * @time_budget: time allowed to drain the queue - * - * Since dccp_sendmsg queues packets without waiting for them to be sent, it may - * happen that the TX queue is not empty at the end of a connection. We give the - * HC-sender CCID a grace period of up to @time_budget jiffies. If this function - * returns with a non-empty write queue, it will be purged later. - */ -void dccp_flush_write_queue(struct sock *sk, long *time_budget) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - long delay, rc; - - while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - /* - * If the CCID determines when to send, the next sending - * time is unknown or the CCID may not even send again - * (e.g. remote host crashes or lost Ack packets). - */ - DCCP_WARN("CCID did not manage to send all packets\n"); - return; - case CCID_PACKET_DELAY: - delay = msecs_to_jiffies(rc); - if (delay > *time_budget) - return; - rc = dccp_wait_for_ccid(sk, delay); - if (rc < 0) - return; - *time_budget -= (delay - rc); - /* check again if we can send now */ - break; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); - dccp_pr_debug("packet discarded due to err=%ld\n", rc); - } - } -} - -void dccp_write_xmit(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - - while ((skb = dccp_qpolicy_top(sk))) { - int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - return; - case CCID_PACKET_DELAY: - sk_reset_timer(sk, &dp->dccps_xmit_timer, - jiffies + msecs_to_jiffies(rc)); - return; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - dccp_qpolicy_drop(sk, skb); - dccp_pr_debug("packet discarded due to err=%d\n", rc); - } - } -} - -/** - * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets - * @sk: socket to perform retransmit on - * - * There are only four retransmittable packet types in DCCP: - * - Request in client-REQUEST state (sec. 8.1.1), - * - CloseReq in server-CLOSEREQ state (sec. 8.3), - * - Close in node-CLOSING state (sec. 8.3), - * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()). - * This function expects sk->sk_send_head to contain the original skb. - */ -int dccp_retransmit_skb(struct sock *sk) -{ - WARN_ON(sk->sk_send_head == NULL); - - if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) - return -EHOSTUNREACH; /* Routing failure or similar. */ - - /* this count is used to distinguish original and retransmitted skb */ - inet_csk(sk)->icsk_retransmits++; - - return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC)); -} - -struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, - struct request_sock *req) -{ - struct dccp_hdr *dh; - struct dccp_request_sock *dreq; - const u32 dccp_header_size = sizeof(struct dccp_hdr) + - sizeof(struct dccp_hdr_ext) + - sizeof(struct dccp_hdr_response); - struct sk_buff *skb; - - /* sk is marked const to clearly express we dont hold socket lock. - * sock_wmalloc() will atomically change sk->sk_wmem_alloc, - * it is safe to promote sk to non const. - */ - skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1, - GFP_ATOMIC); - if (!skb) - return NULL; - - skb_reserve(skb, MAX_DCCP_HEADER); - - skb_dst_set(skb, dst_clone(dst)); - - dreq = dccp_rsk(req); - if (inet_rsk(req)->acked) /* increase GSS upon retransmission */ - dccp_inc_seqno(&dreq->dreq_gss); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; - DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss; - - /* Resolve feature dependencies resulting from choice of CCID */ - if (dccp_feat_server_ccid_dependencies(dreq)) - goto response_failed; - - if (dccp_insert_options_rsk(dreq, skb)) - goto response_failed; - - /* Build and checksum header */ - dh = dccp_zeroed_hdr(skb, dccp_header_size); - - dh->dccph_sport = htons(inet_rsk(req)->ir_num); - dh->dccph_dport = inet_rsk(req)->ir_rmt_port; - dh->dccph_doff = (dccp_header_size + - DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; - dh->dccph_type = DCCP_PKT_RESPONSE; - dh->dccph_x = 1; - dccp_hdr_set_seq(dh, dreq->dreq_gss); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr); - dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service; - - dccp_csum_outgoing(skb); - - /* We use `acked' to remember that a Response was already sent. */ - inet_rsk(req)->acked = 1; - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - return skb; -response_failed: - kfree_skb(skb); - return NULL; -} - -EXPORT_SYMBOL_GPL(dccp_make_response); - -/* answer offending packet in @rcv_skb with Reset from control socket @ctl */ -struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb) -{ - struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb); - const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) + - sizeof(struct dccp_hdr_ext) + - sizeof(struct dccp_hdr_reset); - struct dccp_hdr_reset *dhr; - struct sk_buff *skb; - - skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); - if (skb == NULL) - return NULL; - - skb_reserve(skb, sk->sk_prot->max_header); - - /* Swap the send and the receive. */ - dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len); - dh->dccph_type = DCCP_PKT_RESET; - dh->dccph_sport = rxdh->dccph_dport; - dh->dccph_dport = rxdh->dccph_sport; - dh->dccph_doff = dccp_hdr_reset_len / 4; - dh->dccph_x = 1; - - dhr = dccp_hdr_reset(skb); - dhr->dccph_reset_code = dcb->dccpd_reset_code; - - switch (dcb->dccpd_reset_code) { - case DCCP_RESET_CODE_PACKET_ERROR: - dhr->dccph_reset_data[0] = rxdh->dccph_type; - break; - case DCCP_RESET_CODE_OPTION_ERROR: - case DCCP_RESET_CODE_MANDATORY_ERROR: - memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3); - break; - } - /* - * From RFC 4340, 8.3.1: - * If P.ackno exists, set R.seqno := P.ackno + 1. - * Else set R.seqno := 0. - */ - if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1)); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq); - - dccp_csum_outgoing(skb); - return skb; -} - -EXPORT_SYMBOL_GPL(dccp_ctl_make_reset); - -/* send Reset on established socket, to close or abort the connection */ -int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) -{ - struct sk_buff *skb; - /* - * FIXME: what if rebuild_header fails? - * Should we be doing a rebuild_header here? - */ - int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk); - - if (err != 0) - return err; - - skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC); - if (skb == NULL) - return -ENOBUFS; - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET; - DCCP_SKB_CB(skb)->dccpd_reset_code = code; - - return dccp_transmit_skb(sk, skb); -} - -/* - * Do all connect socket setups that can be done AF independent. - */ -int dccp_connect(struct sock *sk) -{ - struct sk_buff *skb; - struct dccp_sock *dp = dccp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - sk->sk_err = 0; - sock_reset_flag(sk, SOCK_DONE); - - dccp_sync_mss(sk, dst_mtu(dst)); - - /* do not connect if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dccp_sk(sk))) - return -EPROTO; - - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ - dp->dccps_gar = dp->dccps_iss; - - skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); - if (unlikely(skb == NULL)) - return -ENOBUFS; - - /* Reserve space for headers. */ - skb_reserve(skb, sk->sk_prot->max_header); - - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; - - dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); - DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); - - /* Timer for repeating the REQUEST until an answer. */ - icsk->icsk_retransmits = 0; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - icsk->icsk_rto, DCCP_RTO_MAX); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_connect); - -void dccp_send_ack(struct sock *sk) -{ - /* If we have been reset, we may not send again. */ - if (sk->sk_state != DCCP_CLOSED) { - struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, - GFP_ATOMIC); - - if (skb == NULL) { - inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, - DCCP_RTO_MAX); - return; - } - - /* Reserve space for headers */ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; - dccp_transmit_skb(sk, skb); - } -} - -EXPORT_SYMBOL_GPL(dccp_send_ack); - -#if 0 -/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */ -void dccp_send_delayed_ack(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - /* - * FIXME: tune this timer. elapsed time fixes the skew, so no problem - * with using 2s, and active senders also piggyback the ACK into a - * DATAACK packet, so this is really for quiescent senders. - */ - unsigned long timeout = jiffies + 2 * HZ; - - /* Use new timeout only if there wasn't a older one earlier. */ - if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { - /* If delack timer was blocked or is about to expire, - * send ACK now. - * - * FIXME: check the "about to expire" part - */ - if (icsk->icsk_ack.blocked) { - dccp_send_ack(sk); - return; - } - - if (!time_before(timeout, icsk_delack_timeout(icsk))) - timeout = icsk_delack_timeout(icsk); - } - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; - sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); -} -#endif - -void dccp_send_sync(struct sock *sk, const u64 ackno, - const enum dccp_pkt_type pkt_type) -{ - /* - * We are not putting this on the write queue, so - * dccp_transmit_skb() will set the ownership to this - * sock. - */ - struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); - - if (skb == NULL) { - /* FIXME: how to make sure the sync is sent? */ - DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type)); - return; - } - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = pkt_type; - DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; - - /* - * Clear the flag in case the Sync was scheduled for out-of-band data, - * such as carrying a long Ack Vector. - */ - dccp_sk(sk)->dccps_sync_scheduled = 0; - - dccp_transmit_skb(sk, skb); -} - -EXPORT_SYMBOL_GPL(dccp_send_sync); - -/* - * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This - * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under - * any circumstances. - */ -void dccp_send_close(struct sock *sk, const int active) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC; - - skb = alloc_skb(sk->sk_prot->max_header, prio); - if (skb == NULL) - return; - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, sk->sk_prot->max_header); - if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait) - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ; - else - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; - - if (active) { - skb = dccp_skb_entail(sk, skb); - /* - * Retransmission timer for active-close: RFC 4340, 8.3 requires - * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ - * state can be left. The initial timeout is 2 RTTs. - * Since RTT measurement is done by the CCIDs, there is no easy - * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4 - * is too low (200ms); we use a high value to avoid unnecessary - * retransmissions when the link RTT is > 0.2 seconds. - * FIXME: Let main module sample RTTs and use that instead. - */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); - } - dccp_transmit_skb(sk, skb); -} diff --git a/net/dccp/proto.c b/net/dccp/proto.c deleted file mode 100644 index fcc5c9d64f466..0000000000000 --- a/net/dccp/proto.c +++ /dev/null @@ -1,1293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/proto.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" - -DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; - -EXPORT_SYMBOL_GPL(dccp_statistics); - -DEFINE_PER_CPU(unsigned int, dccp_orphan_count); -EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); - -struct inet_hashinfo dccp_hashinfo; -EXPORT_SYMBOL_GPL(dccp_hashinfo); - -/* the maximum queue length for tx in packets. 0 is no limit */ -int sysctl_dccp_tx_qlen __read_mostly = 5; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_state_name(const int state) -{ - static const char *const dccp_state_names[] = { - [DCCP_OPEN] = "OPEN", - [DCCP_REQUESTING] = "REQUESTING", - [DCCP_PARTOPEN] = "PARTOPEN", - [DCCP_LISTEN] = "LISTEN", - [DCCP_RESPOND] = "RESPOND", - [DCCP_CLOSING] = "CLOSING", - [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", - [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", - [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", - [DCCP_TIME_WAIT] = "TIME_WAIT", - [DCCP_CLOSED] = "CLOSED", - }; - - if (state >= DCCP_MAX_STATES) - return "INVALID STATE!"; - else - return dccp_state_names[state]; -} -#endif - -void dccp_set_state(struct sock *sk, const int state) -{ - const int oldstate = sk->sk_state; - - dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, - dccp_state_name(oldstate), dccp_state_name(state)); - WARN_ON(state == oldstate); - - switch (state) { - case DCCP_OPEN: - if (oldstate != DCCP_OPEN) - DCCP_INC_STATS(DCCP_MIB_CURRESTAB); - /* Client retransmits all Confirm options until entering OPEN */ - if (oldstate == DCCP_PARTOPEN) - dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); - break; - - case DCCP_CLOSED: - if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || - oldstate == DCCP_CLOSING) - DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); - - sk->sk_prot->unhash(sk); - if (inet_csk(sk)->icsk_bind_hash != NULL && - !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) - inet_put_port(sk); - fallthrough; - default: - if (oldstate == DCCP_OPEN) - DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); - } - - /* Change state AFTER socket is unhashed to avoid closed - * socket sitting in hash tables. - */ - inet_sk_set_state(sk, state); -} - -EXPORT_SYMBOL_GPL(dccp_set_state); - -static void dccp_finish_passive_close(struct sock *sk) -{ - switch (sk->sk_state) { - case DCCP_PASSIVE_CLOSE: - /* Node (client or server) has received Close packet. */ - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_set_state(sk, DCCP_CLOSED); - break; - case DCCP_PASSIVE_CLOSEREQ: - /* - * Client received CloseReq. We set the `active' flag so that - * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. - */ - dccp_send_close(sk, 1); - dccp_set_state(sk, DCCP_CLOSING); - } -} - -void dccp_done(struct sock *sk) -{ - dccp_set_state(sk, DCCP_CLOSED); - dccp_clear_xmit_timers(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - else - inet_csk_destroy_sock(sk); -} - -EXPORT_SYMBOL_GPL(dccp_done); - -const char *dccp_packet_name(const int type) -{ - static const char *const dccp_packet_names[] = { - [DCCP_PKT_REQUEST] = "REQUEST", - [DCCP_PKT_RESPONSE] = "RESPONSE", - [DCCP_PKT_DATA] = "DATA", - [DCCP_PKT_ACK] = "ACK", - [DCCP_PKT_DATAACK] = "DATAACK", - [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", - [DCCP_PKT_CLOSE] = "CLOSE", - [DCCP_PKT_RESET] = "RESET", - [DCCP_PKT_SYNC] = "SYNC", - [DCCP_PKT_SYNCACK] = "SYNCACK", - }; - - if (type >= DCCP_NR_PKT_TYPES) - return "INVALID"; - else - return dccp_packet_names[type]; -} - -EXPORT_SYMBOL_GPL(dccp_packet_name); - -void dccp_destruct_common(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = NULL; -} -EXPORT_SYMBOL_GPL(dccp_destruct_common); - -static void dccp_sk_destruct(struct sock *sk) -{ - dccp_destruct_common(sk); - inet_sock_destruct(sk); -} - -int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, " - "please contact the netdev mailing list\n"); - - icsk->icsk_rto = DCCP_TIMEOUT_INIT; - icsk->icsk_syn_retries = sysctl_dccp_request_retries; - sk->sk_state = DCCP_CLOSED; - sk->sk_write_space = dccp_write_space; - sk->sk_destruct = dccp_sk_destruct; - icsk->icsk_sync_mss = dccp_sync_mss; - dp->dccps_mss_cache = 536; - dp->dccps_rate_last = jiffies; - dp->dccps_role = DCCP_ROLE_UNDEFINED; - dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; - - dccp_init_xmit_timers(sk); - - INIT_LIST_HEAD(&dp->dccps_featneg); - /* control socket doesn't need feat nego */ - if (likely(ctl_sock_initialized)) - return dccp_feat_init(sk); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_init_sock); - -void dccp_destroy_sock(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - __skb_queue_purge(&sk->sk_write_queue); - if (sk->sk_send_head != NULL) { - kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - } - - /* Clean up a referenced DCCP bind bucket. */ - if (inet_csk(sk)->icsk_bind_hash != NULL) - inet_put_port(sk); - - kfree(dp->dccps_service_list); - dp->dccps_service_list = NULL; - - if (dp->dccps_hc_rx_ackvec != NULL) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = NULL; - - /* clean up feature negotiation state */ - dccp_feat_list_purge(&dp->dccps_featneg); -} - -EXPORT_SYMBOL_GPL(dccp_destroy_sock); - -static inline int dccp_need_reset(int state) -{ - return state != DCCP_CLOSED && state != DCCP_LISTEN && - state != DCCP_REQUESTING; -} - -int dccp_disconnect(struct sock *sk, int flags) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - const int old_state = sk->sk_state; - - if (old_state != DCCP_CLOSED) - dccp_set_state(sk, DCCP_CLOSED); - - /* - * This corresponds to the ABORT function of RFC793, sec. 3.8 - * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". - */ - if (old_state == DCCP_LISTEN) { - inet_csk_listen_stop(sk); - } else if (dccp_need_reset(old_state)) { - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - sk->sk_err = ECONNRESET; - } else if (old_state == DCCP_REQUESTING) - sk->sk_err = ECONNRESET; - - dccp_clear_xmit_timers(sk); - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = NULL; - - __skb_queue_purge(&sk->sk_receive_queue); - __skb_queue_purge(&sk->sk_write_queue); - if (sk->sk_send_head != NULL) { - __kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - } - - inet->inet_dport = 0; - - inet_bhash2_reset_saddr(sk); - - sk->sk_shutdown = 0; - sock_reset_flag(sk, SOCK_DONE); - - icsk->icsk_backoff = 0; - inet_csk_delack_init(sk); - __sk_dst_reset(sk); - - WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); - - sk_error_report(sk); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_disconnect); - -/* - * Wait for a DCCP event. - * - * Note that we don't need to lock the socket, as the upper poll layers - * take care of normal races (between the test and the event) and we don't - * go look at any of the socket buffers directly. - */ -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - __poll_t mask; - u8 shutdown; - int state; - - sock_poll_wait(file, sock, wait); - - state = inet_sk_state_load(sk); - if (state == DCCP_LISTEN) - return inet_csk_listen_poll(sk); - - /* Socket is not locked. We are protected from async events - by poll logic and correct handling of state changes - made by another threads is impossible in any case. - */ - - mask = 0; - if (READ_ONCE(sk->sk_err)) - mask = EPOLLERR; - shutdown = READ_ONCE(sk->sk_shutdown); - - if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) - mask |= EPOLLHUP; - if (shutdown & RCV_SHUTDOWN) - mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - - /* Connected? */ - if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { - if (atomic_read(&sk->sk_rmem_alloc) > 0) - mask |= EPOLLIN | EPOLLRDNORM; - - if (!(shutdown & SEND_SHUTDOWN)) { - if (sk_stream_is_writeable(sk)) { - mask |= EPOLLOUT | EPOLLWRNORM; - } else { /* send SIGIO later */ - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - - /* Race breaker. If space is freed after - * wspace test but before the flags are set, - * IO signal will be lost. - */ - if (sk_stream_is_writeable(sk)) - mask |= EPOLLOUT | EPOLLWRNORM; - } - } - } - return mask; -} -EXPORT_SYMBOL_GPL(dccp_poll); - -int dccp_ioctl(struct sock *sk, int cmd, int *karg) -{ - int rc = -ENOTCONN; - - lock_sock(sk); - - if (sk->sk_state == DCCP_LISTEN) - goto out; - - switch (cmd) { - case SIOCOUTQ: { - *karg = sk_wmem_alloc_get(sk); - /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and - * always 0, comparably to UDP. - */ - - rc = 0; - } - break; - case SIOCINQ: { - struct sk_buff *skb; - *karg = 0; - - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) { - /* - * We will only return the amount of this packet since - * that is all that will be read. - */ - *karg = skb->len; - } - rc = 0; - } - break; - default: - rc = -ENOIOCTLCMD; - break; - } -out: - release_sock(sk); - return rc; -} - -EXPORT_SYMBOL_GPL(dccp_ioctl); - -static int dccp_setsockopt_service(struct sock *sk, const __be32 service, - sockptr_t optval, unsigned int optlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_service_list *sl = NULL; - - if (service == DCCP_SERVICE_INVALID_VALUE || - optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) - return -EINVAL; - - if (optlen > sizeof(service)) { - sl = kmalloc(optlen, GFP_KERNEL); - if (sl == NULL) - return -ENOMEM; - - sl->dccpsl_nr = optlen / sizeof(u32) - 1; - if (copy_from_sockptr_offset(sl->dccpsl_list, optval, - sizeof(service), optlen - sizeof(service)) || - dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { - kfree(sl); - return -EFAULT; - } - } - - lock_sock(sk); - dp->dccps_service = service; - - kfree(dp->dccps_service_list); - - dp->dccps_service_list = sl; - release_sock(sk); - return 0; -} - -static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) -{ - u8 *list, len; - int i, rc; - - if (cscov < 0 || cscov > 15) - return -EINVAL; - /* - * Populate a list of permissible values, in the range cscov...15. This - * is necessary since feature negotiation of single values only works if - * both sides incidentally choose the same value. Since the list starts - * lowest-value first, negotiation will pick the smallest shared value. - */ - if (cscov == 0) - return 0; - len = 16 - cscov; - - list = kmalloc(len, GFP_KERNEL); - if (list == NULL) - return -ENOBUFS; - - for (i = 0; i < len; i++) - list[i] = cscov++; - - rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); - - if (rc == 0) { - if (rx) - dccp_sk(sk)->dccps_pcrlen = cscov; - else - dccp_sk(sk)->dccps_pcslen = cscov; - } - kfree(list); - return rc; -} - -static int dccp_setsockopt_ccid(struct sock *sk, int type, - sockptr_t optval, unsigned int optlen) -{ - u8 *val; - int rc = 0; - - if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) - return -EINVAL; - - val = memdup_sockptr(optval, optlen); - if (IS_ERR(val)) - return PTR_ERR(val); - - lock_sock(sk); - if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); - - if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); - release_sock(sk); - - kfree(val); - return rc; -} - -static int do_dccp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - int val, err = 0; - - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CHANGE_L: - case DCCP_SOCKOPT_CHANGE_R: - DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CCID: - case DCCP_SOCKOPT_RX_CCID: - case DCCP_SOCKOPT_TX_CCID: - return dccp_setsockopt_ccid(sk, optname, optval, optlen); - } - - if (optlen < (int)sizeof(int)) - return -EINVAL; - - if (copy_from_sockptr(&val, optval, sizeof(int))) - return -EFAULT; - - if (optname == DCCP_SOCKOPT_SERVICE) - return dccp_setsockopt_service(sk, val, optval, optlen); - - lock_sock(sk); - switch (optname) { - case DCCP_SOCKOPT_SERVER_TIMEWAIT: - if (dp->dccps_role != DCCP_ROLE_SERVER) - err = -EOPNOTSUPP; - else - dp->dccps_server_timewait = (val != 0); - break; - case DCCP_SOCKOPT_SEND_CSCOV: - err = dccp_setsockopt_cscov(sk, val, false); - break; - case DCCP_SOCKOPT_RECV_CSCOV: - err = dccp_setsockopt_cscov(sk, val, true); - break; - case DCCP_SOCKOPT_QPOLICY_ID: - if (sk->sk_state != DCCP_CLOSED) - err = -EISCONN; - else if (val < 0 || val >= DCCPQ_POLICY_MAX) - err = -EINVAL; - else - dp->dccps_qpolicy = val; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - if (val < 0) - err = -EINVAL; - else - dp->dccps_tx_qlen = val; - break; - default: - err = -ENOPROTOOPT; - break; - } - release_sock(sk); - - return err; -} - -int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen) -{ - if (level != SOL_DCCP) - return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, - optname, optval, - optlen); - return do_dccp_setsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(dccp_setsockopt); - -static int dccp_getsockopt_service(struct sock *sk, int len, - __be32 __user *optval, - int __user *optlen) -{ - const struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_service_list *sl; - int err = -ENOENT, slen = 0, total_len = sizeof(u32); - - lock_sock(sk); - if ((sl = dp->dccps_service_list) != NULL) { - slen = sl->dccpsl_nr * sizeof(u32); - total_len += slen; - } - - err = -EINVAL; - if (total_len > len) - goto out; - - err = 0; - if (put_user(total_len, optlen) || - put_user(dp->dccps_service, optval) || - (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) - err = -EFAULT; -out: - release_sock(sk); - return err; -} - -static int do_dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct dccp_sock *dp; - int val, len; - - if (get_user(len, optlen)) - return -EFAULT; - - if (len < (int)sizeof(int)) - return -EINVAL; - - dp = dccp_sk(sk); - - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_SERVICE: - return dccp_getsockopt_service(sk, len, - (__be32 __user *)optval, optlen); - case DCCP_SOCKOPT_GET_CUR_MPS: - val = READ_ONCE(dp->dccps_mss_cache); - break; - case DCCP_SOCKOPT_AVAILABLE_CCIDS: - return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); - case DCCP_SOCKOPT_TX_CCID: - val = ccid_get_current_tx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_RX_CCID: - val = ccid_get_current_rx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_SERVER_TIMEWAIT: - val = dp->dccps_server_timewait; - break; - case DCCP_SOCKOPT_SEND_CSCOV: - val = dp->dccps_pcslen; - break; - case DCCP_SOCKOPT_RECV_CSCOV: - val = dp->dccps_pcrlen; - break; - case DCCP_SOCKOPT_QPOLICY_ID: - val = dp->dccps_qpolicy; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - val = dp->dccps_tx_qlen; - break; - case 128 ... 191: - return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, - len, (u32 __user *)optval, optlen); - case 192 ... 255: - return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, - len, (u32 __user *)optval, optlen); - default: - return -ENOPROTOOPT; - } - - len = sizeof(val); - if (put_user(len, optlen) || copy_to_user(optval, &val, len)) - return -EFAULT; - - return 0; -} - -int dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_DCCP) - return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, - optname, optval, - optlen); - return do_dccp_getsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(dccp_getsockopt); - -static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) -{ - struct cmsghdr *cmsg; - - /* - * Assign an (opaque) qpolicy priority value to skb->priority. - * - * We are overloading this skb field for use with the qpolicy subystem. - * The skb->priority is normally used for the SO_PRIORITY option, which - * is initialised from sk_priority. Since the assignment of sk_priority - * to skb->priority happens later (on layer 3), we overload this field - * for use with queueing priorities as long as the skb is on layer 4. - * The default priority value (if nothing is set) is 0. - */ - skb->priority = 0; - - for_each_cmsghdr(cmsg, msg) { - if (!CMSG_OK(msg, cmsg)) - return -EINVAL; - - if (cmsg->cmsg_level != SOL_DCCP) - continue; - - if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && - !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) - return -EINVAL; - - switch (cmsg->cmsg_type) { - case DCCP_SCM_PRIORITY: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) - return -EINVAL; - skb->priority = *(__u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } - } - return 0; -} - -int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) -{ - const struct dccp_sock *dp = dccp_sk(sk); - const int flags = msg->msg_flags; - const int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; - int rc, size; - long timeo; - - trace_dccp_probe(sk, len); - - if (len > READ_ONCE(dp->dccps_mss_cache)) - return -EMSGSIZE; - - lock_sock(sk); - - timeo = sock_sndtimeo(sk, noblock); - - /* - * We have to use sk_stream_wait_connect here to set sk_write_pending, - * so that the trick in dccp_rcv_request_sent_state_process. - */ - /* Wait for a connection to finish. */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) - if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_release; - - size = sk->sk_prot->max_header + len; - release_sock(sk); - skb = sock_alloc_send_skb(sk, size, noblock, &rc); - lock_sock(sk); - if (skb == NULL) - goto out_release; - - if (dccp_qpolicy_full(sk)) { - rc = -EAGAIN; - goto out_discard; - } - - if (sk->sk_state == DCCP_CLOSED) { - rc = -ENOTCONN; - goto out_discard; - } - - /* We need to check dccps_mss_cache after socket is locked. */ - if (len > dp->dccps_mss_cache) { - rc = -EMSGSIZE; - goto out_discard; - } - - skb_reserve(skb, sk->sk_prot->max_header); - rc = memcpy_from_msg(skb_put(skb, len), msg, len); - if (rc != 0) - goto out_discard; - - rc = dccp_msghdr_parse(msg, skb); - if (rc != 0) - goto out_discard; - - dccp_qpolicy_push(sk, skb); - /* - * The xmit_timer is set if the TX CCID is rate-based and will expire - * when congestion control permits to release further packets into the - * network. Window-based CCIDs do not use this timer. - */ - if (!timer_pending(&dp->dccps_xmit_timer)) - dccp_write_xmit(sk); -out_release: - release_sock(sk); - return rc ? : len; -out_discard: - kfree_skb(skb); - goto out_release; -} - -EXPORT_SYMBOL_GPL(dccp_sendmsg); - -int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len) -{ - const struct dccp_hdr *dh; - long timeo; - - lock_sock(sk); - - if (sk->sk_state == DCCP_LISTEN) { - len = -ENOTCONN; - goto out; - } - - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - do { - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - - if (skb == NULL) - goto verify_sock_status; - - dh = dccp_hdr(skb); - - switch (dh->dccph_type) { - case DCCP_PKT_DATA: - case DCCP_PKT_DATAACK: - goto found_ok_skb; - - case DCCP_PKT_CLOSE: - case DCCP_PKT_CLOSEREQ: - if (!(flags & MSG_PEEK)) - dccp_finish_passive_close(sk); - fallthrough; - case DCCP_PKT_RESET: - dccp_pr_debug("found fin (%s) ok!\n", - dccp_packet_name(dh->dccph_type)); - len = 0; - goto found_fin_ok; - default: - dccp_pr_debug("packet_type=%s\n", - dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb); - } -verify_sock_status: - if (sock_flag(sk, SOCK_DONE)) { - len = 0; - break; - } - - if (sk->sk_err) { - len = sock_error(sk); - break; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN) { - len = 0; - break; - } - - if (sk->sk_state == DCCP_CLOSED) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - len = -ENOTCONN; - break; - } - len = 0; - break; - } - - if (!timeo) { - len = -EAGAIN; - break; - } - - if (signal_pending(current)) { - len = sock_intr_errno(timeo); - break; - } - - sk_wait_data(sk, &timeo, NULL); - continue; - found_ok_skb: - if (len > skb->len) - len = skb->len; - else if (len < skb->len) - msg->msg_flags |= MSG_TRUNC; - - if (skb_copy_datagram_msg(skb, 0, msg, len)) { - /* Exception. Bailout! */ - len = -EFAULT; - break; - } - if (flags & MSG_TRUNC) - len = skb->len; - found_fin_ok: - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); - break; - } while (1); -out: - release_sock(sk); - return len; -} - -EXPORT_SYMBOL_GPL(dccp_recvmsg); - -int inet_dccp_listen(struct socket *sock, int backlog) -{ - struct sock *sk = sock->sk; - unsigned char old_state; - int err; - - lock_sock(sk); - - err = -EINVAL; - if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) - goto out; - - old_state = sk->sk_state; - if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) - goto out; - - WRITE_ONCE(sk->sk_max_ack_backlog, backlog); - /* Really, if the socket is already in listen state - * we can only allow the backlog to be adjusted. - */ - if (old_state != DCCP_LISTEN) { - struct dccp_sock *dp = dccp_sk(sk); - - dp->dccps_role = DCCP_ROLE_LISTEN; - - /* do not start to listen if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dp)) { - err = -EPROTO; - goto out; - } - - err = inet_csk_listen_start(sk); - if (err) - goto out; - } - err = 0; - -out: - release_sock(sk); - return err; -} - -EXPORT_SYMBOL_GPL(inet_dccp_listen); - -static void dccp_terminate_connection(struct sock *sk) -{ - u8 next_state = DCCP_CLOSED; - - switch (sk->sk_state) { - case DCCP_PASSIVE_CLOSE: - case DCCP_PASSIVE_CLOSEREQ: - dccp_finish_passive_close(sk); - break; - case DCCP_PARTOPEN: - dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - fallthrough; - case DCCP_OPEN: - dccp_send_close(sk, 1); - - if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && - !dccp_sk(sk)->dccps_server_timewait) - next_state = DCCP_ACTIVE_CLOSEREQ; - else - next_state = DCCP_CLOSING; - fallthrough; - default: - dccp_set_state(sk, next_state); - } -} - -void dccp_close(struct sock *sk, long timeout) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - u32 data_was_unread = 0; - int state; - - lock_sock(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if (sk->sk_state == DCCP_LISTEN) { - dccp_set_state(sk, DCCP_CLOSED); - - /* Special case. */ - inet_csk_listen_stop(sk); - - goto adjudge_to_death; - } - - sk_stop_timer(sk, &dp->dccps_xmit_timer); - - /* - * We need to flush the recv. buffs. We do this only on the - * descriptor close, not protocol-sourced closes, because the - *reader process may not have drained the data yet! - */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - data_was_unread += skb->len; - __kfree_skb(skb); - } - - /* If socket has been already reset kill it. */ - if (sk->sk_state == DCCP_CLOSED) - goto adjudge_to_death; - - if (data_was_unread) { - /* Unread data was tossed, send an appropriate Reset Code */ - DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - dccp_set_state(sk, DCCP_CLOSED); - } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { - /* Check zero linger _after_ checking for unread data. */ - sk->sk_prot->disconnect(sk, 0); - } else if (sk->sk_state != DCCP_CLOSED) { - /* - * Normal connection termination. May need to wait if there are - * still packets in the TX queue that are delayed by the CCID. - */ - dccp_flush_write_queue(sk, &timeout); - dccp_terminate_connection(sk); - } - - /* - * Flush write queue. This may be necessary in several cases: - * - we have been closed by the peer but still have application data; - * - abortive termination (unread data or zero linger time), - * - normal termination but queue could not be flushed within time limit - */ - __skb_queue_purge(&sk->sk_write_queue); - - sk_stream_wait_close(sk, timeout); - -adjudge_to_death: - state = sk->sk_state; - sock_hold(sk); - sock_orphan(sk); - - /* - * It is the last release_sock in its life. It will remove backlog. - */ - release_sock(sk); - /* - * Now socket is owned by kernel and we acquire BH lock - * to finish close. No need to check for user refs. - */ - local_bh_disable(); - bh_lock_sock(sk); - WARN_ON(sock_owned_by_user(sk)); - - this_cpu_inc(dccp_orphan_count); - - /* Have we already been destroyed by a softirq or backlog? */ - if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) - goto out; - - if (sk->sk_state == DCCP_CLOSED) - inet_csk_destroy_sock(sk); - - /* Otherwise, socket is reprieved until protocol close. */ - -out: - bh_unlock_sock(sk); - local_bh_enable(); - sock_put(sk); -} - -EXPORT_SYMBOL_GPL(dccp_close); - -void dccp_shutdown(struct sock *sk, int how) -{ - dccp_pr_debug("called shutdown(%x)\n", how); -} - -EXPORT_SYMBOL_GPL(dccp_shutdown); - -static inline int __init dccp_mib_init(void) -{ - dccp_statistics = alloc_percpu(struct dccp_mib); - if (!dccp_statistics) - return -ENOMEM; - return 0; -} - -static inline void dccp_mib_exit(void) -{ - free_percpu(dccp_statistics); -} - -static int thash_entries; -module_param(thash_entries, int, 0444); -MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); - -#ifdef CONFIG_IP_DCCP_DEBUG -bool dccp_debug; -module_param(dccp_debug, bool, 0644); -MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); - -EXPORT_SYMBOL_GPL(dccp_debug); -#endif - -static int __init dccp_init(void) -{ - unsigned long goal; - unsigned long nr_pages = totalram_pages(); - int ehash_order, bhash_order, i; - int rc; - - BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > - sizeof_field(struct sk_buff, cb)); - rc = inet_hashinfo2_init_mod(&dccp_hashinfo); - if (rc) - goto out_fail; - rc = -ENOBUFS; - dccp_hashinfo.bind_bucket_cachep = - kmem_cache_create("dccp_bind_bucket", - sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind_bucket_cachep) - goto out_free_hashinfo2; - dccp_hashinfo.bind2_bucket_cachep = - kmem_cache_create("dccp_bind2_bucket", - sizeof(struct inet_bind2_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind2_bucket_cachep) - goto out_free_bind_bucket_cachep; - - /* - * Size and allocate the main established and bind bucket - * hash tables. - * - * The methodology is similar to that of the buffer cache. - */ - if (nr_pages >= (128 * 1024)) - goal = nr_pages >> (21 - PAGE_SHIFT); - else - goal = nr_pages >> (23 - PAGE_SHIFT); - - if (thash_entries) - goal = (thash_entries * - sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; - for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) - ; - do { - unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / - sizeof(struct inet_ehash_bucket); - - while (hash_size & (hash_size - 1)) - hash_size--; - dccp_hashinfo.ehash_mask = hash_size - 1; - dccp_hashinfo.ehash = (struct inet_ehash_bucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); - } while (!dccp_hashinfo.ehash && --ehash_order > 0); - - if (!dccp_hashinfo.ehash) { - DCCP_CRIT("Failed to allocate DCCP established hash table"); - goto out_free_bind2_bucket_cachep; - } - - for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) - INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); - - if (inet_ehash_locks_alloc(&dccp_hashinfo)) - goto out_free_dccp_ehash; - - bhash_order = ehash_order; - - do { - dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / - sizeof(struct inet_bind_hashbucket); - if ((dccp_hashinfo.bhash_size > (64 * 1024)) && - bhash_order > 0) - continue; - dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); - } while (!dccp_hashinfo.bhash && --bhash_order >= 0); - - if (!dccp_hashinfo.bhash) { - DCCP_CRIT("Failed to allocate DCCP bind hash table"); - goto out_free_dccp_locks; - } - - dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); - - if (!dccp_hashinfo.bhash2) { - DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); - goto out_free_dccp_bhash; - } - - for (i = 0; i < dccp_hashinfo.bhash_size; i++) { - spin_lock_init(&dccp_hashinfo.bhash[i].lock); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); - spin_lock_init(&dccp_hashinfo.bhash2[i].lock); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); - } - - dccp_hashinfo.pernet = false; - - rc = dccp_mib_init(); - if (rc) - goto out_free_dccp_bhash2; - - rc = dccp_ackvec_init(); - if (rc) - goto out_free_dccp_mib; - - rc = dccp_sysctl_init(); - if (rc) - goto out_ackvec_exit; - - rc = ccid_initialize_builtins(); - if (rc) - goto out_sysctl_exit; - - dccp_timestamping_init(); - - return 0; - -out_sysctl_exit: - dccp_sysctl_exit(); -out_ackvec_exit: - dccp_ackvec_exit(); -out_free_dccp_mib: - dccp_mib_exit(); -out_free_dccp_bhash2: - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); -out_free_dccp_bhash: - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); -out_free_dccp_locks: - inet_ehash_locks_free(&dccp_hashinfo); -out_free_dccp_ehash: - free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); -out_free_bind2_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); -out_free_bind_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); -out_free_hashinfo2: - inet_hashinfo2_free_mod(&dccp_hashinfo); -out_fail: - dccp_hashinfo.bhash = NULL; - dccp_hashinfo.bhash2 = NULL; - dccp_hashinfo.ehash = NULL; - dccp_hashinfo.bind_bucket_cachep = NULL; - dccp_hashinfo.bind2_bucket_cachep = NULL; - return rc; -} - -static void __exit dccp_fini(void) -{ - int bhash_order = get_order(dccp_hashinfo.bhash_size * - sizeof(struct inet_bind_hashbucket)); - - ccid_cleanup_builtins(); - dccp_mib_exit(); - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); - free_pages((unsigned long)dccp_hashinfo.ehash, - get_order((dccp_hashinfo.ehash_mask + 1) * - sizeof(struct inet_ehash_bucket))); - inet_ehash_locks_free(&dccp_hashinfo); - kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); - dccp_ackvec_exit(); - dccp_sysctl_exit(); - inet_hashinfo2_free_mod(&dccp_hashinfo); -} - -module_init(dccp_init); -module_exit(dccp_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo "); -MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c deleted file mode 100644 index 5ba204ec0aca4..0000000000000 --- a/net/dccp/qpolicy.c +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/qpolicy.c - * - * Policy-based packet dequeueing interface for DCCP. - * - * Copyright (c) 2008 Tomasz Grobelny - */ -#include "dccp.h" - -/* - * Simple Dequeueing Policy: - * If tx_qlen is different from 0, enqueue up to tx_qlen elements. - */ -static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) -{ - skb_queue_tail(&sk->sk_write_queue, skb); -} - -static bool qpolicy_simple_full(struct sock *sk) -{ - return dccp_sk(sk)->dccps_tx_qlen && - sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; -} - -static struct sk_buff *qpolicy_simple_top(struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - -/* - * Priority-based Dequeueing Policy: - * If tx_qlen is different from 0 and the queue has reached its upper bound - * of tx_qlen elements, replace older packets lowest-priority-first. - */ -static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) -{ - struct sk_buff *skb, *best = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (best == NULL || skb->priority > best->priority) - best = skb; - return best; -} - -static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) -{ - struct sk_buff *skb, *worst = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (worst == NULL || skb->priority < worst->priority) - worst = skb; - return worst; -} - -static bool qpolicy_prio_full(struct sock *sk) -{ - if (qpolicy_simple_full(sk)) - dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); - return false; -} - -/** - * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface - * @push: add a new @skb to the write queue - * @full: indicates that no more packets will be admitted - * @top: peeks at whatever the queueing policy defines as its `top' - * @params: parameter passed to policy operation - */ -struct dccp_qpolicy_operations { - void (*push) (struct sock *sk, struct sk_buff *skb); - bool (*full) (struct sock *sk); - struct sk_buff* (*top) (struct sock *sk); - __be32 params; -}; - -static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = { - [DCCPQ_POLICY_SIMPLE] = { - .push = qpolicy_simple_push, - .full = qpolicy_simple_full, - .top = qpolicy_simple_top, - .params = 0, - }, - [DCCPQ_POLICY_PRIO] = { - .push = qpolicy_simple_push, - .full = qpolicy_prio_full, - .top = qpolicy_prio_best_skb, - .params = DCCP_SCM_PRIORITY, - }, -}; - -/* - * Externally visible interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) -{ - qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); -} - -bool dccp_qpolicy_full(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); -} - -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) -{ - if (skb != NULL) { - skb_unlink(skb, &sk->sk_write_queue); - kfree_skb(skb); - } -} - -struct sk_buff *dccp_qpolicy_top(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); -} - -struct sk_buff *dccp_qpolicy_pop(struct sock *sk) -{ - struct sk_buff *skb = dccp_qpolicy_top(sk); - - if (skb != NULL) { - /* Clear any skb fields that we used internally */ - skb->priority = 0; - skb_unlink(skb, &sk->sk_write_queue); - } - return skb; -} - -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) -{ - /* check if exactly one bit is set */ - if (!param || (param & (param - 1))) - return false; - return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; -} diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c deleted file mode 100644 index b15845fd63001..0000000000000 --- a/net/dccp/sysctl.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/sysctl.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include "dccp.h" -#include "feat.h" - -/* Boundary values */ -static int u8_max = 0xFF; -static unsigned long seqw_min = DCCPF_SEQ_WMIN, - seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */ - -static struct ctl_table dccp_default_table[] = { - { - .procname = "seq_window", - .data = &sysctl_dccp_sequence_window, - .maxlen = sizeof(sysctl_dccp_sequence_window), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ - .extra2 = &seqw_max, - }, - { - .procname = "rx_ccid", - .data = &sysctl_dccp_rx_ccid, - .maxlen = sizeof(sysctl_dccp_rx_ccid), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, /* RFC 4340, 10. */ - }, - { - .procname = "tx_ccid", - .data = &sysctl_dccp_tx_ccid, - .maxlen = sizeof(sysctl_dccp_tx_ccid), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, /* RFC 4340, 10. */ - }, - { - .procname = "request_retries", - .data = &sysctl_dccp_request_retries, - .maxlen = sizeof(sysctl_dccp_request_retries), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = &u8_max, - }, - { - .procname = "retries1", - .data = &sysctl_dccp_retries1, - .maxlen = sizeof(sysctl_dccp_retries1), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, - }, - { - .procname = "retries2", - .data = &sysctl_dccp_retries2, - .maxlen = sizeof(sysctl_dccp_retries2), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, - }, - { - .procname = "tx_qlen", - .data = &sysctl_dccp_tx_qlen, - .maxlen = sizeof(sysctl_dccp_tx_qlen), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "sync_ratelimit", - .data = &sysctl_dccp_sync_ratelimit, - .maxlen = sizeof(sysctl_dccp_sync_ratelimit), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, -}; - -static struct ctl_table_header *dccp_table_header; - -int __init dccp_sysctl_init(void) -{ - dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default", - dccp_default_table); - - return dccp_table_header != NULL ? 0 : -ENOMEM; -} - -void dccp_sysctl_exit(void) -{ - if (dccp_table_header != NULL) { - unregister_net_sysctl_table(dccp_table_header); - dccp_table_header = NULL; - } -} diff --git a/net/dccp/timer.c b/net/dccp/timer.c deleted file mode 100644 index 232ac4ae0a73f..0000000000000 --- a/net/dccp/timer.c +++ /dev/null @@ -1,272 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/timer.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include - -#include "dccp.h" - -/* sysctl variables governing numbers of retransmission attempts */ -int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES; -int sysctl_dccp_retries1 __read_mostly = TCP_RETR1; -int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; - -static void dccp_write_err(struct sock *sk) -{ - sk->sk_err = READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT; - sk_error_report(sk); - - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - dccp_done(sk); - __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT); -} - -/* A write timeout has occurred. Process the after effects. */ -static int dccp_write_timeout(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - int retry_until; - - if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { - if (icsk->icsk_retransmits != 0) - dst_negative_advice(sk); - retry_until = icsk->icsk_syn_retries ? - : sysctl_dccp_request_retries; - } else { - if (icsk->icsk_retransmits >= sysctl_dccp_retries1) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu - black hole detection. :-( - - It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any - case. Let me to cite the same draft, which requires for - us to implement this: - - "The one security concern raised by this memo is that ICMP black holes - are often caused by over-zealous security administrators who block - all ICMP messages. It is vitally important that those who design and - deploy security systems understand the impact of strict filtering on - upper-layer protocols. The safest web site in the world is worthless - if most TCP implementations cannot transfer data from it. It would - be far nicer to have all of the black holes fixed rather than fixing - all of the TCP implementations." - - Golden words :-). - */ - - dst_negative_advice(sk); - } - - retry_until = sysctl_dccp_retries2; - /* - * FIXME: see tcp_write_timout and tcp_out_of_resources - */ - } - - if (icsk->icsk_retransmits >= retry_until) { - /* Has it gone just too far? */ - dccp_write_err(sk); - return 1; - } - return 0; -} - -/* - * The DCCP retransmit timer. - */ -static void dccp_retransmit_timer(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - /* - * More than 4MSL (8 minutes) has passed, a RESET(aborted) was - * sent, no need to retransmit, this sock is dead. - */ - if (dccp_write_timeout(sk)) - return; - - /* - * We want to know the number of packets retransmitted, not the - * total number of retransmissions of clones of original packets. - */ - if (icsk->icsk_retransmits == 0) - __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS); - - if (dccp_retransmit_skb(sk) != 0) { - /* - * Retransmission failed because of local congestion, - * do not backoff. - */ - if (--icsk->icsk_retransmits == 0) - icsk->icsk_retransmits = 1; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - min(icsk->icsk_rto, - TCP_RESOURCE_PROBE_INTERVAL), - DCCP_RTO_MAX); - return; - } - - icsk->icsk_backoff++; - - icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, - DCCP_RTO_MAX); - if (icsk->icsk_retransmits > sysctl_dccp_retries1) - __sk_dst_reset(sk); -} - -static void dccp_write_timer(struct timer_list *t) -{ - struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_retransmit_timer); - struct sock *sk = &icsk->icsk_inet.sk; - int event = 0; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later */ - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - jiffies + (HZ / 20)); - goto out; - } - - if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) - goto out; - - if (time_after(icsk_timeout(icsk), jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - icsk_timeout(icsk)); - goto out; - } - - event = icsk->icsk_pending; - icsk->icsk_pending = 0; - - switch (event) { - case ICSK_TIME_RETRANS: - dccp_retransmit_timer(sk); - break; - } -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -static void dccp_keepalive_timer(struct timer_list *t) -{ - struct sock *sk = from_timer(sk, t, sk_timer); - - pr_err("dccp should not use a keepalive timer !\n"); - sock_put(sk); -} - -/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ -static void dccp_delack_timer(struct timer_list *t) -{ - struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_delack_timer); - struct sock *sk = &icsk->icsk_inet.sk; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &icsk->icsk_delack_timer, - jiffies + TCP_DELACK_MIN); - goto out; - } - - if (sk->sk_state == DCCP_CLOSED || - !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) - goto out; - if (time_after(icsk_delack_timeout(icsk), jiffies)) { - sk_reset_timer(sk, &icsk->icsk_delack_timer, - icsk_delack_timeout(icsk)); - goto out; - } - - icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; - - if (inet_csk_ack_scheduled(sk)) { - if (!inet_csk_in_pingpong_mode(sk)) { - /* Delayed ACK missed: inflate ATO. */ - icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, - icsk->icsk_rto); - } else { - /* Delayed ACK missed: leave pingpong mode and - * deflate ATO. - */ - inet_csk_exit_pingpong_mode(sk); - icsk->icsk_ack.ato = TCP_ATO_MIN; - } - dccp_send_ack(sk); - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); - } -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/** - * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface - * @t: pointer to the tasklet associated with this handler - * - * See the comments above %ccid_dequeueing_decision for supported modes. - */ -static void dccp_write_xmitlet(struct tasklet_struct *t) -{ - struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet); - struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); - else - dccp_write_xmit(sk); - bh_unlock_sock(sk); - sock_put(sk); -} - -static void dccp_write_xmit_timer(struct timer_list *t) -{ - struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); - - dccp_write_xmitlet(&dp->dccps_xmitlet); -} - -void dccp_init_xmit_timers(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet); - timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); - inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, - &dccp_keepalive_timer); -} - -static ktime_t dccp_timestamp_seed; -/** - * dccp_timestamp - 10s of microseconds time source - * Returns the number of 10s of microseconds since loading DCCP. This is native - * DCCP time difference format (RFC 4340, sec. 13). - * Please note: This will wrap around about circa every 11.9 hours. - */ -u32 dccp_timestamp(void) -{ - u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); - - do_div(delta, 10); - return delta; -} -EXPORT_SYMBOL_GPL(dccp_timestamp); - -void __init dccp_timestamping_init(void) -{ - dccp_timestamp_seed = ktime_get_real(); -} diff --git a/net/dccp/trace.h b/net/dccp/trace.h deleted file mode 100644 index 5a43b3508c7f1..0000000000000 --- a/net/dccp/trace.h +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM dccp - -#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_DCCP_H - -#include -#include "dccp.h" -#include "ccids/ccid3.h" -#include -#include - -TRACE_EVENT(dccp_probe, - - TP_PROTO(struct sock *sk, size_t size), - - TP_ARGS(sk, size), - - TP_STRUCT__entry( - /* sockaddr_in6 is always bigger than sockaddr_in */ - __array(__u8, saddr, sizeof(struct sockaddr_in6)) - __array(__u8, daddr, sizeof(struct sockaddr_in6)) - __field(__u16, sport) - __field(__u16, dport) - __field(__u16, size) - __field(__u16, tx_s) - __field(__u32, tx_rtt) - __field(__u32, tx_p) - __field(__u32, tx_x_calc) - __field(__u64, tx_x_recv) - __field(__u64, tx_x) - __field(__u32, tx_t_ipi) - ), - - TP_fast_assign( - const struct inet_sock *inet = inet_sk(sk); - struct ccid3_hc_tx_sock *hc = NULL; - - if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) - hc = ccid3_hc_tx_sk(sk); - - memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); - memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); - - TP_STORE_ADDR_PORTS(__entry, inet, sk); - - /* For filtering use */ - __entry->sport = ntohs(inet->inet_sport); - __entry->dport = ntohs(inet->inet_dport); - - __entry->size = size; - if (hc) { - __entry->tx_s = hc->tx_s; - __entry->tx_rtt = hc->tx_rtt; - __entry->tx_p = hc->tx_p; - __entry->tx_x_calc = hc->tx_x_calc; - __entry->tx_x_recv = hc->tx_x_recv >> 6; - __entry->tx_x = hc->tx_x >> 6; - __entry->tx_t_ipi = hc->tx_t_ipi; - } else { - __entry->tx_s = 0; - memset_startat(__entry, 0, tx_rtt); - } - ), - - TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " - "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", - __entry->saddr, __entry->daddr, __entry->size, - __entry->tx_s, __entry->tx_rtt, __entry->tx_p, - __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x, - __entry->tx_t_ipi) -); - -#endif /* _TRACE_TCP_H */ - -/* This part must be outside protection */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace -#include diff --git a/net/devlink/dev.c b/net/devlink/dev.c index d6e3db300acbe..02602704bdeaa 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -775,7 +775,7 @@ static int devlink_info_version_put(struct devlink_info_req *req, int attr, req->version_cb(version_name, version_type, req->version_cb_priv); - if (!req->msg) + if (!req->msg || !*version_value) return 0; nest = nla_nest_start_noflag(req->msg, attr); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 1b1b700ec05ea..0d1e56965af0a 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -761,6 +761,11 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], if (res) goto err_unregister; + if (protocol_version == PRP_V1) { + eth_hw_addr_set(slave[1], slave[0]->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, slave[1]); + } + if (interlink) { res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack); if (res) diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index d7ae32473c41a..192893c3f2ec7 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -78,6 +78,15 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, eth_hw_addr_set(master->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); + + if (hsr->prot_version == PRP_V1) { + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + if (port) { + eth_hw_addr_set(port->dev, dev->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, + port->dev); + } + } } /* Make sure we recognize frames from ourselves in hsr_rcv() */ diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 1bc47b17a2968..135ec5fce0196 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -155,6 +155,7 @@ struct hsr_port { struct hsr_priv *hsr; enum hsr_port_type type; struct rcu_head rcu; + unsigned char original_macaddress[ETH_ALEN]; }; struct hsr_frame_info; diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 2a802a5de2acc..b87b6a6fe0700 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -196,6 +196,7 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, port->hsr = hsr; port->dev = dev; port->type = type; + ether_addr_copy(port->original_macaddress, dev->dev_addr); if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); @@ -232,6 +233,7 @@ void hsr_del_port(struct hsr_port *port) if (!port->hsr->fwd_offloaded) dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); + eth_hw_addr_set(port->dev, port->original_macaddress); } kfree_rcu(port, rcu); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6d2c97f8e9ef9..12850a277251d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -425,7 +425,7 @@ config INET_DIAG tristate "INET: socket monitoring interface" default y help - Support for INET (TCP, DCCP, etc) socket monitoring interface used by + Support for INET (TCP, UDP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at: diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5df1f1325259d..76e38092cd8a3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1328,10 +1328,7 @@ int inet_sk_rebuild_header(struct sock *sk) /* Routing failed... */ sk->sk_route_caps = 0; - /* - * Other protocols have to map its equivalent state to TCP_SYN_SENT. - * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme - */ + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 77e5705ac799e..c47d3828d4f65 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1792,12 +1792,12 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct ifaddrmsg *ifm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); return -EINVAL; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3f4e629998fab..57f088e5540e2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -948,12 +948,12 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, if (filter->rtnl_held) ASSERT_RTNL(); - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } - rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index dd5cf8914a28d..20915895bdaaf 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -330,7 +330,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); @@ -512,10 +512,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; @@ -767,7 +767,6 @@ void inet_csk_init_xmit_timers(struct sock *sk, timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } -EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) { @@ -780,7 +779,6 @@ void inet_csk_clear_xmit_timers(struct sock *sk) sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } -EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_clear_xmit_timers_sync(struct sock *sk) { @@ -831,7 +829,6 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } -EXPORT_SYMBOL_GPL(inet_csk_route_req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, @@ -898,7 +895,6 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) req->num_retrans++; return err; } -EXPORT_SYMBOL(inet_rtx_syn_ack); static struct request_sock * reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, @@ -1026,9 +1022,10 @@ static bool reqsk_queue_unlink(struct request_sock *req) bool found = false; if (sk_hashed(sk)) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); - spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); + spinlock_t *lock; + lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); @@ -1058,14 +1055,13 @@ bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { return __inet_csk_reqsk_queue_drop(sk, req, false); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); +EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { @@ -1209,7 +1205,6 @@ bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, inet_csk_reqsk_queue_added(sk); return true; } -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t priority) @@ -1290,7 +1285,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, return newsk; } -EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this @@ -1322,7 +1316,7 @@ void inet_csk_destroy_sock(struct sock *sk) EXPORT_SYMBOL(inet_csk_destroy_sock); /* This function allows to force a closure of a socket after the call to - * tcp/dccp_create_openreq_child(). + * tcp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) __releases(&sk->sk_lock.slock) @@ -1380,7 +1374,6 @@ int inet_csk_listen_start(struct sock *sk) inet_sk_set_state(sk, TCP_CLOSE); return err; } -EXPORT_SYMBOL_GPL(inet_csk_listen_start); static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) @@ -1475,7 +1468,6 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, sock_put(child); return NULL; } -EXPORT_SYMBOL(inet_csk_complete_hashdance); /* * This routine closes sockets which have been at least partially @@ -1590,4 +1582,3 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) out: return dst; } -EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c2bb91d9e9ffd..907bad776b423 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1369,8 +1369,6 @@ static int inet_diag_type2proto(int type) switch (type) { case TCPDIAG_GETSOCK: return IPPROTO_TCP; - case DCCPDIAG_GETSOCK: - return IPPROTO_DCCP; default: return 0; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 5bf163f756e95..da85cc30e3824 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -176,7 +176,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, */ static void __inet_put_port(struct sock *sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; @@ -215,7 +215,7 @@ EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { - struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *table = tcp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; struct inet_bind_hashbucket *head, *head2; bool created_inet_bind_bucket = false; @@ -668,7 +668,7 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk, */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_ehash_bucket *head; struct hlist_nulls_head *list; spinlock_t *lock; @@ -713,7 +713,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) } return ok; } -EXPORT_SYMBOL_GPL(inet_ehash_nolisten); +EXPORT_IPV6_MOD(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) @@ -740,7 +740,7 @@ static int inet_reuseport_add_sock(struct sock *sk, int __inet_hash(struct sock *sk, struct sock *osk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; @@ -771,7 +771,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) return err; } -EXPORT_SYMBOL(__inet_hash); +EXPORT_IPV6_MOD(__inet_hash); int inet_hash(struct sock *sk) { @@ -782,11 +782,10 @@ int inet_hash(struct sock *sk) return err; } -EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); if (sk_unhashed(sk)) return; @@ -823,7 +822,7 @@ void inet_unhash(struct sock *sk) spin_unlock_bh(lock); } } -EXPORT_SYMBOL_GPL(inet_unhash); +EXPORT_IPV6_MOD(inet_unhash); static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, @@ -874,7 +873,7 @@ inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); u32 hash; #if IS_ENABLED(CONFIG_IPV6) @@ -902,7 +901,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family) static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); @@ -982,14 +981,14 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { return __inet_bhash2_update_saddr(sk, saddr, family, false); } -EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); +EXPORT_IPV6_MOD(inet_bhash2_update_saddr); void inet_bhash2_reset_saddr(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) __inet_bhash2_update_saddr(sk, NULL, 0, true); } -EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); +EXPORT_IPV6_MOD(inet_bhash2_reset_saddr); /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') @@ -1214,7 +1213,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet_check_established); } -EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { @@ -1265,7 +1263,6 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h) init_hashinfo_lhash2(h); return 0; } -EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { @@ -1305,7 +1302,6 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) hashinfo->ehash_locks_mask = nblocks - 1; return 0; } -EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) @@ -1341,7 +1337,6 @@ struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, err: return NULL; } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { @@ -1352,4 +1347,3 @@ void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) vfree(hashinfo->ehash); kfree(hashinfo); } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index aded4bf1bc16d..67efe95015816 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -166,7 +166,6 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, spin_unlock(lock); local_bh_enable(); } -EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { @@ -223,7 +222,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, return tw; } -EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* These are always called from BH context. See callers in * tcp_input.c to verify this. @@ -306,7 +304,6 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo) @@ -365,4 +362,3 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo) rcu_read_unlock(); } } -EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 26d15f907551e..f5b9004d6938b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1066,16 +1066,15 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipgre_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit_batch_rtnl = ipgre_exit_batch_rtnl, + .exit_rtnl = ipgre_exit_rtnl, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1752,16 +1751,15 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipgre_tap_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops, - dev_to_kill); + ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl, + .exit_rtnl = ipgre_tap_exit_rtnl, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1772,16 +1770,15 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit erspan_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit_batch_rtnl = erspan_exit_batch_rtnl, + .exit_rtnl = erspan_exit_rtnl, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 1024f961ec9ad..3913ec89ad207 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1174,13 +1174,16 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, - struct list_head *head, - struct rtnl_link_ops *ops) +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *head) { + struct ip_tunnel_net *itn = net_generic(net, id); struct net_device *dev, *aux; int h; + ASSERT_RTNL_NET(net); + for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == ops) unregister_netdevice_queue(dev, head); @@ -1198,21 +1201,7 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, unregister_netdevice_queue(t->dev, head); } } - -void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill) -{ - struct ip_tunnel_net *itn; - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - itn = net_generic(net, id); - ip_tunnel_destroy(net, itn, dev_to_kill, ops); - } -} -EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 159b4473290e0..686e4f3d83aa7 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -523,16 +523,15 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit vti_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit_batch_rtnl = vti_exit_batch_rtnl, + .exit_rtnl = vti_exit_rtnl, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index bab0bf90c908d..3e03af073a1cc 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -604,16 +604,15 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipip_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit_batch_rtnl = ipip_exit_batch_rtnl, + .exit_rtnl = ipip_exit_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a8b04d4abcaae..3b5677d8467ef 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2511,7 +2511,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request"); return -EINVAL; } @@ -2520,7 +2521,6 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || @@ -2836,7 +2836,8 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump"); return -EINVAL; } @@ -2846,7 +2847,6 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request"); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 467151517023d..d9cf06b297d11 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -4040,14 +4040,11 @@ void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, } EXPORT_SYMBOL(nexthop_res_grp_activity_update); -static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit nexthop_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - flush_all_nexthops(net); + ASSERT_RTNL_NET(net); + flush_all_nexthops(net); } static void __net_exit nexthop_net_exit(struct net *net) @@ -4072,7 +4069,7 @@ static int __net_init nexthop_net_init(struct net *net) static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit = nexthop_net_exit, - .exit_batch_rtnl = nexthop_net_exit_batch_rtnl, + .exit_rtnl = nexthop_net_exit_rtnl, }; static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = { diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 10cbeb76c2745..ea2f01584379a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -191,6 +191,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), + SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 753704f75b2c6..49cffbe838029 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2699,8 +2699,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, if (fl4->saddr) { if (ipv4_is_multicast(fl4->saddr) || - ipv4_is_lbcast(fl4->saddr) || - ipv4_is_zeronet(fl4->saddr)) { + ipv4_is_lbcast(fl4->saddr)) { rth = ERR_PTR(-EINVAL); goto out; } @@ -3206,7 +3205,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for route get request"); return -EINVAL; @@ -3216,7 +3216,6 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_table || rtm->rtm_protocol || diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6edc441b37023..e0e96f8fd47cb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1160,6 +1160,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (skb) copy = size_goal - skb->len; + trace_tcp_sendmsg_locked(sk, msg, skb, size_goal); + if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8cce0d5489dae..d5b5c32115d2e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2417,7 +2417,8 @@ int tcp_v4_rcv(struct sk_buff *skb) goto csum_error; } - tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index fb9349be36b80..43d7852ce07e0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -97,7 +97,8 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th, u32 *tw_isn) + const struct tcphdr *th, u32 *tw_isn, + enum skb_drop_reason *drop_reason) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); @@ -245,8 +246,10 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, return TCP_TW_SYN; } - if (paws_reject) - __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + if (paws_reject) { + *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); + } if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2742cc7602bb5..f9f5b92cf4b61 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1942,8 +1942,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, error = -EAGAIN; do { spin_lock_bh(&queue->lock); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb) { if (!(flags & MSG_PEEK)) udp_skb_destructor(sk, skb); @@ -1964,8 +1964,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb && !(flags & MSG_PEEK)) udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); @@ -2897,20 +2897,40 @@ void udp_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (udp_test_bit(ENCAP_ENABLED, sk)) + if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); + udp_tunnel_cleanup_gro(sk); + } } } +typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family, struct sock *sk) { #ifdef CONFIG_XFRM + udp_gro_receive_t new_gro_receive; + if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) { - if (family == AF_INET) - WRITE_ONCE(udp_sk(sk)->gro_receive, xfrm4_gro_udp_encap_rcv); - else if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) - WRITE_ONCE(udp_sk(sk)->gro_receive, ipv6_stub->xfrm6_gro_udp_encap_rcv); + if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) + new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv; + else + new_gro_receive = xfrm4_gro_udp_encap_rcv; + + if (udp_sk(sk)->gro_receive != new_gro_receive) { + /* + * With IPV6_ADDRFORM the gro callback could change + * after being set, unregister the old one, if valid. + */ + if (udp_sk(sk)->gro_receive) + udp_tunnel_update_gro_rcv(sk, false); + + WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive); + udp_tunnel_update_gro_rcv(sk, true); + } } #endif } @@ -2960,6 +2980,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_ENCAP: + sockopt_lock_sock(sk); switch (val) { case 0: #ifdef CONFIG_XFRM @@ -2983,6 +3004,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, err = -ENOPROTOOPT; break; } + sockopt_release_sock(sk); break; case UDP_NO_CHECK6_TX: @@ -3000,13 +3022,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_GRO: - + sockopt_lock_sock(sk); /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk); udp_assign_bit(GRO_ENABLED, sk, valbool); udp_assign_bit(ACCEPT_L4, sk, valbool); set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk); + sockopt_release_sock(sk); break; /* @@ -3810,6 +3833,15 @@ static void __net_init udp_set_table(struct net *net) static int __net_init udp_pernet_init(struct net *net) { +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + int i; + + /* No tunnel is configured */ + for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { + INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); + RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); + } +#endif udp_sysctl_init(net); udp_set_table(net); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2c0725583be39..787b2947d3a04 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -12,6 +12,169 @@ #include #include #include +#include + +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + +/* + * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL + * values for the udp tunnel static call. + */ +static struct sk_buff *dummy_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->flush = 1; + return NULL; +} + +typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + +struct udp_tunnel_type_entry { + udp_tunnel_gro_rcv_t gro_receive; + refcount_t count; +}; + +#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \ + IS_ENABLED(CONFIG_VXLAN) * 2 + \ + IS_ENABLED(CONFIG_NET_FOU) * 2 + \ + IS_ENABLED(CONFIG_XFRM) * 2) + +DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv); +static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call); +static struct mutex udp_tunnel_gro_type_lock; +static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES]; +static unsigned int udp_tunnel_gro_type_nr; +static DEFINE_SPINLOCK(udp_tunnel_gro_lock); + +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + struct udp_sock *tup, *up = udp_sk(sk); + struct udp_tunnel_gro *udp_tunnel_gro; + + spin_lock(&udp_tunnel_gro_lock); + udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6]; + if (add) + hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list); + else if (up->tunnel_list.pprev) + hlist_del_init(&up->tunnel_list); + + if (udp_tunnel_gro->list.first && + !udp_tunnel_gro->list.first->next) { + tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock, + tunnel_list); + + rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup); + } else { + RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL); + } + + spin_unlock(&udp_tunnel_gro_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); + +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) +{ + struct udp_tunnel_type_entry *cur = NULL; + struct udp_sock *up = udp_sk(sk); + int i, old_gro_type_nr; + + if (!UDP_MAX_TUNNEL_TYPES || !up->gro_receive) + return; + + mutex_lock(&udp_tunnel_gro_type_lock); + + /* Check if the static call is permanently disabled. */ + if (udp_tunnel_gro_type_nr > UDP_MAX_TUNNEL_TYPES) + goto out; + + for (i = 0; i < udp_tunnel_gro_type_nr; i++) + if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive) + cur = &udp_tunnel_gro_types[i]; + + old_gro_type_nr = udp_tunnel_gro_type_nr; + if (add) { + /* + * Update the matching entry, if found, or add a new one + * if needed + */ + if (cur) { + refcount_inc(&cur->count); + goto out; + } + + if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) { + pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n"); + /* Ensure static call will never be enabled */ + udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 1; + } else { + cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++]; + refcount_set(&cur->count, 1); + cur->gro_receive = up->gro_receive; + } + } else { + /* + * The stack cleanups only successfully added tunnel, the + * lookup on removal should never fail. + */ + if (WARN_ON_ONCE(!cur)) + goto out; + + if (!refcount_dec_and_test(&cur->count)) + goto out; + + /* Avoid gaps, so that the enable tunnel has always id 0 */ + *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr]; + } + + if (udp_tunnel_gro_type_nr == 1) { + static_call_update(udp_tunnel_gro_rcv, + udp_tunnel_gro_types[0].gro_receive); + static_branch_enable(&udp_tunnel_static_call); + } else if (old_gro_type_nr == 1) { + static_branch_disable(&udp_tunnel_static_call); + static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv); + } + +out: + mutex_unlock(&udp_tunnel_gro_type_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv); + +static void udp_tunnel_gro_init(void) +{ + mutex_init(&udp_tunnel_gro_type_lock); +} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + if (static_branch_likely(&udp_tunnel_static_call)) { + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + return static_call(udp_tunnel_gro_rcv)(sk, head, skb); + } + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + +#else + +static void udp_tunnel_gro_init(void) {} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + +#endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -622,7 +785,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); + pp = udp_tunnel_gro_rcv(sk, head, skb); out: skb_gro_flush_final(skb, pp, flush); @@ -635,8 +798,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, false); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, @@ -767,5 +935,7 @@ int __init udpv4_offload_init(void) .gro_complete = udp4_gro_complete, }, }; + + udp_tunnel_gro_init(); return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 619a53eb672da..2326548997d3f 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -58,6 +58,15 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, } EXPORT_SYMBOL(udp_sock_create4); +static bool sk_saddr_any(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#else + return !sk->sk_rcv_saddr; +#endif +} + void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { @@ -80,6 +89,12 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); + + udp_tunnel_update_gro_rcv(sk, true); + + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) && + sk->sk_kern_sock) + udp_tunnel_update_gro_lookup(net, sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9ba83f0c99283..ec6c4ca6d6d99 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5346,12 +5346,12 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct ifaddrmsg *ifm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request"); return -EINVAL; @@ -5484,7 +5484,8 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, struct ifaddrmsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request"); return -EINVAL; } @@ -5493,7 +5494,6 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request"); return -EINVAL; @@ -6112,7 +6112,8 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request"); return -EINVAL; } @@ -6122,7 +6123,6 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request"); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index ab054f329e12d..fb63ffbcfc647 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -473,12 +473,12 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, { struct ifaddrlblmsg *ifal; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request"); return -EINVAL; } - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_prefixlen || ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request"); @@ -543,7 +543,8 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, struct ifaddrlblmsg *ifal; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request"); return -EINVAL; } @@ -552,7 +553,6 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, extack); - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request"); return -EINVAL; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f60ec8b0f8ea4..acaff12967835 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -715,6 +715,7 @@ const struct proto_ops inet6_stream_ops = { #endif .set_rcvlowat = tcp_set_rcvlowat, }; +EXPORT_SYMBOL_GPL(inet6_stream_ops); const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, @@ -881,7 +882,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, } return false; } -EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index dbcf556a35bbc..8f500eaf33cfc 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -54,7 +54,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, return dst; } -EXPORT_SYMBOL(inet6_csk_route_req); static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) @@ -137,4 +136,3 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; } -EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu); diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 09065187378e6..40df8bdfaacdd 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -38,6 +38,7 @@ struct ioam6_lwt_freq { }; struct ioam6_lwt { + struct dst_entry null_dst; struct dst_cache cache; struct ioam6_lwt_freq freq; atomic_t pkt_cnt; @@ -177,6 +178,14 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, if (err) goto free_lwt; + /* This "fake" dst_entry will be stored in a dst_cache, which will call + * dst_hold() and dst_release() on it. We must ensure that dst_destroy() + * will never be called. For that, its initial refcount is 1 and +1 when + * it is stored in the cache. Then, +1/-1 each time we read the cache + * and release it. Long story short, we're fine. + */ + dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); + atomic_set(&ilwt->pkt_cnt, 0); ilwt->freq.k = freq_k; ilwt->freq.n = freq_n; @@ -336,7 +345,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL; + struct dst_entry *orig_dst = skb_dst(skb); + struct dst_entry *dst = NULL; struct ioam6_lwt *ilwt; int err = -EINVAL; u32 pkt_cnt; @@ -344,7 +354,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ilwt = ioam6_lwt_state(dst->lwtstate); + ilwt = ioam6_lwt_state(orig_dst->lwtstate); /* Check for insertion frequency (i.e., "k over n" insertions) */ pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt); @@ -352,9 +362,20 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto out; local_bh_disable(); - cache_dst = dst_cache_get(&ilwt->cache); + dst = dst_cache_get(&ilwt->cache); local_bh_enable(); + /* This is how we notify that the destination does not change after + * transformation and that we need to use orig_dst instead of the cache + */ + if (dst == &ilwt->null_dst) { + dst_release(dst); + + dst = orig_dst; + /* keep refcount balance: dst_release() is called at the end */ + dst_hold(dst); + } + switch (ilwt->mode) { case IOAM6_IPTUNNEL_MODE_INLINE: do_inline: @@ -362,7 +383,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) goto out; - err = ioam6_do_inline(net, skb, &ilwt->tuninfo, cache_dst); + err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst); if (unlikely(err)) goto drop; @@ -372,7 +393,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) /* Encapsulation (ip6ip6) */ err = ioam6_do_encap(net, skb, &ilwt->tuninfo, ilwt->has_tunsrc, &ilwt->tunsrc, - &ilwt->tundst, cache_dst); + &ilwt->tundst, dst); if (unlikely(err)) goto drop; @@ -390,7 +411,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - if (unlikely(!cache_dst)) { + if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -401,20 +422,27 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = hdr->nexthdr; - cache_dst = ip6_route_output(net, NULL, &fl6); - if (cache_dst->error) { - err = cache_dst->error; + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + err = dst->error; goto drop; } - /* cache only if we don't create a dst reference loop */ - if (dst->lwtstate != cache_dst->lwtstate) { - local_bh_disable(); - dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); - local_bh_enable(); - } - - err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); + /* If the destination is the same after transformation (which is + * a valid use case for IOAM), then we don't want to add it to + * the cache in order to avoid a reference loop. Instead, we add + * our fake dst_entry to the cache as a way to detect this case. + * Otherwise, we add the resolved destination to the cache. + */ + local_bh_disable(); + if (orig_dst->lwtstate == dst->lwtstate) + dst_cache_set_ip6(&ilwt->cache, + &ilwt->null_dst, &fl6.saddr); + else + dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); + local_bh_enable(); + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) goto drop; } @@ -422,22 +450,26 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) /* avoid lwtunnel_output() reentry loop when destination is the same * after transformation (e.g., with the inline mode) */ - if (dst->lwtstate != cache_dst->lwtstate) { + if (orig_dst->lwtstate != dst->lwtstate) { skb_dst_drop(skb); - skb_dst_set(skb, cache_dst); + skb_dst_set(skb, dst); return dst_output(net, sk, skb); } out: - dst_release(cache_dst); - return dst->lwtstate->orig_output(net, sk, skb); + dst_release(dst); + return orig_dst->lwtstate->orig_output(net, sk, skb); drop: - dst_release(cache_dst); + dst_release(dst); kfree_skb(skb); return err; } static void ioam6_destroy_state(struct lwtunnel_state *lwt) { + /* Since the refcount of per-cpu dst_entry caches will never be 0 (see + * why above) when our "fake" dst_entry is used, it is not necessary to + * remove them before calling dst_cache_destroy() + */ dst_cache_destroy(&ioam6_lwt_state(lwt)->cache); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 957ca98fa70f9..2dc9dcffe2cab 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1570,7 +1570,7 @@ static struct inet6_protocol ip6gre_protocol __read_mostly = { .flags = INET6_PROTO_FINAL, }; -static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) +static void __net_exit ip6gre_exit_rtnl_net(struct net *net, struct list_head *head) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct net_device *dev, *aux; @@ -1587,16 +1587,16 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { struct ip6_tnl *t; - t = rtnl_dereference(ign->tunnels[prio][h]); + t = rtnl_net_dereference(net, ign->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1640,19 +1640,9 @@ static int __net_init ip6gre_init_net(struct net *net) return err; } -static void __net_exit ip6gre_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - ip6gre_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit_batch_rtnl = ip6gre_exit_batch_rtnl, + .exit_rtnl = ip6gre_exit_rtnl_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 581bc62890818..ef052ccd93800 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -259,7 +259,7 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk) } /* - * xmit an sk_buff (used by TCP, SCTP and DCCP) + * xmit an sk_buff (used by TCP and SCTP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a04dd1bb4b195..894d3158a6f05 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2210,7 +2210,7 @@ static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) +static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; @@ -2222,25 +2222,27 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } @@ -2287,19 +2289,9 @@ static int __net_init ip6_tnl_init_net(struct net *net) return err; } -static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - ip6_tnl_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, + .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 09ec4b0ad7dc6..40464a88bca65 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1112,21 +1112,21 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, - struct list_head *list) +static void __net_exit vti6_exit_rtnl_net(struct net *net, struct list_head *list) { - int h; + struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct ip6_tnl *t; + int h; for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); if (t) unregister_netdevice_queue(t->dev, list); } @@ -1170,22 +1170,9 @@ static int __net_init vti6_init_net(struct net *net) return err; } -static void __net_exit vti6_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct vti6_net *ip6n; - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - ip6n = net_generic(net, vti6_net_id); - vti6_destroy_tunnels(ip6n, dev_to_kill); - } -} - static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit_batch_rtnl = vti6_exit_batch_rtnl, + .exit_rtnl = vti6_exit_rtnl_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 96f1621e2381c..945857a8bfe38 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6030,7 +6030,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; @@ -6040,7 +6041,6 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 9a0f32acb7500..a72dbca9e8fca 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1804,8 +1804,7 @@ static struct xfrm_tunnel mplsip_handler __read_mostly = { }; #endif -static void __net_exit sit_destroy_tunnels(struct net *net, - struct list_head *head) +static void __net_exit sit_exit_rtnl_net(struct net *net, struct list_head *head) { struct sit_net *sitn = net_generic(net, sit_net_id); struct net_device *dev, *aux; @@ -1820,15 +1819,15 @@ static void __net_exit sit_destroy_tunnels(struct net *net, for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; - t = rtnl_dereference(sitn->tunnels[prio][h]); + t = rtnl_net_dereference(net, sitn->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1881,19 +1880,9 @@ static int __net_init sit_init_net(struct net *net) return err; } -static void __net_exit sit_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - sit_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit_batch_rtnl = sit_exit_batch_rtnl, + .exit_rtnl = sit_exit_rtnl_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b03c223eda4fa..7dcb33f879ee1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1970,7 +1970,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) goto csum_error; } - tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); switch (tw_status) { case TCP_TW_SYN: { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 024458ef163c9..7317f8e053f1c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -1825,6 +1826,7 @@ void udpv6_destroy_sock(struct sock *sk) if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); + udp_tunnel_cleanup_gro(sk); } } } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 404212dfc99ab..d8445ac1b2e43 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -118,8 +118,13 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct ipv6hdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, true); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 1f63b32d76d67..d536c97144e9d 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2095,12 +2095,12 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct rtmsg *rtm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request"); return -EINVAL; } - rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { @@ -2288,7 +2288,8 @@ static int mpls_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; @@ -2298,7 +2299,6 @@ static int mpls_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_dst_len && rtm->rtm_dst_len != 20) || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 19eb9292bd609..0c24545f0e8df 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -28,6 +28,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), + SNMP_MIB_ITEM("MPJoinRejected", MPTCP_MIB_JOINREJECTED), SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX), SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR), SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 128282982843a..250c6b77977e8 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -23,6 +23,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ + MPTCP_MIB_JOINREJECTED, /* The PM rejected the JOIN request */ MPTCP_MIB_JOINSYNTX, /* Sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXCREATSKERR, /* Not able to create a socket when sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXBINDERR, /* Not able to bind() the address when sending a SYN + MP_JOIN */ diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 31747f974941f..1306d4dc287b8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -151,10 +151,13 @@ bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; + bool ret; entry = mptcp_pm_del_add_timer(msk, addr, false); + ret = entry; kfree(entry); - return entry; + + return ret; } bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 44f7ab463d755..26ffa06c21e8d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3527,8 +3527,10 @@ bool mptcp_finish_join(struct sock *ssk) return true; } - if (!mptcp_pm_allow_new_subflow(msk)) + if (!mptcp_pm_allow_new_subflow(msk)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_JOINREJECTED); goto err_prohibited; + } /* If we can't acquire msk socket lock here, let the release callback * handle it diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d409586b5977f..7aa38d74fef6b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -744,6 +744,7 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); struct mptcp_sched_ops *mptcp_sched_find(const char *name); +int mptcp_validate_scheduler(struct mptcp_sched_ops *sched); int mptcp_register_scheduler(struct mptcp_sched_ops *sched); void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); void mptcp_sched_init(void); diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index c16c6fbd4ba2f..1e59072d478c9 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -16,8 +16,7 @@ static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); -static int mptcp_sched_default_get_send(struct mptcp_sock *msk, - struct mptcp_sched_data *data) +static int mptcp_sched_default_get_send(struct mptcp_sock *msk) { struct sock *ssk; @@ -29,8 +28,7 @@ static int mptcp_sched_default_get_send(struct mptcp_sock *msk, return 0; } -static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk, - struct mptcp_sched_data *data) +static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk) { struct sock *ssk; @@ -84,10 +82,23 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen) rcu_read_unlock(); } -int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +int mptcp_validate_scheduler(struct mptcp_sched_ops *sched) { - if (!sched->get_send) + if (!sched->get_send) { + pr_err("%s does not implement required ops\n", sched->name); return -EINVAL; + } + + return 0; +} + +int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +{ + int ret; + + ret = mptcp_validate_scheduler(sched); + if (ret) + return ret; spin_lock(&mptcp_sched_list_lock); if (mptcp_sched_find(sched->name)) { @@ -157,7 +168,6 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -178,14 +188,13 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) } if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_send(msk, data); - return msk->sched->get_send(msk, data); + return mptcp_sched_default_get_send(msk); + return msk->sched->get_send(msk); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -199,8 +208,8 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) } if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_retrans(msk, data); + return mptcp_sched_default_get_retrans(msk); if (msk->sched->get_retrans) - return msk->sched->get_retrans(msk, data); - return msk->sched->get_send(msk, data); + return msk->sched->get_retrans(msk); + return msk->sched->get_send(msk); } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 24c2de1891bdf..15613d691bfef 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -247,6 +247,7 @@ static int subflow_check_req(struct request_sock *req, if (unlikely(req->syncookie)) { if (!mptcp_can_accept_new_subflow(subflow_req->msk)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } @@ -745,15 +746,11 @@ struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *op EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ -static bool subflow_hmac_valid(const struct request_sock *req, +static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req, const struct mptcp_options_received *mp_opt) { - const struct mptcp_subflow_request_sock *subflow_req; + struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; - struct mptcp_sock *msk; - - subflow_req = mptcp_subflow_rsk(req); - msk = subflow_req->msk; subflow_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), @@ -899,13 +896,14 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, goto dispose_child; } - if (!subflow_hmac_valid(req, &mp_opt)) { + if (!subflow_hmac_valid(subflow_req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } if (!mptcp_can_accept_new_subflow(owner)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 4e0842df5234e..2c260f33b55cc 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -143,16 +143,15 @@ struct ncsi_channel_vlan_filter { }; struct ncsi_channel_stats { - u32 hnc_cnt_hi; /* Counter cleared */ - u32 hnc_cnt_lo; /* Counter cleared */ - u32 hnc_rx_bytes; /* Rx bytes */ - u32 hnc_tx_bytes; /* Tx bytes */ - u32 hnc_rx_uc_pkts; /* Rx UC packets */ - u32 hnc_rx_mc_pkts; /* Rx MC packets */ - u32 hnc_rx_bc_pkts; /* Rx BC packets */ - u32 hnc_tx_uc_pkts; /* Tx UC packets */ - u32 hnc_tx_mc_pkts; /* Tx MC packets */ - u32 hnc_tx_bc_pkts; /* Tx BC packets */ + u64 hnc_cnt; /* Counter cleared */ + u64 hnc_rx_bytes; /* Rx bytes */ + u64 hnc_tx_bytes; /* Tx bytes */ + u64 hnc_rx_uc_pkts; /* Rx UC packets */ + u64 hnc_rx_mc_pkts; /* Rx MC packets */ + u64 hnc_rx_bc_pkts; /* Rx BC packets */ + u64 hnc_tx_uc_pkts; /* Tx UC packets */ + u64 hnc_tx_mc_pkts; /* Tx MC packets */ + u64 hnc_tx_bc_pkts; /* Tx BC packets */ u32 hnc_fcs_err; /* FCS errors */ u32 hnc_align_err; /* Alignment errors */ u32 hnc_false_carrier; /* False carrier detection */ @@ -181,7 +180,7 @@ struct ncsi_channel_stats { u32 hnc_tx_1023_frames; /* Tx 512-1023 bytes frames */ u32 hnc_tx_1522_frames; /* Tx 1024-1522 bytes frames */ u32 hnc_tx_9022_frames; /* Tx 1523-9022 bytes frames */ - u32 hnc_rx_valid_bytes; /* Rx valid bytes */ + u64 hnc_rx_valid_bytes; /* Rx valid bytes */ u32 hnc_rx_runt_pkts; /* Rx error runt packets */ u32 hnc_rx_jabber_pkts; /* Rx error jabber packets */ u32 ncsi_rx_cmds; /* Rx NCSI commands */ diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index f2f3b5c1b9412..24edb27379724 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -252,16 +252,15 @@ struct ncsi_rsp_gp_pkt { /* Get Controller Packet Statistics */ struct ncsi_rsp_gcps_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ - __be32 cnt_hi; /* Counter cleared */ - __be32 cnt_lo; /* Counter cleared */ - __be32 rx_bytes; /* Rx bytes */ - __be32 tx_bytes; /* Tx bytes */ - __be32 rx_uc_pkts; /* Rx UC packets */ - __be32 rx_mc_pkts; /* Rx MC packets */ - __be32 rx_bc_pkts; /* Rx BC packets */ - __be32 tx_uc_pkts; /* Tx UC packets */ - __be32 tx_mc_pkts; /* Tx MC packets */ - __be32 tx_bc_pkts; /* Tx BC packets */ + __be64 cnt; /* Counter cleared */ + __be64 rx_bytes; /* Rx bytes */ + __be64 tx_bytes; /* Tx bytes */ + __be64 rx_uc_pkts; /* Rx UC packets */ + __be64 rx_mc_pkts; /* Rx MC packets */ + __be64 rx_bc_pkts; /* Rx BC packets */ + __be64 tx_uc_pkts; /* Tx UC packets */ + __be64 tx_mc_pkts; /* Tx MC packets */ + __be64 tx_bc_pkts; /* Tx BC packets */ __be32 fcs_err; /* FCS errors */ __be32 align_err; /* Alignment errors */ __be32 false_carrier; /* False carrier detection */ @@ -290,11 +289,11 @@ struct ncsi_rsp_gcps_pkt { __be32 tx_1023_frames; /* Tx 512-1023 bytes frames */ __be32 tx_1522_frames; /* Tx 1024-1522 bytes frames */ __be32 tx_9022_frames; /* Tx 1523-9022 bytes frames */ - __be32 rx_valid_bytes; /* Rx valid bytes */ + __be64 rx_valid_bytes; /* Rx valid bytes */ __be32 rx_runt_pkts; /* Rx error runt packets */ __be32 rx_jabber_pkts; /* Rx error jabber packets */ __be32 checksum; /* Checksum */ -}; +} __packed __aligned(4); /* Get NCSI Statistics */ struct ncsi_rsp_gns_pkt { diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 4a8ce2949faea..8668888c5a2f9 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -926,16 +926,15 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) /* Update HNC's statistics */ ncs = &nc->stats; - ncs->hnc_cnt_hi = ntohl(rsp->cnt_hi); - ncs->hnc_cnt_lo = ntohl(rsp->cnt_lo); - ncs->hnc_rx_bytes = ntohl(rsp->rx_bytes); - ncs->hnc_tx_bytes = ntohl(rsp->tx_bytes); - ncs->hnc_rx_uc_pkts = ntohl(rsp->rx_uc_pkts); - ncs->hnc_rx_mc_pkts = ntohl(rsp->rx_mc_pkts); - ncs->hnc_rx_bc_pkts = ntohl(rsp->rx_bc_pkts); - ncs->hnc_tx_uc_pkts = ntohl(rsp->tx_uc_pkts); - ncs->hnc_tx_mc_pkts = ntohl(rsp->tx_mc_pkts); - ncs->hnc_tx_bc_pkts = ntohl(rsp->tx_bc_pkts); + ncs->hnc_cnt = be64_to_cpu(rsp->cnt); + ncs->hnc_rx_bytes = be64_to_cpu(rsp->rx_bytes); + ncs->hnc_tx_bytes = be64_to_cpu(rsp->tx_bytes); + ncs->hnc_rx_uc_pkts = be64_to_cpu(rsp->rx_uc_pkts); + ncs->hnc_rx_mc_pkts = be64_to_cpu(rsp->rx_mc_pkts); + ncs->hnc_rx_bc_pkts = be64_to_cpu(rsp->rx_bc_pkts); + ncs->hnc_tx_uc_pkts = be64_to_cpu(rsp->tx_uc_pkts); + ncs->hnc_tx_mc_pkts = be64_to_cpu(rsp->tx_mc_pkts); + ncs->hnc_tx_bc_pkts = be64_to_cpu(rsp->tx_bc_pkts); ncs->hnc_fcs_err = ntohl(rsp->fcs_err); ncs->hnc_align_err = ntohl(rsp->align_err); ncs->hnc_false_carrier = ntohl(rsp->false_carrier); @@ -964,7 +963,7 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) ncs->hnc_tx_1023_frames = ntohl(rsp->tx_1023_frames); ncs->hnc_tx_1522_frames = ntohl(rsp->tx_1522_frames); ncs->hnc_tx_9022_frames = ntohl(rsp->tx_9022_frames); - ncs->hnc_rx_valid_bytes = ntohl(rsp->rx_valid_bytes); + ncs->hnc_rx_valid_bytes = be64_to_cpu(rsp->rx_valid_bytes); ncs->hnc_rx_runt_pkts = ntohl(rsp->rx_runt_pkts); ncs->hnc_rx_jabber_pkts = ntohl(rsp->rx_jabber_pkts); diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index a20986806fea9..f60b81c660789 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -67,6 +67,29 @@ config RXKAD See Documentation/networking/rxrpc.rst. +config RXGK + bool "RxRPC GSSAPI security" + select CRYPTO_KRB5 + select CRYPTO_MANAGER + select CRYPTO_KRB5ENC + select CRYPTO_AUTHENC + select CRYPTO_SKCIPHER + select CRYPTO_HASH_INFO + select CRYPTO_HMAC + select CRYPTO_CMAC + select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + select CRYPTO_CBC + select CRYPTO_CTS + select CRYPTO_AES + select CRYPTO_CAMELLIA + help + Provide the GSSAPI-based RxGK security class for AFS. Keys are added + with add_key(). + + See Documentation/networking/rxrpc.rst. + config RXPERF tristate "RxRPC test service" help diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 210b75e3179e9..c0542bae719e3 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -24,6 +24,7 @@ rxrpc-y := \ local_object.o \ misc.o \ net_ns.o \ + oob.o \ output.o \ peer_event.o \ peer_object.o \ @@ -39,6 +40,9 @@ rxrpc-y := \ rxrpc-$(CONFIG_PROC_FS) += proc.o rxrpc-$(CONFIG_RXKAD) += rxkad.o rxrpc-$(CONFIG_SYSCTL) += sysctl.o - +rxrpc-$(CONFIG_RXGK) += \ + rxgk.o \ + rxgk_app.o \ + rxgk_kdf.o obj-$(CONFIG_RXPERF) += rxperf.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 86873399f7d50..3a558c1a541e1 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -265,7 +265,10 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @gfp: Allocation flags * * Lookup or create a remote transport endpoint record for the specified - * address and return it with a ref held. + * address. + * + * Return: The peer record found with a reference, %NULL if no record is found + * or a negative error code if the address is invalid or unsupported. */ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, struct sockaddr_rxrpc *srx, gfp_t gfp) @@ -283,9 +286,11 @@ EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); /** * rxrpc_kernel_get_peer - Get a reference on a peer - * @peer: The peer to get a reference on. + * @peer: The peer to get a reference on (may be NULL). + * + * Get a reference for a remote peer record (if not NULL). * - * Get a record for the remote peer in a call. + * Return: The @peer argument. */ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer) { @@ -296,6 +301,8 @@ EXPORT_SYMBOL(rxrpc_kernel_get_peer); /** * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference * @peer: The peer to drop a ref on + * + * Drop a reference on a peer record. */ void rxrpc_kernel_put_peer(struct rxrpc_peer *peer) { @@ -320,10 +327,12 @@ EXPORT_SYMBOL(rxrpc_kernel_put_peer); * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and - * call IDs as appropriate. The call to be used is returned. + * call IDs as appropriate. * * The default socket destination address and security may be overridden by * supplying @srx and @key. + * + * Return: The new call or an error code. */ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, @@ -437,6 +446,8 @@ EXPORT_SYMBOL(rxrpc_kernel_put_call); * * Allow a kernel service to find out whether a call is still alive - whether * it has completed successfully and all received data has been consumed. + * + * Return: %true if the call is still ongoing and %false if it has completed. */ bool rxrpc_kernel_check_life(const struct socket *sock, const struct rxrpc_call *call) @@ -456,6 +467,8 @@ EXPORT_SYMBOL(rxrpc_kernel_check_life); * * Allow a kernel service to retrieve the epoch value from a service call to * see if the client at the other end rebooted. + * + * Return: The epoch of the call's connection. */ u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) { @@ -464,24 +477,20 @@ u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) EXPORT_SYMBOL(rxrpc_kernel_get_epoch); /** - * rxrpc_kernel_new_call_notification - Get notifications of new calls - * @sock: The socket to intercept received messages on - * @notify_new_call: Function to be called when new calls appear - * @discard_new_call: Function to discard preallocated calls + * rxrpc_kernel_set_notifications - Set table of callback operations + * @sock: The socket to install table upon + * @app_ops: Callback operation table to set * - * Allow a kernel service to be given notifications about new calls. + * Allow a kernel service to set a table of event notifications on a socket. */ -void rxrpc_kernel_new_call_notification( - struct socket *sock, - rxrpc_notify_new_call_t notify_new_call, - rxrpc_discard_new_call_t discard_new_call) +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - rx->notify_new_call = notify_new_call; - rx->discard_new_call = discard_new_call; + rx->app_ops = app_ops; } -EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); +EXPORT_SYMBOL(rxrpc_kernel_set_notifications); /** * rxrpc_kernel_set_max_life - Set maximum lifespan on a call @@ -624,7 +633,10 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) fallthrough; case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: - ret = rxrpc_do_sendmsg(rx, m, len); + if (m->msg_flags & MSG_OOB) + ret = rxrpc_sendmsg_oob(rx, m, len); + else + ret = rxrpc_do_sendmsg(rx, m, len); /* The socket has been unlocked */ goto out; default: @@ -659,7 +671,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - unsigned int min_sec_level; + unsigned int min_sec_level, val; u16 service_upgrade[2]; int ret; @@ -740,6 +752,26 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, rx->service_upgrade.to = service_upgrade[1]; goto success; + case RXRPC_MANAGE_RESPONSE: + ret = -EINVAL; + if (optlen != sizeof(unsigned int)) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNBOUND) + goto error; + ret = copy_safe_from_sockptr(&val, sizeof(val), + optval, optlen); + if (ret) + goto error; + ret = -EINVAL; + if (val > 1) + goto error; + if (val) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + goto success; + default: break; } @@ -846,6 +878,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->calls = RB_ROOT; spin_lock_init(&rx->incoming_lock); + skb_queue_head_init(&rx->recvmsg_oobq); + rx->pending_oobq = RB_ROOT; INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); INIT_LIST_HEAD(&rx->recvmsg_q); @@ -879,8 +913,10 @@ static int rxrpc_shutdown(struct socket *sock, int flags) lock_sock(sk); if (sk->sk_state < RXRPC_CLOSE) { + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; sk->sk_shutdown = SHUTDOWN_MASK; + spin_unlock_irq(&rx->recvmsg_lock); } else { ret = -ESHUTDOWN; } @@ -891,6 +927,23 @@ static int rxrpc_shutdown(struct socket *sock, int flags) return ret; } +/* + * Purge the out-of-band queue. + */ +static void rxrpc_purge_oob_queue(struct sock *sk) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&rx->recvmsg_oobq))) + rxrpc_kernel_free_oob(skb); + while (!RB_EMPTY_ROOT(&rx->pending_oobq)) { + skb = rb_entry(rx->pending_oobq.rb_node, struct sk_buff, rbnode); + rb_erase(&skb->rbnode, &rx->pending_oobq); + rxrpc_kernel_free_oob(skb); + } +} + /* * RxRPC socket destructor */ @@ -898,6 +951,7 @@ static void rxrpc_sock_destructor(struct sock *sk) { _enter("%p", sk); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); @@ -936,7 +990,9 @@ static int rxrpc_release_sock(struct sock *sk) break; } + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; + spin_unlock_irq(&rx->recvmsg_lock); if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); @@ -948,6 +1004,7 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 3cc3af15086ff..ca62a1db32866 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -31,6 +31,7 @@ struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; struct rxrpc_txqueue; +struct rxgk_context; /* * Mark applied to socket buffers in skb->mark. skb->priority is used @@ -39,6 +40,7 @@ struct rxrpc_txqueue; enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_ERROR, /* Error notification */ + RXRPC_SKB_MARK_CHALLENGE, /* Challenge notification */ RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ @@ -146,10 +148,12 @@ struct rxrpc_backlog { struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; - rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ - rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ + const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ + struct sk_buff_head recvmsg_oobq; /* OOB messages for recvmsg to pick up */ + struct rb_root pending_oobq; /* OOB messages awaiting userspace to respond to */ + u64 oob_id_counter; /* OOB message ID counter */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ @@ -160,6 +164,7 @@ struct rxrpc_sock { struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ +#define RXRPC_SOCK_MANAGE_RESPONSE 1 /* User wants to manage RESPONSE packets */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT @@ -203,7 +208,7 @@ struct rxrpc_host_header { */ struct rxrpc_skb_priv { union { - struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ + struct rxrpc_connection *poke_conn; /* Conn referred to (poke packet) */ struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ @@ -217,6 +222,19 @@ struct rxrpc_skb_priv { u16 nr_acks; /* Number of acks+nacks */ u8 reason; /* Reason for ack */ } ack; + struct { + struct rxrpc_connection *conn; /* Connection referred to */ + union { + u32 rxkad_nonce; + }; + } chall; + struct { + rxrpc_serial_t challenge_serial; + u32 kvno; + u32 version; + u16 len; + u16 ticket_len; + } resp; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; @@ -270,9 +288,24 @@ struct rxrpc_security { /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); + /* Validate a challenge packet */ + bool (*validate_challenge)(struct rxrpc_connection *conn, + struct sk_buff *skb); + + /* Fill out the cmsg for recvmsg() to pass on a challenge to userspace. + * The security class gets to add additional information. + */ + int (*challenge_to_recvmsg)(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg); + + /* Parse sendmsg() control message and respond to challenge. */ + int (*sendmsg_respond_to_challenge)(struct sk_buff *challenge, + struct msghdr *msg); + /* respond to a challenge */ - int (*respond_to_challenge)(struct rxrpc_connection *, - struct sk_buff *); + int (*respond_to_challenge)(struct rxrpc_connection *conn, + struct sk_buff *challenge); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, @@ -280,6 +313,11 @@ struct rxrpc_security { /* clear connection security */ void (*clear)(struct rxrpc_connection *); + + /* Default ticket -> key decoder */ + int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); }; /* @@ -526,7 +564,17 @@ struct rxrpc_connection { struct rxrpc_crypt csum_iv; /* packet checksum base */ u32 nonce; /* response re-use preventer */ } rxkad; + struct { + struct rxgk_context *keys[4]; /* (Re-)keying buffer */ + u64 start_time; /* The start time for TK derivation */ + u8 nonce[20]; /* Response re-use preventer */ + u32 enctype; /* Kerberos 5 encoding type */ + u32 key_number; /* Current key number */ + } rxgk; }; + rwlock_t security_use_lock; /* Security use/modification lock */ + struct sk_buff *tx_response; /* Response packet to be transmitted */ + unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ @@ -692,6 +740,7 @@ struct rxrpc_call { u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 security_level; /* Security level selected */ + u32 security_enctype; /* Security-specific encoding type (or 0) */ int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ @@ -867,6 +916,8 @@ struct rxrpc_txbuf { unsigned short len; /* Amount of data in buffer */ unsigned short space; /* Remaining data space */ unsigned short offset; /* Offset of fill point */ + unsigned short crypto_header; /* Size of crypto header */ + unsigned short sec_header; /* Size of security header */ unsigned short pkt_len; /* Size of packet content */ unsigned short alloc_size; /* Amount of bufferage allocated */ unsigned int flags; @@ -1001,7 +1052,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct rxrpc_call_params *, gfp_t, - unsigned int); + unsigned int) + __releases(&rx->sk.sk_lock) + __acquires(&call->user_mutex); void rxrpc_start_call_timer(struct rxrpc_call *call); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); @@ -1198,8 +1251,11 @@ void rxrpc_error_report(struct sock *); bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err); int rxrpc_io_thread(void *data); +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { + if (!local->io_thread) + return; wake_up_process(READ_ONCE(local->io_thread)); } @@ -1288,9 +1344,17 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) return net_generic(net, rxrpc_net_id); } +/* + * out_of_band.c + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb); +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb); +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len); + /* * output.c */ +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len); void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); @@ -1299,6 +1363,7 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *skb); /* * peer_event.c @@ -1362,6 +1427,11 @@ void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans); void rxrpc_call_init_rtt(struct rxrpc_call *call); +/* + * rxgk.c + */ +extern const struct rxrpc_security rxgk_yfs; + /* * rxkad.c */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index e685034ce4f7c..a4b363b47ccad 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -34,7 +34,6 @@ static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call, static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, struct rxrpc_backlog *b, rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -123,9 +122,10 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->user_call_ID = user_call_ID; call->notify_rx = notify_rx; - if (user_attach_call) { + if (rx->app_ops && + rx->app_ops->user_attach_call) { rxrpc_get_call(call, rxrpc_call_get_kernel_service); - user_attach_call(call, user_call_ID); + rx->app_ops->user_attach_call(call, user_call_ID); } rxrpc_get_call(call, rxrpc_call_get_userid); @@ -219,9 +219,10 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; rcu_assign_pointer(call->socket, rx); - if (rx->discard_new_call) { + if (rx->app_ops && + rx->app_ops->discard_new_call) { _debug("discard %lx", call->user_call_ID); - rx->discard_new_call(call, call->user_call_ID); + rx->app_ops->discard_new_call(call, call->user_call_ID); if (call->notify_rx) call->notify_rx = rxrpc_dummy_notify; rxrpc_put_call(call, rxrpc_call_put_kernel); @@ -387,8 +388,9 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, rxrpc_incoming_call(rx, call, skb); conn = call->conn; - if (rx->notify_new_call) - rx->notify_new_call(&rx->sk, call, call->user_call_ID); + if (rx->app_ops && + rx->app_ops->notify_new_call) + rx->app_ops->notify_new_call(&rx->sk, call, call->user_call_ID); spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) { @@ -440,8 +442,7 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) if (rx->sk.sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID, - GFP_KERNEL, + return rxrpc_service_prealloc_one(rx, b, NULL, user_call_ID, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); } @@ -449,20 +450,18 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls * @sock: The socket on which to preallocate * @notify_rx: Event notification function for the call - * @user_attach_call: Func to attach call to user_call_ID * @user_call_ID: The tag to attach to the preallocated call * @gfp: The allocation conditions. * @debug_id: The tracing debug ID. * - * Charge up the socket with preallocated calls, each with a user ID. A - * function should be provided to effect the attachment from the user's side. - * The user is given a ref to hold on the call. + * Charge up the socket with preallocated calls, each with a user ID. The + * ->user_attach_call() callback function should be provided to effect the + * attachment from the user's side. The user is given a ref to hold on the + * call. * * Note that the call may be come connected before this function returns. */ -int rxrpc_kernel_charge_accept(struct socket *sock, - rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -472,8 +471,7 @@ int rxrpc_kernel_charge_accept(struct socket *sock, if (sock->sk->sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, notify_rx, - user_attach_call, user_call_ID, + return rxrpc_service_prealloc_one(rx, b, notify_rx, user_call_ID, gfp, debug_id); } EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index fce58be65e7cf..e9e8f0ef3fd58 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -145,8 +145,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); - skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->recvmsg_queue); + skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); @@ -322,7 +322,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) - __releases(&rx->sk.sk_lock.slock) + __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; @@ -760,3 +760,23 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); } + +/** + * rxrpc_kernel_query_call_security - Query call's security parameters + * @call: The call to query + * @_service_id: Where to return the service ID + * @_enctype: Where to return the "encoding type" + * + * This queries the security parameters of a call, setting *@_service_id and + * *@_enctype and returning the security class. + * + * Return: The security class protocol number. + */ +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype) +{ + *_service_id = call->dest_srx.srx_service; + *_enctype = call->security_enctype; + return call->security_ix; +} +EXPORT_SYMBOL(rxrpc_kernel_query_call_security); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 4d9c5e21ba785..232b6986da83e 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -19,7 +19,7 @@ /* * Set the completion state on an aborted connection. */ -static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff *skb, +static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, s32 abort_code, int err, enum rxrpc_call_completion compl) { @@ -49,12 +49,20 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, s32 abort_code, int err, enum rxrpc_abort_reason why) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (rxrpc_set_conn_aborted(conn, skb, abort_code, err, + u32 cid = conn->proto.cid, call = 0, seq = 0; + + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + cid = sp->hdr.cid; + call = sp->hdr.callNumber; + seq = sp->hdr.seq; + } + + if (rxrpc_set_conn_aborted(conn, abort_code, err, RXRPC_CALL_LOCALLY_ABORTED)) { - trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, - sp->hdr.seq, abort_code, err); + trace_rxrpc_abort(0, why, cid, call, seq, abort_code, err); rxrpc_poke_conn(conn, rxrpc_conn_get_poke_abort); } return -EPROTO; @@ -67,7 +75,7 @@ static void rxrpc_input_conn_abort(struct rxrpc_connection *conn, struct sk_buff *skb) { trace_rxrpc_rx_conn_abort(conn, skb); - rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, + rxrpc_set_conn_aborted(conn, skb->priority, -ECONNABORTED, RXRPC_CALL_REMOTELY_ABORTED); } @@ -248,7 +256,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_CHALLENGE: - return conn->security->respond_to_challenge(conn, skb); + ret = conn->security->respond_to_challenge(conn, skb); + sp->chall.conn = NULL; + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + return ret; case RXRPC_PACKET_TYPE_RESPONSE: ret = conn->security->verify_response(conn, skb); @@ -270,7 +281,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, * we've already received the packet, put it on the * front of the queue. */ - sp->conn = rxrpc_get_connection(conn, rxrpc_conn_get_poke_secured); + sp->poke_conn = rxrpc_get_connection( + conn, rxrpc_conn_get_poke_secured); skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); skb_queue_head(&conn->local->rx_queue, skb); @@ -391,6 +403,61 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, rxrpc_queue_conn(conn, rxrpc_conn_queue_rx_work); } +/* + * Post a CHALLENGE packet to the socket of one of a connection's calls so that + * it can get application data to include in the packet, possibly querying + * userspace. + */ +static bool rxrpc_post_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = NULL; + struct rxrpc_sock *rx; + bool respond = false; + + sp->chall.conn = + rxrpc_get_connection(conn, rxrpc_conn_get_challenge_input); + + if (!conn->security->challenge_to_recvmsg) { + rxrpc_post_packet_to_conn(conn, skb); + return true; + } + + rcu_read_lock(); + + for (int i = 0; i < ARRAY_SIZE(conn->channels); i++) { + if (conn->channels[i].call) { + call = conn->channels[i].call; + rx = rcu_dereference(call->socket); + if (!rx) { + call = NULL; + continue; + } + + respond = true; + if (test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags)) + break; + call = NULL; + } + } + + if (!respond) { + rcu_read_unlock(); + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + sp->chall.conn = NULL; + return false; + } + + if (call) + rxrpc_notify_socket_oob(call, skb); + rcu_read_unlock(); + + if (!call) + rxrpc_post_packet_to_conn(conn, skb); + return true; +} + /* * Input a connection-level packet. */ @@ -411,6 +478,16 @@ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) return true; case RXRPC_PACKET_TYPE_CHALLENGE: + rxrpc_see_skb(skb, rxrpc_skb_see_oob_challenge); + if (rxrpc_is_conn_aborted(conn)) { + if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) + rxrpc_send_conn_abort(conn); + return true; + } + if (!conn->security->validate_challenge(conn, skb)) + return false; + return rxrpc_post_challenge(conn, skb); + case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_is_conn_aborted(conn)) { if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) @@ -436,6 +513,19 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); + if (conn->tx_response) { + struct sk_buff *skb; + + spin_lock_irq(&conn->local->lock); + skb = conn->tx_response; + conn->tx_response = NULL; + spin_unlock_irq(&conn->local->lock); + + if (conn->state != RXRPC_CONN_ABORTED) + rxrpc_send_response(conn, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_response); + } + if (skb) { switch (skb->mark) { case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: @@ -452,3 +542,31 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, false); } + +/* + * Post a RESPONSE message to the I/O thread for transmission. + */ +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_local *local = conn->local; + struct sk_buff *old; + + _enter("%x", sp->resp.challenge_serial); + + spin_lock_irq(&local->lock); + old = conn->tx_response; + if (old) { + struct rxrpc_skb_priv *osp = rxrpc_skb(skb); + + /* Always go with the response to the most recent challenge. */ + if (after(sp->resp.challenge_serial, osp->resp.challenge_serial)) + conn->tx_response = old; + else + old = skb; + } else { + conn->tx_response = skb; + } + spin_unlock_irq(&local->lock); + rxrpc_poke_conn(conn, rxrpc_conn_get_poke_response); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 8ac22dde8b39b..37340becb224d 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -73,6 +73,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; + rwlock_init(&conn->security_use_lock); spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->idle_timestamp = jiffies; @@ -329,6 +330,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work) } rxrpc_purge_queue(&conn->rx_queue); + rxrpc_free_skb(conn->tx_response, rxrpc_skb_put_response); rxrpc_kill_client_conn(conn); diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index e068f9b79d023..1f7c136d6d0e6 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -42,13 +42,19 @@ static void none_free_call_crypto(struct rxrpc_call *call) { } -static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool none_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_challenge); } +static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; +} + static int none_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb) { @@ -82,7 +88,8 @@ const struct rxrpc_security rxrpc_no_security = { .alloc_txbuf = none_alloc_txbuf, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, - .respond_to_challenge = none_respond_to_challenge, + .validate_challenge = none_validate_challenge, + .sendmsg_respond_to_challenge = none_sendmsg_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, }; diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 64f8d77b87312..27b650d30f4db 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -489,8 +489,8 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - rxrpc_input_conn_event(sp->conn, skb); - rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke); + rxrpc_input_conn_event(sp->poke_conn, skb); + rxrpc_put_connection(sp->poke_conn, rxrpc_conn_put_poke); rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured); break; default: @@ -501,9 +501,11 @@ int rxrpc_io_thread(void *data) } /* Deal with connections that want immediate attention. */ - spin_lock_irq(&local->lock); - list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); - spin_unlock_irq(&local->lock); + if (!list_empty_careful(&local->conn_attend_q)) { + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); + spin_unlock_irq(&local->lock); + } while ((conn = list_first_entry_or_null(&conn_attend_q, struct rxrpc_connection, diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 33e8302a79e33..9fdc1f031c9da 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -129,6 +129,160 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, return 0; } +static u64 xdr_dec64(const __be32 *xdr) +{ + return (u64)ntohl(xdr[0]) << 32 | (u64)ntohl(xdr[1]); +} + +static time64_t rxrpc_s64_to_time64(s64 time_in_100ns) +{ + bool neg = false; + u64 tmp = time_in_100ns; + + if (time_in_100ns < 0) { + tmp = -time_in_100ns; + neg = true; + } + do_div(tmp, 10000000); + return neg ? -tmp : tmp; +} + +/* + * Parse a YFS-RxGK type XDR format token + * - the caller guarantees we have at least 4 words + * + * struct token_rxgk { + * opr_time begintime; + * opr_time endtime; + * afs_int64 level; + * afs_int64 lifetime; + * afs_int64 bytelife; + * afs_int64 enctype; + * opaque key<>; + * opaque ticket<>; + * }; + */ +static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, + size_t datalen, + const __be32 *xdr, unsigned int toklen) +{ + struct rxrpc_key_token *token, **pptoken; + time64_t expiry; + size_t plen; + const __be32 *ticket, *key; + s64 tmp; + u32 tktlen, keylen; + + _enter(",{%x,%x,%x,%x},%x", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + if (6 * 2 + 2 > toklen / 4) + goto reject; + + key = xdr + (6 * 2 + 1); + keylen = ntohl(key[-1]); + _debug("keylen: %x", keylen); + keylen = round_up(keylen, 4); + if ((6 * 2 + 2) * 4 + keylen > toklen) + goto reject; + + ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1); + tktlen = ntohl(ticket[-1]); + _debug("tktlen: %x", tktlen); + tktlen = round_up(tktlen, 4); + if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) { + kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]", + (6 * 2 + 2) * 4 + keylen + tktlen, toklen, + keylen, tktlen); + goto reject; + } + + plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen; + prep->quotalen = datalen + plen; + + plen -= sizeof(*token); + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + goto nomem; + + token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL); + if (!token->rxgk) + goto nomem_token; + + token->security_index = RXRPC_SECURITY_YFS_RXGK; + token->rxgk->begintime = xdr_dec64(xdr + 0 * 2); + token->rxgk->endtime = xdr_dec64(xdr + 1 * 2); + token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2); + if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT) + goto reject_token; + token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2); + token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2); + token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2); + if (tmp < 0 || tmp > UINT_MAX) + goto reject_token; + token->rxgk->key.len = ntohl(key[-1]); + token->rxgk->key.data = token->rxgk->_key; + token->rxgk->ticket.len = ntohl(ticket[-1]); + + if (token->rxgk->endtime != 0) { + expiry = rxrpc_s64_to_time64(token->rxgk->endtime); + if (expiry < 0) + goto expired; + if (expiry < prep->expiry) + prep->expiry = expiry; + } + + memcpy(token->rxgk->key.data, key, token->rxgk->key.len); + + /* Pad the ticket so that we can use it directly in XDR */ + token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4), + GFP_KERNEL); + if (!token->rxgk->ticket.data) + goto nomem_yrxgk; + memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len); + + _debug("SCIX: %u", token->security_index); + _debug("EXPY: %llx", token->rxgk->endtime); + _debug("LIFE: %llx", token->rxgk->lifetime); + _debug("BYTE: %llx", token->rxgk->bytelife); + _debug("ENC : %u", token->rxgk->enctype); + _debug("LEVL: %u", token->rxgk->level); + _debug("KLEN: %u", token->rxgk->key.len); + _debug("TLEN: %u", token->rxgk->ticket.len); + _debug("KEY0: %*phN", token->rxgk->key.len, token->rxgk->key.data); + _debug("TICK: %*phN", + min_t(u32, token->rxgk->ticket.len, 32), token->rxgk->ticket.data); + + /* count the number of tokens attached */ + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); + + /* attach the data */ + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + + _leave(" = 0"); + return 0; + +nomem_yrxgk: + kfree(token->rxgk); +nomem_token: + kfree(token); +nomem: + return -ENOMEM; +reject_token: + kfree(token); +reject: + return -EKEYREJECTED; +expired: + kfree(token->rxgk); + kfree(token); + return -EKEYEXPIRED; +} + /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words @@ -228,6 +382,9 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) case RXRPC_SECURITY_RXKAD: ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); break; + case RXRPC_SECURITY_YFS_RXGK: + ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen); + break; default: ret2 = -EPROTONOSUPPORT; break; @@ -390,6 +547,10 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token) case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; + case RXRPC_SECURITY_YFS_RXGK: + kfree(token->rxgk->ticket.data); + kfree(token->rxgk); + break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); @@ -433,6 +594,9 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m) case RXRPC_SECURITY_RXKAD: seq_puts(m, "ka"); break; + case RXRPC_SECURITY_YFS_RXGK: + seq_puts(m, "ygk"); + break; default: /* we have a ticket we can't encode */ seq_printf(m, "%u", token->security_index); break; @@ -531,6 +695,8 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key); * * Generate a null RxRPC key that can be used to indicate anonymous security is * required for a particular domain. + * + * Return: The new key or a negative error code. */ struct key *rxrpc_get_null_key(const char *keyname) { @@ -595,6 +761,13 @@ static long rxrpc_read(const struct key *key, toksize += RND(token->kad->ticket_len); break; + case RXRPC_SECURITY_YFS_RXGK: + toksize += 6 * 8 + 2 * 4; + if (!token->no_leak_key) + toksize += RND(token->rxgk->key.len); + toksize += RND(token->rxgk->ticket.len); + break; + default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); @@ -674,6 +847,20 @@ static long rxrpc_read(const struct key *key, ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; + case RXRPC_SECURITY_YFS_RXGK: + ENCODE64(token->rxgk->begintime); + ENCODE64(token->rxgk->endtime); + ENCODE64(token->rxgk->level); + ENCODE64(token->rxgk->lifetime); + ENCODE64(token->rxgk->bytelife); + ENCODE64(token->rxgk->enctype); + if (token->no_leak_key) + ENCODE(0); + else + ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data); + ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data); + break; + default: pr_err("Unsupported key token type (%u)\n", token->security_index); diff --git a/net/rxrpc/oob.c b/net/rxrpc/oob.c new file mode 100644 index 0000000000000..05ca9c1faa577 --- /dev/null +++ b/net/rxrpc/oob.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Out of band message handling (e.g. challenge-response) + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +enum rxrpc_oob_command { + RXRPC_OOB_CMD_UNSET, + RXRPC_OOB_CMD_RESPOND, +} __mode(byte); + +struct rxrpc_oob_params { + u64 oob_id; /* ID number of message if reply */ + s32 abort_code; + enum rxrpc_oob_command command; + bool have_oob_id:1; +}; + +/* + * Post an out-of-band message for attention by the socket or kernel service + * associated with a reference call. + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; + struct sock *sk; + + rcu_read_lock(); + + rx = rcu_dereference(call->socket); + if (rx) { + sk = &rx->sk; + spin_lock_irq(&rx->recvmsg_lock); + + if (sk->sk_state < RXRPC_CLOSE) { + skb->skb_mstamp_ns = rx->oob_id_counter++; + rxrpc_get_skb(skb, rxrpc_skb_get_post_oob); + skb_queue_tail(&rx->recvmsg_oobq, skb); + + trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); + if (rx->app_ops) + rx->app_ops->notify_oob(sk, skb); + } + + spin_unlock_irq(&rx->recvmsg_lock); + if (!rx->app_ops && !sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + } + + rcu_read_unlock(); +} + +/* + * Locate the OOB message to respond to by its ID. + */ +static struct sk_buff *rxrpc_find_pending_oob(struct rxrpc_sock *rx, u64 oob_id) +{ + struct rb_node *p; + struct sk_buff *skb; + + p = rx->pending_oobq.rb_node; + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); + + if (oob_id < skb->skb_mstamp_ns) + p = p->rb_left; + else if (oob_id > skb->skb_mstamp_ns) + p = p->rb_right; + else + return skb; + } + + return NULL; +} + +/* + * Add an OOB message into the pending-response set. We always assign the next + * value from a 64-bit counter to the oob_id, so just assume we're always going + * to be on the right-hand edge of the tree and that the counter won't wrap. + * The tree is also given a ref to the message. + */ +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb) +{ + struct rb_node **pp = &rx->pending_oobq.rb_node, *p = NULL; + + while (*pp) { + p = *pp; + pp = &(*pp)->rb_right; + } + + rb_link_node(&skb->rbnode, p, pp); + rb_insert_color(&skb->rbnode, &rx->pending_oobq); +} + +/* + * Extract control messages from the sendmsg() control buffer. + */ +static int rxrpc_sendmsg_oob_cmsg(struct msghdr *msg, struct rxrpc_oob_params *p) +{ + struct cmsghdr *cmsg; + int len; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - sizeof(struct cmsghdr); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_OOB_ID: + if (len != sizeof(p->oob_id) || p->have_oob_id) + return -EINVAL; + memcpy(&p->oob_id, CMSG_DATA(cmsg), sizeof(p->oob_id)); + p->have_oob_id = true; + break; + case RXRPC_RESPOND: + if (p->command != RXRPC_OOB_CMD_UNSET) + return -EINVAL; + p->command = RXRPC_OOB_CMD_RESPOND; + break; + case RXRPC_ABORT: + if (len != sizeof(p->abort_code) || p->abort_code) + return -EINVAL; + memcpy(&p->abort_code, CMSG_DATA(cmsg), sizeof(p->abort_code)); + if (p->abort_code == 0) + return -EINVAL; + break; + case RXRPC_RESP_RXGK_APPDATA: + if (p->command != RXRPC_OOB_CMD_RESPOND) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + if (!p->have_oob_id) + return -EBADSLT; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Allow userspace to respond to an OOB using sendmsg(). + */ +static int rxrpc_respond_to_oob(struct rxrpc_sock *rx, + struct rxrpc_oob_params *p, + struct msghdr *msg) +{ + struct rxrpc_connection *conn; + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + int ret; + + skb = rxrpc_find_pending_oob(rx, p->oob_id); + if (skb) + rb_erase(&skb->rbnode, &rx->pending_oobq); + release_sock(&rx->sk); + if (!skb) + return -EBADSLT; + + sp = rxrpc_skb(skb); + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + ret = -EPROTO; + if (skb->mark != RXRPC_OOB_CHALLENGE) + break; + conn = sp->chall.conn; + ret = -EOPNOTSUPP; + if (!conn->security->sendmsg_respond_to_challenge) + break; + if (p->abort_code) { + rxrpc_abort_conn(conn, NULL, p->abort_code, -ECONNABORTED, + rxrpc_abort_response_sendmsg); + ret = 0; + } else { + ret = conn->security->sendmsg_respond_to_challenge(skb, msg); + } + break; + default: + ret = -EINVAL; + break; + } + + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + +/* + * Send an out-of-band message or respond to a received out-of-band message. + * - caller gives us the socket lock + * - the socket may be either a client socket or a server socket + */ +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) +{ + struct rxrpc_oob_params p = {}; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_oob_cmsg(msg, &p); + if (ret < 0) + goto error_release_sock; + + if (p.have_oob_id) + return rxrpc_respond_to_oob(rx, &p, msg); + + release_sock(&rx->sk); + + switch (p.command) { + default: + ret = -EINVAL; + break; + } + + _leave(" = %d", ret); + return ret; + +error_release_sock: + release_sock(&rx->sk); + return ret; +} + +/** + * rxrpc_kernel_query_oob - Query the parameters of an out-of-band message + * @oob: The message to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * + * Extract useful parameters from an out-of-band message. The source peer + * parameters are returned through the argument list and the message type is + * returned. + * + * Return: + * * %RXRPC_OOB_CHALLENGE - Challenge wanting a response. + */ +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + enum rxrpc_oob_type type = oob->mark; + + switch (type) { + case RXRPC_OOB_CHALLENGE: + *_peer = sp->chall.conn->peer; + *_peer_appdata = sp->chall.conn->peer->app_data; + break; + default: + WARN_ON_ONCE(1); + *_peer = NULL; + *_peer_appdata = 0; + break; + } + + return type; +} +EXPORT_SYMBOL(rxrpc_kernel_query_oob); + +/** + * rxrpc_kernel_dequeue_oob - Dequeue and return the front OOB message + * @sock: The socket to query + * @_type: Where to return the message type + * + * Dequeue the front OOB message, if there is one, and return it and + * its type. + * + * Return: The sk_buff representing the OOB message or %NULL if the queue was + * empty. + */ +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *oob; + + oob = skb_dequeue(&rx->recvmsg_oobq); + if (oob) + *_type = oob->mark; + return oob; +} +EXPORT_SYMBOL(rxrpc_kernel_dequeue_oob); + +/** + * rxrpc_kernel_free_oob - Free an out-of-band message + * @oob: The OOB message to free + * + * Free an OOB message along with any resources it holds. + */ +void rxrpc_kernel_free_oob(struct sk_buff *oob) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + + switch (oob->mark) { + case RXRPC_OOB_CHALLENGE: + rxrpc_put_connection(sp->chall.conn, rxrpc_conn_put_oob); + break; + } + + rxrpc_free_skb(oob, rxrpc_skb_put_purge_oob); +} +EXPORT_SYMBOL(rxrpc_kernel_free_oob); + +/** + * rxrpc_kernel_query_challenge - Query the parameters of a challenge + * @challenge: The challenge to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * @_service_id: Where to return the connection service ID + * @_security_index: Where to return the connection security index + * + * Extract useful parameters from a CHALLENGE message. + */ +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + *_peer = sp->chall.conn->peer; + *_peer_appdata = sp->chall.conn->peer->app_data; + *_service_id = sp->hdr.serviceId; + *_security_index = sp->hdr.securityIndex; +} +EXPORT_SYMBOL(rxrpc_kernel_query_challenge); + +/** + * rxrpc_kernel_reject_challenge - Allow a kernel service to reject a challenge + * @challenge: The challenge to be rejected + * @abort_code: The abort code to stick into the ABORT packet + * @error: Local error value + * @why: Indication as to why. + * + * Allow a kernel service to reject a challenge by aborting the connection if + * it's still in an abortable state. The error is returned so this function + * can be used with a return statement. + * + * Return: The %error parameter. + */ +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + _enter("{%x},%d,%d,%u", sp->hdr.serial, abort_code, error, why); + + rxrpc_abort_conn(sp->chall.conn, NULL, abort_code, error, why); + return error; +} +EXPORT_SYMBOL(rxrpc_kernel_reject_challenge); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 95905b85a8d71..0af19bcdc80a4 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -18,7 +18,7 @@ extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) { struct sockaddr *sa = msg->msg_name; struct sock *sk = socket->sk; @@ -916,3 +916,61 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) peer->last_tx_at = ktime_get_seconds(); _leave(""); } + +/* + * Send a RESPONSE message. + */ +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(response); + struct scatterlist sg[16]; + struct bio_vec bvec[16]; + struct msghdr msg; + size_t len = sp->resp.len; + __be32 wserial; + u32 serial = 0; + int ret, nr_sg; + + _enter("C=%x,%x", conn->debug_id, sp->resp.challenge_serial); + + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, 0, len); + if (ret < 0) + goto fail; + nr_sg = ret; + + for (int i = 0; i < nr_sg; i++) + bvec_set_page(&bvec[i], sg_page(&sg[i]), sg[i].length, sg[i].offset); + + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, nr_sg, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + serial = rxrpc_get_next_serials(conn, 1); + wserial = htonl(serial); + + trace_rxrpc_tx_response(conn, serial, sp); + + ret = skb_store_bits(response, offsetof(struct rxrpc_wire_header, serial), + &wserial, sizeof(wserial)); + if (ret < 0) + goto fail; + + rxrpc_local_dont_fragment(conn->local, false); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret < 0) + goto fail; + + conn->peer->last_tx_at = ktime_get_seconds(); + return; + +fail: + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_response); + kleave(" = %d", ret); +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 71b6e07bf1616..e2f35e6c04d6c 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -475,6 +475,8 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) * @call: The call to query * * Get a record for the remote peer in a call. + * + * Return: The call's peer record. */ struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call) { @@ -486,7 +488,9 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT * @peer: The peer to query * - * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples. + * Get the call's peer smoothed RTT. + * + * Return: The RTT in uS or %UINT_MAX if we have no samples. */ unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) { @@ -499,7 +503,10 @@ EXPORT_SYMBOL(rxrpc_kernel_get_srtt); * @peer: The peer to query * * Get a pointer to the address from a peer record. The caller is responsible - * for making sure that the address is not deallocated. + * for making sure that the address is not deallocated. A fake address will be + * substituted if %peer in NULL. + * + * Return: The rxrpc address record or a fake record. */ const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer) { @@ -512,7 +519,10 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_srx); * @peer: The peer to query * * Get a pointer to the transport address from a peer record. The caller is - * responsible for making sure that the address is not deallocated. + * responsible for making sure that the address is not deallocated. A fake + * address will be substituted if %peer in NULL. + * + * Return: The transport address record or a fake record. */ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) { @@ -527,7 +537,9 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_addr); * @app_data: The data to set * * Set the app-specific data on a peer. AF_RXRPC makes no effort to retain - * anything the data might refer to. The previous app_data is returned. + * anything the data might refer to. + * + * Return: The previous app_data. */ unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data) { @@ -540,6 +552,8 @@ EXPORT_SYMBOL(rxrpc_kernel_set_peer_data); * @peer: The peer to query * * Retrieve the app-specific data from a peer. + * + * Return: The peer's app data. */ unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer) { diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 42f70e4636f82..f8bfec12bc7e0 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -181,4 +181,24 @@ struct rxkad_response { __be32 ticket_len; /* Kerberos ticket length */ } __packed; +/* + * GSSAPI security type-4 and type-6 data header. + */ +struct rxgk_header { + __be32 epoch; + __be32 cid; + __be32 call_number; + __be32 seq; + __be32 sec_index; + __be32 data_len; +} __packed; + +/* + * GSSAPI security type-4 and type-6 response packet header. + */ +struct rxgk_response { + __be64 start_time; + __be32 token_len; +} __packed; + #endif /* _LINUX_RXRPC_PACKET_H */ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 32cd5f1d541db..86a27fb55a1cc 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -154,6 +154,82 @@ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) return call->security->verify_packet(call, skb); } +/* + * Transcribe a call's user ID to a control message. + */ +static int rxrpc_recvmsg_user_id(struct rxrpc_call *call, struct msghdr *msg, + int flags) +{ + if (!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) + return 0; + + if (flags & MSG_CMSG_COMPAT) { + unsigned int id32 = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned int), &id32); + } else { + unsigned long idl = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned long), &idl); + } +} + +/* + * Deal with a CHALLENGE packet. + */ +static int rxrpc_recvmsg_challenge(struct socket *sock, struct msghdr *msg, + struct sk_buff *challenge, unsigned int flags) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + struct rxrpc_connection *conn = sp->chall.conn; + + return conn->security->challenge_to_recvmsg(conn, challenge, msg); +} + +/* + * Process OOB packets. Called with the socket locked. + */ +static int rxrpc_recvmsg_oob(struct socket *sock, struct msghdr *msg, + unsigned int flags) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *skb; + bool need_response = false; + int ret; + + skb = skb_peek(&rx->recvmsg_oobq); + if (!skb) + return -EAGAIN; + rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg); + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_OOB_ID, sizeof(u64), + &skb->skb_mstamp_ns); + if (ret < 0) + return ret; + + switch ((enum rxrpc_oob_type)skb->mark) { + case RXRPC_OOB_CHALLENGE: + need_response = true; + ret = rxrpc_recvmsg_challenge(sock, msg, skb, flags); + break; + default: + WARN_ONCE(1, "recvmsg() can't process unknown OOB type %u\n", + skb->mark); + ret = -EIO; + break; + } + + if (!(flags & MSG_PEEK)) + skb_unlink(skb, &rx->recvmsg_oobq); + if (need_response) + rxrpc_add_pending_oob(rx, skb); + else + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + /* * Deliver messages to a call. This keeps processing packets until the buffer * is filled and we find either more DATA (returns 0) or the end of the DATA @@ -165,6 +241,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, size_t len, int flags, size_t *_offset) { struct rxrpc_skb_priv *sp; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct sk_buff *skb; rxrpc_seq_t seq = 0; size_t remain; @@ -207,7 +284,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, sp->offset, sp->len, ret2); if (ret2 < 0) { - kdebug("verify = %d", ret2); ret = ret2; goto out; } @@ -255,6 +331,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); + + if (!rx->app_ops && + !skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + trace_rxrpc_recvdata(call, rxrpc_recvmsg_oobq, seq, + rx_pkt_offset, rx_pkt_len, ret); + break; + } } out: @@ -262,6 +345,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; } + done: trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); @@ -301,6 +385,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, /* Return immediately if a client socket has no outstanding calls */ if (RB_EMPTY_ROOT(&rx->calls) && list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq) && rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); return -EAGAIN; @@ -322,7 +407,8 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (ret) goto wait_error; - if (list_empty(&rx->recvmsg_q)) { + if (list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq)) { if (signal_pending(current)) goto wait_interrupted; trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0); @@ -332,6 +418,15 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, goto try_again; } + /* Deal with OOB messages before we consider getting normal data. */ + if (!skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + ret = rxrpc_recvmsg_oob(sock, msg, flags); + release_sock(&rx->sk); + if (ret == -EAGAIN) + goto try_again; + goto error_no_call; + } + /* Find the next call and dequeue it if we're not just peeking. If we * do dequeue it, that comes with a ref that we will need to release. * We also want to weed out calls that got requeued whilst we were @@ -342,7 +437,8 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!rxrpc_call_is_complete(call) && - skb_queue_empty(&call->recvmsg_queue)) { + skb_queue_empty(&call->recvmsg_queue) && + skb_queue_empty(&rx->recvmsg_oobq)) { list_del_init(&call->recvmsg_link); spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); @@ -377,21 +473,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); - if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - if (flags & MSG_CMSG_COMPAT) { - unsigned int id32 = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned int), &id32); - } else { - unsigned long idl = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned long), &idl); - } - if (ret < 0) - goto error_unlock_call; - } + ret = rxrpc_recvmsg_user_id(call, msg, flags); + if (ret < 0) + goto error_unlock_call; if (msg->msg_name && call->peer) { size_t len = sizeof(call->dest_srx); @@ -477,14 +561,14 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, * @_service: Where to store the actual service ID (may be upgraded) * * Allow a kernel service to receive data and pick up information about the - * state of a call. Returns 0 if got what was asked for and there's more - * available, 1 if we got what was asked for and we're at the end of the data - * and -EAGAIN if we need more data. + * state of a call. Note that *@_abort should also be initialised to %0. * - * Note that we may return -EAGAIN to drain empty packets at the end of the - * data, even if we've already copied over the requested data. + * Note that we may return %-EAGAIN to drain empty packets at the end + * of the data, even if we've already copied over the requested data. * - * *_abort should also be initialised to 0. + * Return: %0 if got what was asked for and there's more available, %1 + * if we got what was asked for and we're at the end of the data and + * %-EAGAIN if we need more data. */ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, struct iov_iter *iter, size_t *_len, diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c new file mode 100644 index 0000000000000..ba8bc201b8d38 --- /dev/null +++ b/net/rxrpc/rxgk.c @@ -0,0 +1,1367 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Parse the information from a server key + */ +static int rxgk_preparse_server_key(struct key_preparsed_payload *prep) +{ + const struct krb5_enctype *krb5; + struct krb5_buffer *server_key = (void *)&prep->payload.data[2]; + unsigned int service, sec_class, kvno, enctype; + int n = 0; + + _enter("%zu", prep->datalen); + + if (sscanf(prep->orig_description, "%u:%u:%u:%u%n", + &service, &sec_class, &kvno, &enctype, &n) != 4) + return -EINVAL; + + if (prep->orig_description[n]) + return -EINVAL; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + prep->payload.data[0] = (struct krb5_enctype *)krb5; + + if (prep->datalen != krb5->key_len) + return -EKEYREJECTED; + + server_key->len = prep->datalen; + server_key->data = kmemdup(prep->data, prep->datalen, GFP_KERNEL); + if (!server_key->data) + return -ENOMEM; + + _leave(" = 0"); + return 0; +} + +static void rxgk_free_server_key(union key_payload *payload) +{ + struct krb5_buffer *server_key = (void *)&payload->data[2]; + + kfree_sensitive(server_key->data); +} + +static void rxgk_free_preparse_server_key(struct key_preparsed_payload *prep) +{ + rxgk_free_server_key(&prep->payload); +} + +static void rxgk_destroy_server_key(struct key *key) +{ + rxgk_free_server_key(&key->payload); +} + +static void rxgk_describe_server_key(const struct key *key, struct seq_file *m) +{ + const struct krb5_enctype *krb5 = key->payload.data[0]; + + if (krb5) + seq_printf(m, ": %s", krb5->name); +} + +/* + * Handle rekeying the connection when we see our limits overrun or when the + * far side decided to rekey. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ +static struct rxgk_context *rxgk_rekey(struct rxrpc_connection *conn, + const u16 *specific_key_number) +{ + struct rxgk_context *gk, *dead = NULL; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + bool crank = false; + + _enter("%d", specific_key_number ? *specific_key_number : -1); + + mutex_lock(&conn->security_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto crank_window; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto generate_key; + if (!specific_key_number && + test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto crank_window; + +grab: + refcount_inc(&gk->usage); + mutex_unlock(&conn->security_lock); + rxgk_put(dead); + return gk; + +crank_window: + trace_rxrpc_rxgk_rekey(conn, current_key, + specific_key_number ? *specific_key_number : -1); + if (current_key == UINT_MAX) + goto bad_key; + if (current_key + 1 == UINT_MAX) + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + + key_number = current_key + 1; + if (WARN_ON(conn->rxgk.keys[key_number & mask])) + goto bad_key; + crank = true; + +generate_key: + gk = conn->rxgk.keys[current_key & mask]; + gk = rxgk_generate_transport_key(conn, gk->key, key_number, GFP_NOFS); + if (IS_ERR(gk)) { + mutex_unlock(&conn->security_lock); + return gk; + } + + write_lock(&conn->security_use_lock); + if (crank) { + current_key++; + conn->rxgk.key_number = current_key; + dead = conn->rxgk.keys[(current_key - 2) & mask]; + conn->rxgk.keys[(current_key - 2) & mask] = NULL; + } + conn->rxgk.keys[current_key & mask] = gk; + write_unlock(&conn->security_use_lock); + goto grab; + +bad_key: + mutex_unlock(&conn->security_lock); + return ERR_PTR(-ESTALE); +} + +/* + * Get the specified keying context. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ +static struct rxgk_context *rxgk_get_key(struct rxrpc_connection *conn, + const u16 *specific_key_number) +{ + struct rxgk_context *gk; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + + _enter("{%u},%d", + conn->rxgk.key_number, specific_key_number ? *specific_key_number : -1); + + read_lock(&conn->security_use_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + /* Only the bottom 16 bits of the key number are exposed in the + * header, so we try and keep the upper 16 bits in step. The + * whole 32 bits are used to generate the TK. + */ + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto rekey; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto slow_path; + if (!specific_key_number && + key_number < UINT_MAX) { + if (time_after(jiffies, gk->expiry) || + gk->bytes_remaining < 0) { + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); + goto slow_path; + } + + if (test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto slow_path; + } + + refcount_inc(&gk->usage); + read_unlock(&conn->security_use_lock); + return gk; + +rekey: + _debug("rekey"); + if (current_key == UINT_MAX) + goto bad_key; + gk = conn->rxgk.keys[current_key & mask]; + if (gk) + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); +slow_path: + read_unlock(&conn->security_use_lock); + return rxgk_rekey(conn, specific_key_number); +bad_key: + read_unlock(&conn->security_use_lock); + return ERR_PTR(-ESTALE); +} + +/* + * initialise connection security + */ +static int rxgk_init_connection_security(struct rxrpc_connection *conn, + struct rxrpc_key_token *token) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d,%u},{%x}", + conn->debug_id, conn->rxgk.key_number, key_serial(conn->key)); + + conn->security_ix = token->security_index; + conn->security_level = token->rxgk->level; + + if (rxrpc_conn_is_client(conn)) { + conn->rxgk.start_time = ktime_get(); + do_div(conn->rxgk.start_time, 100); + } + + gk = rxgk_generate_transport_key(conn, token->rxgk, conn->rxgk.key_number, + GFP_NOFS); + if (IS_ERR(gk)) + return PTR_ERR(gk); + conn->rxgk.enctype = gk->krb5->etype; + conn->rxgk.keys[gk->key_number & 3] = gk; + + switch (conn->security_level) { + case RXRPC_SECURITY_PLAIN: + case RXRPC_SECURITY_AUTH: + case RXRPC_SECURITY_ENCRYPT: + break; + default: + ret = -EKEYREJECTED; + goto error; + } + + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * Clean up the crypto on a call. + */ +static void rxgk_free_call_crypto(struct rxrpc_call *call) +{ +} + +/* + * Work out how much data we can put in a packet. + */ +static struct rxrpc_txbuf *rxgk_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) +{ + enum krb5_crypto_mode mode; + struct rxgk_context *gk; + struct rxrpc_txbuf *txb; + size_t shdr, alloc, limit, part, offset, gap; + + switch (call->conn->security_level) { + default: + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); + case RXRPC_SECURITY_AUTH: + shdr = 0; + mode = KRB5_CHECKSUM_MODE; + break; + case RXRPC_SECURITY_ENCRYPT: + shdr = sizeof(struct rxgk_header); + mode = KRB5_ENCRYPT_MODE; + break; + } + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return NULL; + + /* Work out the maximum amount of data that will fit. */ + alloc = RXRPC_JUMBO_DATALEN; + limit = crypto_krb5_how_much_data(gk->krb5, mode, &alloc, &offset); + + if (remain < limit - shdr) { + part = remain; + alloc = crypto_krb5_how_much_buffer(gk->krb5, mode, + shdr + part, &offset); + gap = 0; + } else { + part = limit - shdr; + gap = RXRPC_JUMBO_DATALEN - alloc; + alloc = RXRPC_JUMBO_DATALEN; + } + + rxgk_put(gk); + + txb = rxrpc_alloc_data_txbuf(call, alloc, 16, gfp); + if (!txb) + return NULL; + + txb->crypto_header = offset; + txb->sec_header = shdr; + txb->offset += offset + shdr; + txb->space = part; + + /* Clear excess space in the packet */ + if (gap) + memset(txb->data + alloc - gap, 0, gap); + return txb; +} + +/* + * Integrity mode (sign a packet - level 1 security) + */ +static int rxgk_secure_packet_integrity(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + struct krb5_buffer metadata; + int ret = -ENOMEM; + + _enter(""); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + goto error_gk; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + metadata.len = sizeof(*hdr); + metadata.data = hdr; + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_get_mic(gk->krb5, gk->tx_Kc, &metadata, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + kfree(hdr); +error_gk: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * wholly encrypt a packet (level 2 security) + */ +static int rxgk_secure_packet_encrypted(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + int ret; + + _enter("%x", txb->len); + + /* Insert the header into the buffer. */ + hdr = txb->data + txb->crypto_header; + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_encrypt(gk->krb5, gk->tx_enc, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len, + false); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * checksum an RxRPC packet header + */ +static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d{%x}},{#%u},%u,", + call->debug_id, key_serial(call->conn->key), txb->seq, txb->len); + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk) == -ESTALE ? -EKEYREJECTED : PTR_ERR(gk); + + ret = key_validate(call->conn->key); + if (ret < 0) + return ret; + + call->security_enctype = gk->krb5->etype; + txb->cksum = htons(gk->key_number); + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + rxgk_put(gk); + txb->pkt_len = txb->len; + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_secure_packet_integrity(call, gk, txb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_secure_packet_encrypted(call, gk, txb); + default: + rxgk_put(gk); + return -EPERM; + } +} + +/* + * Integrity mode (check the signature on a packet - level 1 security) + */ +static int rxgk_verify_packet_integrity(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header *hdr; + struct krb5_buffer metadata; + unsigned int offset = sp->offset, len = sp->len; + size_t data_offset = 0, data_len = len; + u32 ac; + int ret = -ENOMEM; + + _enter(""); + + crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, + &data_offset, &data_len); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + return -ENOMEM; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(sp->hdr.seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(data_len); + + metadata.len = sizeof(*hdr); + metadata.data = hdr; + ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, + skb, &offset, &len, &ac); + kfree(hdr); + if (ret == -EPROTO) { + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); + } else { + sp->offset = offset; + sp->len = len; + } + + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Decrypt an encrypted packet (level 2 security). + */ +static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header hdr; + unsigned int offset = sp->offset, len = sp->len; + int ret; + u32 ac; + + _enter(""); + + ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); + if (ret == -EPROTO) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); + if (ret < 0) + goto error; + + if (len < sizeof(hdr)) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_header); + goto error; + } + + /* Extract the header from the skb */ + ret = skb_copy_bits(skb, offset, &hdr, sizeof(hdr)); + if (ret < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_encdata); + goto error; + } + offset += sizeof(hdr); + len -= sizeof(hdr); + + if (ntohl(hdr.epoch) != call->conn->proto.epoch || + ntohl(hdr.cid) != call->cid || + ntohl(hdr.call_number) != call->call_id || + ntohl(hdr.seq) != sp->hdr.seq || + ntohl(hdr.sec_index) != call->security_ix || + ntohl(hdr.data_len) > len) { + ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON, + rxgk_abort_2_short_data); + goto error; + } + + sp->offset = offset; + sp->len = ntohl(hdr.data_len); + ret = 0; +error: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Verify the security on a received packet or subpacket (if part of a + * jumbo packet). + */ +static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_context *gk; + u16 key_number = sp->hdr.cksum; + + _enter("{%d{%x}},{#%u}", + call->debug_id, key_serial(call->conn->key), sp->hdr.seq); + + gk = rxgk_get_key(call->conn, &key_number); + if (IS_ERR(gk)) { + switch (PTR_ERR(gk)) { + case -ESTALE: + return rxrpc_abort_eproto(call, skb, RXGK_BADKEYNO, + rxgk_abort_bad_key_number); + default: + return PTR_ERR(gk); + } + } + + call->security_enctype = gk->krb5->etype; + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_verify_packet_integrity(call, gk, skb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_verify_packet_encrypted(call, gk, skb); + default: + rxgk_put(gk); + return -ENOANO; + } +} + +/* + * Allocate memory to hold a challenge or a response packet. We're not running + * in the io_thread, so we can't use ->tx_alloc. + */ +static struct page *rxgk_alloc_packet(size_t total_len) +{ + gfp_t gfp = GFP_NOFS; + int order; + + order = get_order(total_len); + if (order > 0) + gfp |= __GFP_COMP; + return alloc_pages(gfp, order); +} + +/* + * Issue a challenge. + */ +static int rxgk_issue_challenge(struct rxrpc_connection *conn) +{ + struct rxrpc_wire_header *whdr; + struct bio_vec bvec[1]; + struct msghdr msg; + struct page *page; + size_t len = sizeof(*whdr) + sizeof(conn->rxgk.nonce); + u32 serial; + int ret; + + _enter("{%d}", conn->debug_id); + + get_random_bytes(&conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + /* We can't use conn->tx_alloc without a lock */ + page = rxgk_alloc_packet(sizeof(*whdr) + sizeof(conn->rxgk.nonce)); + if (!page) + return -ENOMEM; + + bvec_set_page(&bvec[0], page, len, 0); + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, 1, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + whdr = page_address(page); + whdr->epoch = htonl(conn->proto.epoch); + whdr->cid = htonl(conn->proto.cid); + whdr->callNumber = 0; + whdr->seq = 0; + whdr->type = RXRPC_PACKET_TYPE_CHALLENGE; + whdr->flags = conn->out_clientflag; + whdr->userStatus = 0; + whdr->securityIndex = conn->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(conn->service_id); + + memcpy(whdr + 1, conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + serial = rxrpc_get_next_serials(conn, 1); + whdr->serial = htonl(serial); + + trace_rxrpc_tx_challenge(conn, serial, 0, *(u32 *)&conn->rxgk.nonce); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret > 0) + conn->peer->last_tx_at = ktime_get_seconds(); + __free_page(page); + + if (ret < 0) { + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_rxgk_challenge); + return -EAGAIN; + } + + trace_rxrpc_tx_packet(conn->debug_id, whdr, + rxrpc_tx_point_rxgk_challenge); + _leave(" = 0"); + return 0; +} + +/* + * Validate a challenge packet. + */ +static bool rxgk_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + u8 nonce[20]; + + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxgk_abort_chall_no_key); + return false; + } + + if (key_validate(conn->key) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + return false; + } + + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + nonce, sizeof(nonce)) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_chall_short); + return false; + } + + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, 0, *(u32 *)nonce, 0); + return true; +} + +/** + * rxgk_kernel_query_challenge - Query RxGK-specific challenge parameters + * @challenge: The challenge packet to query + * + * Return: The Kerberos 5 encoding type for the challenged connection. + */ +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + return sp->chall.conn->rxgk.enctype; +} +EXPORT_SYMBOL(rxgk_kernel_query_challenge); + +/* + * Fill out the control message to pass to userspace to inform about the + * challenge. + */ +static int rxgk_challenge_to_recvmsg(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg) +{ + struct rxgk_challenge chall; + + chall.base.service_id = conn->service_id; + chall.base.security_index = conn->security_ix; + chall.enctype = conn->rxgk.enctype; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_CHALLENGED, sizeof(chall), &chall); +} + +/* + * Insert the requisite amount of XDR padding for the length given. + */ +static int rxgk_pad_out(struct sk_buff *response, size_t len, size_t offset) +{ + __be32 zero = 0; + size_t pad = xdr_round_up(len) - len; + int ret; + + if (!pad) + return 0; + + ret = skb_store_bits(response, offset, &zero, pad); + if (ret < 0) + return ret; + return pad; +} + +/* + * Insert the header into the response. + */ +static noinline ssize_t rxgk_insert_response_header(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset) +{ + struct rxrpc_skb_priv *rsp = rxrpc_skb(response); + + struct { + struct rxrpc_wire_header whdr; + __be32 start_time_msw; + __be32 start_time_lsw; + __be32 ticket_len; + } h; + int ret; + + rsp->resp.kvno = gk->key_number; + rsp->resp.version = gk->krb5->etype; + + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = htons(gk->key_number); + h.whdr.serviceId = htons(conn->service_id); + h.start_time_msw = htonl(upper_32_bits(conn->rxgk.start_time)); + h.start_time_lsw = htonl(lower_32_bits(conn->rxgk.start_time)); + h.ticket_len = htonl(gk->key->ticket.len); + + ret = skb_store_bits(response, offset, &h, sizeof(h)); + return ret < 0 ? ret : sizeof(h); +} + +/* + * Construct the authenticator to go in the response packet + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static ssize_t rxgk_construct_authenticator(struct rxrpc_connection *conn, + struct sk_buff *challenge, + const struct krb5_buffer *appdata, + struct sk_buff *response, + size_t offset) +{ + struct { + u8 nonce[20]; + __be32 appdata_len; + } a; + struct { + __be32 level; + __be32 epoch; + __be32 cid; + __be32 call_numbers_count; + __be32 call_numbers[4]; + } b; + int ret; + + ret = skb_copy_bits(challenge, sizeof(struct rxrpc_wire_header), + a.nonce, sizeof(a.nonce)); + if (ret < 0) + return -EPROTO; + + a.appdata_len = htonl(appdata->len); + + ret = skb_store_bits(response, offset, &a, sizeof(a)); + if (ret < 0) + return ret; + offset += sizeof(a); + + if (appdata->len) { + ret = skb_store_bits(response, offset, appdata->data, appdata->len); + if (ret < 0) + return ret; + offset += appdata->len; + + ret = rxgk_pad_out(response, appdata->len, offset); + if (ret < 0) + return ret; + offset += ret; + } + + b.level = htonl(conn->security_level); + b.epoch = htonl(conn->proto.epoch); + b.cid = htonl(conn->proto.cid); + b.call_numbers_count = htonl(4); + b.call_numbers[0] = htonl(conn->channels[0].call_counter); + b.call_numbers[1] = htonl(conn->channels[1].call_counter); + b.call_numbers[2] = htonl(conn->channels[2].call_counter); + b.call_numbers[3] = htonl(conn->channels[3].call_counter); + + ret = skb_store_bits(response, offset, &b, sizeof(b)); + if (ret < 0) + return ret; + return sizeof(a) + xdr_round_up(appdata->len) + sizeof(b); +} + +static ssize_t rxgk_encrypt_authenticator(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset, + size_t alloc_len, + size_t auth_offset, + size_t auth_len) +{ + struct scatterlist sg[16]; + int nr_sg; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(response, sg, offset, alloc_len); + if (unlikely(nr_sg < 0)) + return nr_sg; + return crypto_krb5_encrypt(gk->krb5, gk->resp_enc, sg, nr_sg, alloc_len, + auth_offset, auth_len, false); +} + +/* + * Construct the response. + * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator + * }; + */ +static int rxgk_construct_response(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp, *rsp; + struct rxgk_context *gk; + struct sk_buff *response; + size_t len, auth_len, authx_len, offset, auth_offset, authx_offset; + __be32 tmp; + int ret; + + gk = rxgk_get_key(conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk); + + auth_len = 20 + (4 + appdata->len) + 12 + (1 + 4) * 4; + authx_len = crypto_krb5_how_much_buffer(gk->krb5, KRB5_ENCRYPT_MODE, + auth_len, &auth_offset); + len = sizeof(struct rxrpc_wire_header) + + 8 + (4 + xdr_round_up(gk->key->ticket.len)) + (4 + authx_len); + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxgk); + response->len = len; + response->data_len = len; + + ret = rxgk_insert_response_header(conn, gk, response, 0); + if (ret < 0) + goto error; + offset = ret; + + ret = skb_store_bits(response, offset, gk->key->ticket.data, gk->key->ticket.len); + if (ret < 0) + goto error; + offset += gk->key->ticket.len; + ret = rxgk_pad_out(response, gk->key->ticket.len, offset); + if (ret < 0) + goto error; + + authx_offset = offset + ret + 4; /* Leave a gap for the length. */ + + ret = rxgk_construct_authenticator(conn, challenge, appdata, response, + authx_offset + auth_offset); + if (ret < 0) + goto error; + auth_len = ret; + + ret = rxgk_encrypt_authenticator(conn, gk, response, + authx_offset, authx_len, + auth_offset, auth_len); + if (ret < 0) + goto error; + authx_len = ret; + + tmp = htonl(authx_len); + ret = skb_store_bits(response, authx_offset - 4, &tmp, 4); + if (ret < 0) + goto error; + + ret = rxgk_pad_out(response, authx_len, authx_offset + authx_len); + if (ret < 0) + return ret; + len = authx_offset + authx_len + ret; + + if (len != response->len) { + response->len = len; + response->data_len = len; + } + + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Respond to a challenge packet. + */ +static int rxgk_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + if (key_validate(conn->key) < 0) + return rxrpc_abort_conn(conn, NULL, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + + return rxgk_construct_response(conn, challenge, appdata); +} + +static int rxgk_respond_to_challenge_no_appdata(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + struct krb5_buffer appdata = {}; + + return rxgk_respond_to_challenge(conn, challenge, &appdata); +} + +/** + * rxgk_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * @appdata: The application data to include in the RESPONSE authenticator + * + * Allow a kernel application to respond to a CHALLENGE with application data + * to be included in the RxGK RESPONSE Authenticator. + * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxgk_respond_to_challenge(csp->chall.conn, challenge, appdata); +} +EXPORT_SYMBOL(rxgk_kernel_respond_to_challenge); + +/* + * Parse sendmsg() control message and respond to challenge. We need to see if + * there's an appdata to fish out. + */ +static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + struct krb5_buffer appdata = {}; + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (cmsg->cmsg_level != SOL_RXRPC || + cmsg->cmsg_type != RXRPC_RESP_RXGK_APPDATA) + continue; + if (appdata.data) + return -EINVAL; + appdata.data = CMSG_DATA(cmsg); + appdata.len = cmsg->cmsg_len - sizeof(struct cmsghdr); + } + + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +} + +/* + * Verify the authenticator. + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + __be32 *p, __be32 *end) +{ + u32 app_len, call_count, level, epoch, cid, i; + + _enter(""); + + if (memcmp(p, conn->rxgk.nonce, 20) != 0) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_nonce); + p += 20 / sizeof(__be32); + + app_len = ntohl(*p++); + if (app_len > (end - p) * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + p += xdr_round_up(app_len) / sizeof(__be32); + if (end - p < 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + level = ntohl(*p++); + epoch = ntohl(*p++); + cid = ntohl(*p++); + call_count = ntohl(*p++); + + if (level != conn->security_level || + epoch != conn->proto.epoch || + cid != conn->proto.cid || + call_count > 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_param); + + if (end - p < call_count) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_call_list); + + for (i = 0; i < call_count; i++) { + u32 call_id = ntohl(*p++); + + if (call_id > INT_MAX) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_callid); + + if (call_id < conn->channels[i].call_counter) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_ctr); + + if (call_id > conn->channels[i].call_counter) { + if (conn->channels[i].call) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_state); + + conn->channels[i].call_counter = call_id; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * Extract the authenticator and verify it. + */ +static int rxgk_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + unsigned int auth_offset, unsigned int auth_len) +{ + void *auth; + __be32 *p; + int ret; + + auth = kmalloc(auth_len, GFP_NOFS); + if (!auth) + return -ENOMEM; + + ret = skb_copy_bits(skb, auth_offset, auth, auth_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_auth); + goto error; + } + + p = auth; + ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, p + auth_len); +error: + kfree(auth); + return ret; +} + +/* + * Verify a response. + * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator + * }; + */ +static int rxgk_verify_response(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + const struct krb5_enctype *krb5; + struct rxrpc_key_token *token; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_response rhdr; + struct rxgk_context *gk; + struct key *key = NULL; + unsigned int offset = sizeof(struct rxrpc_wire_header); + unsigned int len = skb->len - sizeof(struct rxrpc_wire_header); + unsigned int token_offset, token_len; + unsigned int auth_offset, auth_len; + __be32 xauth_len; + int ret, ec; + + _enter("{%d}", conn->debug_id); + + /* Parse the RXGK_Response object */ + if (sizeof(rhdr) + sizeof(__be32) > len) + goto short_packet; + + if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0) + goto short_packet; + offset += sizeof(rhdr); + len -= sizeof(rhdr); + + token_offset = offset; + token_len = ntohl(rhdr.token_len); + if (xdr_round_up(token_len) + sizeof(__be32) > len) + goto short_packet; + + trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len); + + offset += xdr_round_up(token_len); + len -= xdr_round_up(token_len); + + if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0) + goto short_packet; + offset += sizeof(xauth_len); + len -= sizeof(xauth_len); + + auth_offset = offset; + auth_len = ntohl(xauth_len); + if (auth_len < len) + goto short_packet; + if (auth_len & 3) + goto inconsistent; + if (auth_len < 20 + 9 * 4) + goto auth_too_short; + + /* We need to extract and decrypt the token and instantiate a session + * key for it. This bit, however, is application-specific. If + * possible, we use a default parser, but we might end up bumping this + * to the app to deal with - which might mean a round trip to + * userspace. + */ + ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key); + if (ret < 0) + goto out; + + /* We now have a key instantiated from the decrypted ticket. We can + * pass this to the application so that they can parse the ticket + * content and we can use the session key it contains to derive the + * keys we need. + * + * Note that we have to switch enctype at this point as the enctype of + * the ticket doesn't necessarily match that of the transport. + */ + token = key->payload.data[0]; + conn->security_level = token->rxgk->level; + conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time); + + gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS); + if (IS_ERR(gk)) { + ret = PTR_ERR(gk); + goto cant_get_token; + } + + krb5 = gk->krb5; + + trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len); + + /* Decrypt, parse and verify the authenticator. */ + ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb, + &auth_offset, &auth_len, &ec); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret, + rxgk_abort_resp_auth_dec); + goto out; + } + + ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len); + if (ret < 0) + goto out; + + conn->key = key; + key = NULL; + ret = 0; +out: + key_put(key); + _leave(" = %d", ret); + return ret; + +inconsistent: + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_xdr_align); + goto out; +auth_too_short: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_auth); + goto out; +short_packet: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_packet); + goto out; + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_internal_error); + goto out; + case -ENOPKG: + ret = rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_nopkg); + goto out; + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + goto out; +} + +/* + * clear the connection security + */ +static void rxgk_clear(struct rxrpc_connection *conn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(conn->rxgk.keys); i++) + rxgk_put(conn->rxgk.keys[i]); +} + +/* + * Initialise the RxGK security service. + */ +static int rxgk_init(void) +{ + return 0; +} + +/* + * Clean up the RxGK security service. + */ +static void rxgk_exit(void) +{ +} + +/* + * RxRPC YFS GSSAPI-based security + */ +const struct rxrpc_security rxgk_yfs = { + .name = "yfs-rxgk", + .security_index = RXRPC_SECURITY_YFS_RXGK, + .no_key_abort = RXGK_NOTAUTH, + .init = rxgk_init, + .exit = rxgk_exit, + .preparse_server_key = rxgk_preparse_server_key, + .free_preparse_server_key = rxgk_free_preparse_server_key, + .destroy_server_key = rxgk_destroy_server_key, + .describe_server_key = rxgk_describe_server_key, + .init_connection_security = rxgk_init_connection_security, + .alloc_txbuf = rxgk_alloc_txbuf, + .secure_packet = rxgk_secure_packet, + .verify_packet = rxgk_verify_packet, + .free_call_crypto = rxgk_free_call_crypto, + .issue_challenge = rxgk_issue_challenge, + .validate_challenge = rxgk_validate_challenge, + .challenge_to_recvmsg = rxgk_challenge_to_recvmsg, + .sendmsg_respond_to_challenge = rxgk_sendmsg_respond_to_challenge, + .respond_to_challenge = rxgk_respond_to_challenge_no_appdata, + .verify_response = rxgk_verify_response, + .clear = rxgk_clear, + .default_decode_ticket = rxgk_yfs_decode_ticket, +}; diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c new file mode 100644 index 0000000000000..b94b77a1c3178 --- /dev/null +++ b/net/rxrpc/rxgk_app.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Application-specific bits for GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Decode a default-style YFS ticket in a response and turn it into an + * rxrpc-type key. + * + * struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + * + * struct RXGK_AuthName { + * afs_int32 kind; + * opaque data; + * opaque display; + * }; + * + * struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key) +{ + struct rxrpc_key_token *token; + const struct cred *cred = current_cred(); // TODO - use socket creds + struct key *key; + size_t pre_ticket_len, payload_len; + unsigned int klen, enctype; + void *payload, *ticket; + __be32 *t, *p, *q, tmp[2]; + int ret; + + _enter(""); + + /* Get the session key length */ + ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); + if (ret < 0) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_klen); + enctype = ntohl(tmp[0]); + klen = ntohl(tmp[1]); + + if (klen > ticket_len - 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_key); + + pre_ticket_len = ((5 + 14) * sizeof(__be32) + + xdr_round_up(klen) + + sizeof(__be32)); + payload_len = pre_ticket_len + xdr_round_up(ticket_len); + + payload = kzalloc(payload_len, GFP_NOFS); + if (!payload) + return -ENOMEM; + + /* We need to fill out the XDR form for a key payload that we can pass + * to add_key(). Start by copying in the ticket so that we can parse + * it. + */ + ticket = payload + pre_ticket_len; + ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + goto error; + } + + /* Fill out the form header. */ + p = payload; + p[0] = htonl(0); /* Flags */ + p[1] = htonl(1); /* len(cellname) */ + p[2] = htonl(0x20000000); /* Cellname " " */ + p[3] = htonl(1); /* #tokens */ + p[4] = htonl(15 * sizeof(__be32) + xdr_round_up(klen) + + xdr_round_up(ticket_len)); /* Token len */ + + /* Now fill in the body. Most of this we can just scrape directly from + * the ticket. + */ + t = ticket + sizeof(__be32) * 2 + xdr_round_up(klen); + q = payload + 5 * sizeof(__be32); + q[0] = htonl(RXRPC_SECURITY_YFS_RXGK); + q[1] = t[1]; /* begintime - msw */ + q[2] = t[2]; /* - lsw */ + q[3] = t[5]; /* endtime - msw */ + q[4] = t[6]; /* - lsw */ + q[5] = 0; /* level - msw */ + q[6] = t[0]; /* - lsw */ + q[7] = 0; /* lifetime - msw */ + q[8] = t[3]; /* - lsw */ + q[9] = 0; /* bytelife - msw */ + q[10] = t[4]; /* - lsw */ + q[11] = 0; /* enctype - msw */ + q[12] = htonl(enctype); /* - lsw */ + q[13] = htonl(klen); /* Key length */ + + q += 14; + + memcpy(q, ticket + sizeof(__be32) * 2, klen); + q += xdr_round_up(klen) / 4; + q[0] = htonl(ticket_len); + q++; + if (WARN_ON((unsigned long)q != (unsigned long)ticket)) { + ret = -EIO; + goto error; + } + + /* Ticket read in with skb_copy_bits above */ + q += xdr_round_up(ticket_len) / 4; + if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) { + ret = -EIO; + goto error; + } + + /* Now turn that into a key. */ + key = key_alloc(&key_type_rxrpc, "x", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, // TODO: Use socket owner + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + ret = PTR_ERR(key); + goto error; + } + + _debug("key %d", key_serial(key)); + + ret = key_instantiate_and_link(key, payload, payload_len, NULL, NULL); + if (ret < 0) + goto error_key; + + token = key->payload.data[0]; + token->no_leak_key = true; + *_key = key; + key = NULL; + ret = 0; + goto error; + +error_key: + key_put(key); +error: + kfree_sensitive(payload); + _leave(" = %d", ret); + return ret; +} + +/* + * Extract the token and set up a session key from the details. + * + * struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + * + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1] + */ +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key) +{ + const struct krb5_enctype *krb5; + const struct krb5_buffer *server_secret; + struct crypto_aead *token_enc = NULL; + struct key *server_key; + unsigned int ticket_offset, ticket_len; + u32 kvno, enctype; + int ret, ec; + + struct { + __be32 kvno; + __be32 enctype; + __be32 token_len; + } container; + + /* Decode the RXGK_TokenContainer object. This tells us which server + * key we should be using. We can then fetch the key, get the secret + * and set up the crypto to extract the token. + */ + if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + kvno = ntohl(container.kvno); + enctype = ntohl(container.enctype); + ticket_len = ntohl(container.token_len); + ticket_offset = token_offset + sizeof(container); + + if (xdr_round_up(ticket_len) > token_len - 3 * 4) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + _debug("KVNO %u", kvno); + _debug("ENC %u", enctype); + _debug("TLEN %u", ticket_len); + + server_key = rxrpc_look_up_server_security(conn, skb, kvno, enctype); + if (IS_ERR(server_key)) + goto cant_get_server_key; + + down_read(&server_key->sem); + server_secret = (const void *)&server_key->payload.data[2]; + ret = rxgk_set_up_token_cipher(server_secret, &token_enc, enctype, &krb5, GFP_NOFS); + up_read(&server_key->sem); + key_put(server_key); + if (ret < 0) + goto cant_get_token; + + /* We can now decrypt and parse the token/ticket. This allows us to + * gain access to K0, from which we can derive the transport key and + * thence decode the authenticator. + */ + ret = rxgk_decrypt_skb(krb5, token_enc, skb, + &ticket_offset, &ticket_len, &ec); + crypto_free_aead(token_enc); + token_enc = NULL; + if (ret < 0) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + + ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, + ticket_len, _key); + if (ret < 0) + goto cant_get_token; + + _leave(" = 0"); + return ret; + +cant_get_server_key: + ret = PTR_ERR(server_key); + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -ENOKEY: + case -EKEYREJECTED: + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EPERM: + return rxrpc_abort_conn(conn, skb, RXGK_BADKEYNO, -EKEYREJECTED, + rxgk_abort_resp_tok_nokey); + default: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_keyerr); + } + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_internal_error); + case -ENOPKG: + return rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_tok_nopkg); + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + return ret; +} diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h new file mode 100644 index 0000000000000..7370a56559853 --- /dev/null +++ b/net/rxrpc/rxgk_common.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Common bits for GSSAPI-based RxRPC security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include + +/* + * Per-key number context. This is replaced when the connection is rekeyed. + */ +struct rxgk_context { + refcount_t usage; + unsigned int key_number; /* Rekeying number (goes in the rx header) */ + unsigned long flags; +#define RXGK_TK_NEEDS_REKEY 0 /* Set if this needs rekeying */ + unsigned long expiry; /* Expiration time of this key */ + long long bytes_remaining; /* Remaining Tx lifetime of this key */ + const struct krb5_enctype *krb5; /* RxGK encryption type */ + const struct rxgk_key *key; + + /* We need up to 7 keys derived from the transport key, but we don't + * actually need the transport key. Each key is derived by + * DK(TK,constant). + */ + struct crypto_aead *tx_enc; /* Transmission key */ + struct crypto_aead *rx_enc; /* Reception key */ + struct crypto_shash *tx_Kc; /* Transmission checksum key */ + struct crypto_shash *rx_Kc; /* Reception checksum key */ + struct crypto_aead *resp_enc; /* Response packet enc key */ +}; + +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_object_len(x) (4 + xdr_round_up(x)) + +/* + * rxgk_app.c + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key); + +/* + * rxgk_kdf.c + */ +void rxgk_put(struct rxgk_context *gk); +struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, + const struct rxgk_key *key, + unsigned int key_number, + gfp_t gfp); +int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, + struct crypto_aead **token_key, + unsigned int enctype, + const struct krb5_enctype **_krb5, + gfp_t gfp); + +/* + * Apply decryption and checksumming functions to part of an skbuff. The + * offset and length are updated to reflect the actual content of the encrypted + * region. + */ +static inline +int rxgk_decrypt_skb(const struct krb5_enctype *krb5, + struct crypto_aead *aead, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + int *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_decrypt(krb5, aead, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} + +/* + * Check the MIC on a region of an skbuff. The offset and length are updated + * to reflect the actual content of the secure region. + */ +static inline +int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, + struct crypto_shash *shash, + const struct krb5_buffer *metadata, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + u32 *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} diff --git a/net/rxrpc/rxgk_kdf.c b/net/rxrpc/rxgk_kdf.c new file mode 100644 index 0000000000000..b4db5aa30e5b9 --- /dev/null +++ b/net/rxrpc/rxgk_kdf.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RxGK transport key derivation. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include "ar-internal.h" +#include "rxgk_common.h" + +#define round16(x) (((x) + 15) & ~15) + +/* + * Constants used to derive the keys and hmacs actually used for doing stuff. + */ +#define RXGK_CLIENT_ENC_PACKET 1026U // 0x402 +#define RXGK_CLIENT_MIC_PACKET 1027U // 0x403 +#define RXGK_SERVER_ENC_PACKET 1028U // 0x404 +#define RXGK_SERVER_MIC_PACKET 1029U // 0x405 +#define RXGK_CLIENT_ENC_RESPONSE 1030U // 0x406 +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c + +static void rxgk_free(struct rxgk_context *gk) +{ + if (gk->tx_Kc) + crypto_free_shash(gk->tx_Kc); + if (gk->rx_Kc) + crypto_free_shash(gk->rx_Kc); + if (gk->tx_enc) + crypto_free_aead(gk->tx_enc); + if (gk->rx_enc) + crypto_free_aead(gk->rx_enc); + if (gk->resp_enc) + crypto_free_aead(gk->resp_enc); + kfree(gk); +} + +void rxgk_put(struct rxgk_context *gk) +{ + if (gk && refcount_dec_and_test(&gk->usage)) + rxgk_free(gk); +} + +/* + * Transport key derivation function. + * + * TK = random-to-key(PRF+(K0, L, + * epoch || cid || start_time || key_number)) + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-11 sec 8.3] + */ +static int rxgk_derive_transport_key(struct rxrpc_connection *conn, + struct rxgk_context *gk, + const struct rxgk_key *rxgk, + struct krb5_buffer *TK, + gfp_t gfp) +{ + const struct krb5_enctype *krb5 = gk->krb5; + struct krb5_buffer conn_info; + unsigned int L = krb5->key_bytes; + __be32 *info; + u8 *buffer; + int ret; + + _enter(""); + + conn_info.len = sizeof(__be32) * 5; + + buffer = kzalloc(round16(conn_info.len), gfp); + if (!buffer) + return -ENOMEM; + + conn_info.data = buffer; + + info = (__be32 *)conn_info.data; + info[0] = htonl(conn->proto.epoch); + info[1] = htonl(conn->proto.cid); + info[2] = htonl(conn->rxgk.start_time >> 32); + info[3] = htonl(conn->rxgk.start_time >> 0); + info[4] = htonl(gk->key_number); + + ret = crypto_krb5_calc_PRFplus(krb5, &rxgk->key, L, &conn_info, TK, gfp); + kfree_sensitive(buffer); + _leave(" = %d", ret); + return ret; +} + +/* + * Set up the ciphers for the usage keys. + */ +static int rxgk_set_up_ciphers(struct rxrpc_connection *conn, + struct rxgk_context *gk, + const struct rxgk_key *rxgk, + gfp_t gfp) +{ + const struct krb5_enctype *krb5 = gk->krb5; + struct crypto_shash *shash; + struct crypto_aead *aead; + struct krb5_buffer TK; + bool service = rxrpc_conn_is_service(conn); + int ret; + u8 *buffer; + + buffer = kzalloc(krb5->key_bytes, gfp); + if (!buffer) + return -ENOMEM; + + TK.len = krb5->key_bytes; + TK.data = buffer; + + ret = rxgk_derive_transport_key(conn, gk, rxgk, &TK, gfp); + if (ret < 0) + goto out; + + aead = crypto_krb5_prepare_encryption(krb5, &TK, RXGK_CLIENT_ENC_RESPONSE, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->resp_enc = aead; + + if (crypto_aead_blocksize(gk->resp_enc) != krb5->block_len || + crypto_aead_authsize(gk->resp_enc) != krb5->cksum_len) { + pr_notice("algo inconsistent with krb5 table %u!=%u or %u!=%u\n", + crypto_aead_blocksize(gk->resp_enc), krb5->block_len, + crypto_aead_authsize(gk->resp_enc), krb5->cksum_len); + ret = -EINVAL; + goto out; + } + + if (service) { + switch (conn->security_level) { + case RXRPC_SECURITY_AUTH: + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->tx_Kc = shash; + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->rx_Kc = shash; + break; + case RXRPC_SECURITY_ENCRYPT: + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->tx_enc = aead; + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->rx_enc = aead; + break; + } + } else { + switch (conn->security_level) { + case RXRPC_SECURITY_AUTH: + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->tx_Kc = shash; + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->rx_Kc = shash; + break; + case RXRPC_SECURITY_ENCRYPT: + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->tx_enc = aead; + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->rx_enc = aead; + break; + } + } + + ret = 0; +out: + kfree_sensitive(buffer); + return ret; +aead_error: + ret = PTR_ERR(aead); + goto out; +hash_error: + ret = PTR_ERR(shash); + goto out; +} + +/* + * Derive a transport key for a connection and then derive a bunch of usage + * keys from it and set up ciphers using them. + */ +struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, + const struct rxgk_key *key, + unsigned int key_number, + gfp_t gfp) +{ + struct rxgk_context *gk; + unsigned long lifetime; + int ret = -ENOPKG; + + _enter(""); + + gk = kzalloc(sizeof(*gk), GFP_KERNEL); + if (!gk) + return ERR_PTR(-ENOMEM); + refcount_set(&gk->usage, 1); + gk->key = key; + gk->key_number = key_number; + + gk->krb5 = crypto_krb5_find_enctype(key->enctype); + if (!gk->krb5) + goto err_tk; + + ret = rxgk_set_up_ciphers(conn, gk, key, gfp); + if (ret) + goto err_tk; + + /* Set the remaining number of bytes encrypted with this key that may + * be transmitted before rekeying. Note that the spec has been + * interpreted differently on this point... + */ + switch (key->bytelife) { + case 0: + case 63: + gk->bytes_remaining = LLONG_MAX; + break; + case 1 ... 62: + gk->bytes_remaining = 1LL << key->bytelife; + break; + default: + gk->bytes_remaining = key->bytelife; + break; + } + + /* Set the time after which rekeying must occur */ + if (key->lifetime) { + lifetime = min_t(u64, key->lifetime, INT_MAX / HZ); + lifetime *= HZ; + } else { + lifetime = MAX_JIFFY_OFFSET; + } + gk->expiry = jiffies + lifetime; + return gk; + +err_tk: + rxgk_put(gk); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * Use the server secret key to set up the ciphers that will be used to extract + * the token from a response packet. + */ +int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, + struct crypto_aead **token_aead, + unsigned int enctype, + const struct krb5_enctype **_krb5, + gfp_t gfp) +{ + const struct krb5_enctype *krb5; + struct crypto_aead *aead; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + aead = crypto_krb5_prepare_encryption(krb5, server_key, RXGK_SERVER_ENC_TOKEN, gfp); + if (IS_ERR(aead)) + return PTR_ERR(aead); + + *_krb5 = krb5; + *token_aead = aead; + return 0; +} diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 6cb37b0eb77f4..3657c0661cdc7 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -177,8 +177,10 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem if (!txb) return NULL; - txb->offset += shdr; - txb->space = part; + txb->crypto_header = 0; + txb->sec_header = shdr; + txb->offset += shdr; + txb->space = part; return txb; } @@ -683,6 +685,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) serial = rxrpc_get_next_serial(conn); whdr.serial = htonl(serial); + trace_rxrpc_tx_challenge(conn, serial, 0, conn->rxkad.nonce); + ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, @@ -697,62 +701,6 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) return 0; } -/* - * send a Kerberos security response - */ -static int rxkad_send_response(struct rxrpc_connection *conn, - struct rxrpc_host_header *hdr, - struct rxkad_response *resp, - const struct rxkad_key *s2) -{ - struct rxrpc_wire_header whdr; - struct msghdr msg; - struct kvec iov[3]; - size_t len; - u32 serial; - int ret; - - _enter(""); - - msg.msg_name = &conn->peer->srx.transport; - msg.msg_namelen = conn->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&whdr, 0, sizeof(whdr)); - whdr.epoch = htonl(hdr->epoch); - whdr.cid = htonl(hdr->cid); - whdr.type = RXRPC_PACKET_TYPE_RESPONSE; - whdr.flags = conn->out_clientflag; - whdr.securityIndex = hdr->securityIndex; - whdr.serviceId = htons(hdr->serviceId); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = resp; - iov[1].iov_len = sizeof(*resp); - iov[2].iov_base = (void *)s2->ticket; - iov[2].iov_len = s2->ticket_len; - - len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; - - serial = rxrpc_get_next_serial(conn); - whdr.serial = htonl(serial); - - rxrpc_local_dont_fragment(conn->local, false); - ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); - if (ret < 0) { - trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_point_rxkad_response); - return -EAGAIN; - } - - conn->peer->last_tx_at = ktime_get_seconds(); - _leave(" = 0"); - return 0; -} - /* * calculate the response checksum */ @@ -772,12 +720,21 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response) * encrypt the response packet */ static int rxkad_encrypt_response(struct rxrpc_connection *conn, - struct rxkad_response *resp, + struct sk_buff *response, const struct rxkad_key *s2) { struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg[1]; + size_t encsize = sizeof(((struct rxkad_response *)0)->encrypted); + int ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, + sizeof(struct rxrpc_wire_header) + + offsetof(struct rxkad_response, encrypted), encsize); + if (ret < 0) + return ret; req = skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS); if (!req) @@ -786,88 +743,205 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, /* continue encrypting from where we left off */ memcpy(&iv, s2->session_key, sizeof(iv)); - sg_init_table(sg, 1); - sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); skcipher_request_set_sync_tfm(req, conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_encrypt(req); + skcipher_request_set_crypt(req, sg, sg, encsize, iv.x); + ret = crypto_skcipher_encrypt(req); skcipher_request_free(req); - return 0; + return ret; } /* - * respond to a challenge packet + * Validate a challenge packet. */ -static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool rxkad_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { - const struct rxrpc_key_token *token; struct rxkad_challenge challenge; - struct rxkad_response *resp; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - u32 version, nonce, min_level; - int ret = -EPROTO; + u32 version, min_level; + int ret; _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); - if (!conn->key) - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, - rxkad_abort_chall_no_key); + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxkad_abort_chall_no_key); + return false; + } ret = key_validate(conn->key); - if (ret < 0) - return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, - rxkad_abort_chall_key_expired); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); + return false; + } if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &challenge, sizeof(challenge)) < 0) - return rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, - rxkad_abort_chall_short); + &challenge, sizeof(challenge)) < 0) { + rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_chall_short); + return false; + } version = ntohl(challenge.version); - nonce = ntohl(challenge.nonce); + sp->chall.rxkad_nonce = ntohl(challenge.nonce); min_level = ntohl(challenge.min_level); - trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level); + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, + sp->chall.rxkad_nonce, min_level); + + if (version != RXKAD_VERSION) { + rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_chall_version); + return false; + } - if (version != RXKAD_VERSION) - return rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, - rxkad_abort_chall_version); + if (conn->security_level < min_level) { + rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, + rxkad_abort_chall_level); + return false; + } + return true; +} + +/* + * Insert the header into the response. + */ +static noinline +int rxkad_insert_response_header(struct rxrpc_connection *conn, + const struct rxrpc_key_token *token, + struct sk_buff *challenge, + struct sk_buff *response, + size_t *offset) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + struct { + struct rxrpc_wire_header whdr; + struct rxkad_response resp; + } h; + int ret; - if (conn->security_level < min_level) - return rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, - rxkad_abort_chall_level); + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = 0; + h.whdr.serviceId = htons(conn->service_id); + h.resp.version = htonl(RXKAD_VERSION); + h.resp.__pad = 0; + h.resp.encrypted.epoch = htonl(conn->proto.epoch); + h.resp.encrypted.cid = htonl(conn->proto.cid); + h.resp.encrypted.checksum = 0; + h.resp.encrypted.securityIndex = htonl(conn->security_ix); + h.resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter); + h.resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter); + h.resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter); + h.resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter); + h.resp.encrypted.inc_nonce = htonl(csp->chall.rxkad_nonce + 1); + h.resp.encrypted.level = htonl(conn->security_level); + h.resp.kvno = htonl(token->kad->kvno); + h.resp.ticket_len = htonl(token->kad->ticket_len); + + rxkad_calc_response_checksum(&h.resp); + + ret = skb_store_bits(response, *offset, &h, sizeof(h)); + *offset += sizeof(h); + return ret; +} + +/* + * respond to a challenge packet + */ +static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + const struct rxrpc_key_token *token; + struct rxrpc_skb_priv *csp, *rsp; + struct sk_buff *response; + size_t len, offset = 0; + int ret = -EPROTO; + + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + ret = key_validate(conn->key); + if (ret < 0) + return rxrpc_abort_conn(conn, challenge, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); token = conn->key->payload.data[0]; /* build the response packet */ - resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); - if (!resp) - return -ENOMEM; + len = sizeof(struct rxrpc_wire_header) + + sizeof(struct rxkad_response) + + token->kad->ticket_len; + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxkad); + response->len = len; + response->data_len = len; + + offset = 0; + ret = rxkad_insert_response_header(conn, token, challenge, response, + &offset); + if (ret < 0) + goto error; + + ret = rxkad_encrypt_response(conn, response, token->kad); + if (ret < 0) + goto error; + + ret = skb_store_bits(response, offset, token->kad->ticket, + token->kad->ticket_len); + if (ret < 0) + goto error; - resp->version = htonl(RXKAD_VERSION); - resp->encrypted.epoch = htonl(conn->proto.epoch); - resp->encrypted.cid = htonl(conn->proto.cid); - resp->encrypted.securityIndex = htonl(conn->security_ix); - resp->encrypted.inc_nonce = htonl(nonce + 1); - resp->encrypted.level = htonl(conn->security_level); - resp->kvno = htonl(token->kad->kvno); - resp->ticket_len = htonl(token->kad->ticket_len); - resp->encrypted.call_id[0] = htonl(conn->channels[0].call_counter); - resp->encrypted.call_id[1] = htonl(conn->channels[1].call_counter); - resp->encrypted.call_id[2] = htonl(conn->channels[2].call_counter); - resp->encrypted.call_id[3] = htonl(conn->channels[3].call_counter); - - /* calculate the response checksum and then do the encryption */ - rxkad_calc_response_checksum(resp); - ret = rxkad_encrypt_response(conn, resp, token->kad); - if (ret == 0) - ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); - kfree(resp); + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); return ret; } +/* + * RxKAD does automatic response only as there's nothing to manage that isn't + * already in the key. + */ +static int rxkad_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; +} + +/** + * rxkad_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * + * Allow a kernel application to respond to a CHALLENGE. + * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxkad_respond_to_challenge(csp->chall.conn, challenge); +} +EXPORT_SYMBOL(rxkad_kernel_respond_to_challenge); + /* * decrypt the kerberos IV ticket in the response */ @@ -1276,6 +1350,8 @@ const struct rxrpc_security rxkad = { .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, .issue_challenge = rxkad_issue_challenge, + .validate_challenge = rxkad_validate_challenge, + .sendmsg_respond_to_challenge = rxkad_sendmsg_respond_to_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, .clear = rxkad_clear, diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index e848a4777b8c7..0377301156b09 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "rxperf: " fmt #include #include +#include #include #include #define RXRPC_TRACE_ONLY_DEFINE_ENUMS @@ -136,6 +137,12 @@ static void rxperf_notify_end_reply_tx(struct sock *sock, RXPERF_CALL_SV_AWAIT_ACK); } +static const struct rxrpc_kernel_ops rxperf_rxrpc_callback_ops = { + .notify_new_call = rxperf_rx_new_call, + .discard_new_call = rxperf_rx_discard_new_call, + .user_attach_call = rxperf_rx_attach, +}; + /* * Charge the incoming call preallocation. */ @@ -161,7 +168,6 @@ static void rxperf_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(rxperf_socket, rxperf_notify_rx, - rxperf_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -209,8 +215,7 @@ static int rxperf_open_socket(void) if (ret < 0) goto error_2; - rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call, - rxperf_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &rxperf_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -546,9 +551,9 @@ static int rxperf_process_call(struct rxperf_call *call) } /* - * Add a key to the security keyring. + * Add an rxkad key to the security keyring. */ -static int rxperf_add_key(struct key *keyring) +static int rxperf_add_rxkad_key(struct key *keyring) { key_ref_t kref; int ret; @@ -574,6 +579,47 @@ static int rxperf_add_key(struct key *keyring) return ret; } +#ifdef CONFIG_RXGK +/* + * Add a yfs-rxgk key to the security keyring. + */ +static int rxperf_add_yfs_rxgk_key(struct key *keyring, u32 enctype) +{ + const struct krb5_enctype *krb5 = crypto_krb5_find_enctype(enctype); + key_ref_t kref; + char name[64]; + int ret; + u8 key[32]; + + if (!krb5 || krb5->key_len > sizeof(key)) + return 0; + + /* The key is just { 0, 1, 2, 3, 4, ... } */ + for (int i = 0; i < krb5->key_len; i++) + key[i] = i; + + sprintf(name, "%u:6:1:%u", RX_PERF_SERVICE, enctype); + + kref = key_create_or_update(make_key_ref(keyring, true), + "rxrpc_s", name, + key, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + + if (IS_ERR(kref)) { + pr_err("Can't allocate rxperf server key: %ld\n", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + ret = key_link(keyring, key_ref_to_ptr(kref)); + if (ret < 0) + pr_err("Can't link rxperf server key: %d\n", ret); + key_ref_put(kref); + return ret; +} +#endif + /* * Initialise the rxperf server. */ @@ -603,9 +649,29 @@ static int __init rxperf_init(void) goto error_keyring; } rxperf_sec_keyring = keyring; - ret = rxperf_add_key(keyring); + ret = rxperf_add_rxkad_key(keyring); + if (ret < 0) + goto error_key; +#ifdef CONFIG_RXGK + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA1_96); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA256_128); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA384_192); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA128_CTS_CMAC); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA256_CTS_CMAC); if (ret < 0) goto error_key; +#endif ret = rxperf_open_socket(); if (ret < 0) diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 9784adc8f2759..078d91a6b77fb 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -20,6 +20,9 @@ static const struct rxrpc_security *rxrpc_security_types[] = { #ifdef CONFIG_RXKAD [RXRPC_SECURITY_RXKAD] = &rxkad, #endif +#ifdef CONFIG_RXGK + [RXRPC_SECURITY_YFS_RXGK] = &rxgk_yfs, +#endif }; int __init rxrpc_init_security(void) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 84dc6c94f23b1..ebbb78b842de8 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -607,7 +607,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) - __releases(&rx->sk.sk_lock.slock) + __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; @@ -657,7 +657,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) - __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; bool dropped_lock = false; @@ -759,14 +758,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; - } else if (p.command == RXRPC_CMD_SEND_ABORT) { + goto out_put_unlock; + } + + switch (p.command) { + case RXRPC_CMD_SEND_ABORT: rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, rxrpc_abort_call_sendmsg); ret = 0; - } else if (p.command != RXRPC_CMD_SEND_DATA) { - ret = -EINVAL; - } else { + break; + case RXRPC_CMD_SEND_DATA: ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); + break; + default: + ret = -EINVAL; + break; } out_put_unlock: @@ -794,6 +800,8 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) * appropriate to sending data. No control data should be supplied in @msg, * nor should an address be supplied. MSG_MORE should be flagged if there's * more data to come, otherwise this data will end the transmission phase. + * + * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, @@ -829,8 +837,9 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @error: Local error value * @why: Indication as to why. * - * Allow a kernel service to abort a call, if it's still in an abortable state - * and return true if the call was aborted, false if it was already complete. + * Allow a kernel service to abort a call if it's still in an abortable state. + * + * Return: %true if the call was aborted, %false if it was already complete. */ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, u32 abort_code, int error, enum rxrpc_abort_reason why) diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index e51940589ee54..36b05fd842a7b 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -152,6 +152,8 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) * * Set the server security keyring on an rxrpc socket. This is used to provide * the encryption keys for a kernel service. + * + * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) { @@ -169,3 +171,43 @@ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) return ret; } EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); + +/** + * rxrpc_sock_set_manage_response - Set the manage-response flag for a kernel service + * @sk: The socket to set the keyring on + * @set: True to set, false to clear the flag + * + * Set the flag on an rxrpc socket to say that the caller wants to manage the + * RESPONSE packet and the user-defined data it may contain. Setting this + * means that recvmsg() will return messages with RXRPC_CHALLENGED in the + * control message buffer containing information about the challenge. + * + * The user should respond to the challenge by passing RXRPC_RESPOND or + * RXRPC_RESPOND_ABORT control messages with sendmsg() to the same call. + * Supplementary control messages, such as RXRPC_RESP_RXGK_APPDATA, may be + * included to indicate the parts the user wants to supply. + * + * The server will be passed the response data with a RXRPC_RESPONDED control + * message when it gets the first data from each call. + * + * Note that this is only honoured by security classes that need auxiliary data + * (e.g. RxGK). Those that don't offer the facility (e.g. RxKAD) respond + * without consulting userspace. + * + * Return: The previous setting. + */ +int rxrpc_sock_set_manage_response(struct sock *sk, bool set) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret; + + lock_sock(sk); + ret = !!test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + if (set) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + release_sock(sk); + return ret; +} +EXPORT_SYMBOL(rxrpc_sock_set_manage_response); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a800127effcd7..9f0b3f943fca8 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -403,6 +403,18 @@ config NET_SCH_ETS If unsure, say N. +config NET_SCH_BPF + bool "BPF-based Qdisc" + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF + help + This option allows BPF-based queueing disiplines. With BPF struct_ops, + users can implement supported operators in Qdisc_ops using BPF programs. + The queue holding skb can be built with BPF maps or graphs. + + Say Y here if you want to use BPF-based Qdisc. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" help diff --git a/net/sched/Makefile b/net/sched/Makefile index 82c3f78ca486e..904d784902d14 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -62,6 +62,7 @@ obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o +obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 839790043256b..057e20cef3754 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1461,17 +1461,29 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; - struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 2]; struct tc_action *act; size_t sz = 0; int err; int i; - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL, extack); if (err < 0) return err; + /* The nested attributes are parsed as types, but they are really an + * array of actions. So we parse one more than we can handle, and return + * an error if the last one is set (as that indicates that the request + * contained more than the maximum number of actions). + */ + if (tb[TCA_ACT_MAX_PRIO + 1]) { + NL_SET_ERR_MSG_FMT(extack, + "Only %d actions supported per filter", + TCA_ACT_MAX_PRIO); + return -EINVAL; + } + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c new file mode 100644 index 0000000000000..9f32b305636fc --- /dev/null +++ b/net/sched/bpf_qdisc.c @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#define QDISC_OP_IDX(op) (offsetof(struct Qdisc_ops, op) / sizeof(void (*)(void))) +#define QDISC_MOFF_IDX(moff) (moff / sizeof(void (*)(void))) + +static struct bpf_struct_ops bpf_Qdisc_ops; + +struct bpf_sched_data { + struct qdisc_watchdog watchdog; +}; + +struct bpf_sk_buff_ptr { + struct sk_buff *skb; +}; + +static int bpf_qdisc_init(struct btf *btf) +{ + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_qdisc_ids, struct, Qdisc) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr) + +static bool bpf_qdisc_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + struct btf *btf = prog->aux->attach_btf; + u32 arg; + + arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off); + if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) { + if (arg == 2 && type == BPF_READ) { + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED; + info->btf = btf; + info->btf_id = bpf_sk_buff_ptr_ids[0]; + return true; + } + } + + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_qdisc_qdisc_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct Qdisc, limit): + *end = offsetofend(struct Qdisc, limit); + break; + case offsetof(struct Qdisc, q) + offsetof(struct qdisc_skb_head, qlen): + *end = offsetof(struct Qdisc, q) + offsetofend(struct qdisc_skb_head, qlen); + break; + case offsetof(struct Qdisc, qstats) ... offsetofend(struct Qdisc, qstats) - 1: + *end = offsetofend(struct Qdisc, qstats); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_sk_buff_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct sk_buff, tstamp): + *end = offsetofend(struct sk_buff, tstamp); + break; + case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data[0]) ... + offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, + data[QDISC_CB_PRIV_LEN - 1]): + *end = offsetof(struct sk_buff, cb) + + offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + const struct btf_type *t, *skbt, *qdisct; + size_t end; + int err; + + skbt = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]); + qdisct = btf_type_by_id(reg->btf, bpf_qdisc_ids[0]); + t = btf_type_by_id(reg->btf, reg->btf_id); + + if (t == skbt) { + err = bpf_qdisc_sk_buff_access(log, reg, off, &end); + } else if (t == qdisct) { + err = bpf_qdisc_qdisc_access(log, reg, off, &end); + } else { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + if (err) { + bpf_log(log, "no write support to %s at off %d\n", + btf_name_by_offset(reg->btf, t->name_off), off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of %s ended at %zu\n", + off, size, btf_name_by_offset(reg->btf, t->name_off), end); + return -EACCES; + } + + return 0; +} + +BTF_ID_LIST(bpf_qdisc_init_prologue_ids) +BTF_ID(func, bpf_qdisc_init_prologue) + +static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, init)) + return 0; + + /* r6 = r1; // r6 will be "u64 *ctx". r1 is "u64 *ctx". + * r2 = r1[16]; // r2 will be "struct netlink_ext_ack *extack" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_init_prologue(r1, r2); + * if r0 == 0 goto pc+1; + * BPF_EXIT; + * r1 = r6; // r1 will be "u64 *ctx". + */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 16); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_init_prologue_ids[0]); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1); + *insn++ = BPF_EXIT_INSN(); + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +BTF_ID_LIST(bpf_qdisc_reset_destroy_epilogue_ids) +BTF_ID(func, bpf_qdisc_reset_destroy_epilogue) + +static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, reset) && + prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, destroy)) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_reset_destroy_epilogue(r1); + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_reset_destroy_epilogue_ids[0]); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +__bpf_kfunc_start_defs(); + +/* bpf_skb_get_hash - Get the flow hash of an skb. + * @skb: The skb to get the flow hash from. + */ +__bpf_kfunc u32 bpf_skb_get_hash(struct sk_buff *skb) +{ + return skb_get_hash(skb); +} + +/* bpf_kfree_skb - Release an skb's reference and drop it immediately. + * @skb: The skb whose reference to be released and dropped. + */ +__bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list. + * @skb: The skb whose reference to be released and dropped. + * @to_free_list: The list of skbs to be dropped. + */ +__bpf_kfunc void bpf_qdisc_skb_drop(struct sk_buff *skb, + struct bpf_sk_buff_ptr *to_free_list) +{ + __qdisc_drop(skb, (struct sk_buff **)to_free_list); +} + +/* bpf_qdisc_watchdog_schedule - Schedule a qdisc to a later time using a timer. + * @sch: The qdisc to be scheduled. + * @expire: The expiry time of the timer. + * @delta_ns: The slack range of the timer. + */ +__bpf_kfunc void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_schedule_range_ns(&q->watchdog, expire, delta_ns); +} + +/* bpf_qdisc_init_prologue - Hidden kfunc called in prologue of .init. */ +__bpf_kfunc int bpf_qdisc_init_prologue(struct Qdisc *sch, + struct netlink_ext_ack *extack) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *p; + + if (sch->parent != TC_H_ROOT) { + p = qdisc_lookup(dev, TC_H_MAJ(sch->parent)); + if (!p) + return -ENOENT; + + if (!(p->flags & TCQ_F_MQROOT)) { + NL_SET_ERR_MSG(extack, "BPF qdisc only supported on root or mq"); + return -EINVAL; + } + } + + qdisc_watchdog_init(&q->watchdog, sch); + return 0; +} + +/* bpf_qdisc_reset_destroy_epilogue - Hidden kfunc called in epilogue of .reset + * and .destroy + */ +__bpf_kfunc void bpf_qdisc_reset_destroy_epilogue(struct Qdisc *sch) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); +} + +/* bpf_qdisc_bstats_update - Update Qdisc basic statistics + * @sch: The qdisc from which an skb is dequeued. + * @skb: The skb to be dequeued. + */ +__bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) +{ + bstats_update(&sch->bstats, skb); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(qdisc_kfunc_ids) +BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(qdisc_kfunc_ids) + +BTF_SET_START(qdisc_common_kfunc_set) +BTF_ID(func, bpf_skb_get_hash) +BTF_ID(func, bpf_kfree_skb) +BTF_ID(func, bpf_dynptr_from_skb) +BTF_SET_END(qdisc_common_kfunc_set) + +BTF_SET_START(qdisc_enqueue_kfunc_set) +BTF_ID(func, bpf_qdisc_skb_drop) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_SET_END(qdisc_enqueue_kfunc_set) + +BTF_SET_START(qdisc_dequeue_kfunc_set) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_ID(func, bpf_qdisc_bstats_update) +BTF_SET_END(qdisc_dequeue_kfunc_set) + +enum qdisc_ops_kf_flags { + QDISC_OPS_KF_COMMON = 0, + QDISC_OPS_KF_ENQUEUE = 1 << 0, + QDISC_OPS_KF_DEQUEUE = 1 << 1, +}; + +static const u32 qdisc_ops_context_flags[] = { + [QDISC_OP_IDX(enqueue)] = QDISC_OPS_KF_ENQUEUE, + [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_DEQUEUE, + [QDISC_OP_IDX(init)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(reset)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(destroy)] = QDISC_OPS_KF_COMMON, +}; + +static int bpf_qdisc_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + u32 moff, flags; + + if (!btf_id_set8_contains(&qdisc_kfunc_ids, kfunc_id)) + return 0; + + if (prog->aux->st_ops != &bpf_Qdisc_ops) + return -EACCES; + + moff = prog->aux->attach_st_ops_member_off; + flags = qdisc_ops_context_flags[QDISC_MOFF_IDX(moff)]; + + if ((flags & QDISC_OPS_KF_ENQUEUE) && + btf_id_set_contains(&qdisc_enqueue_kfunc_set, kfunc_id)) + return 0; + + if ((flags & QDISC_OPS_KF_DEQUEUE) && + btf_id_set_contains(&qdisc_dequeue_kfunc_set, kfunc_id)) + return 0; + + if (btf_id_set_contains(&qdisc_common_kfunc_set, kfunc_id)) + return 0; + + return -EACCES; +} + +static const struct btf_kfunc_id_set bpf_qdisc_kfunc_set = { + .owner = THIS_MODULE, + .set = &qdisc_kfunc_ids, + .filter = bpf_qdisc_kfunc_filter, +}; + +static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = bpf_qdisc_is_valid_access, + .btf_struct_access = bpf_qdisc_btf_struct_access, + .gen_prologue = bpf_qdisc_gen_prologue, + .gen_epilogue = bpf_qdisc_gen_epilogue, +}; + +static int bpf_qdisc_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct Qdisc_ops *uqdisc_ops; + struct Qdisc_ops *qdisc_ops; + u32 moff; + + uqdisc_ops = (const struct Qdisc_ops *)udata; + qdisc_ops = (struct Qdisc_ops *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct Qdisc_ops, priv_size): + if (uqdisc_ops->priv_size) + return -EINVAL; + qdisc_ops->priv_size = sizeof(struct bpf_sched_data); + return 1; + case offsetof(struct Qdisc_ops, peek): + qdisc_ops->peek = qdisc_peek_dequeued; + return 0; + case offsetof(struct Qdisc_ops, id): + if (bpf_obj_name_cpy(qdisc_ops->id, uqdisc_ops->id, + sizeof(qdisc_ops->id)) <= 0) + return -EINVAL; + return 1; + } + + return 0; +} + +static int bpf_qdisc_reg(void *kdata, struct bpf_link *link) +{ + return register_qdisc(kdata); +} + +static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link) +{ + return unregister_qdisc(kdata); +} + +static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch, + struct sk_buff **to_free) +{ + return 0; +} + +static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void Qdisc_ops__reset(struct Qdisc *sch) +{ +} + +static void Qdisc_ops__destroy(struct Qdisc *sch) +{ +} + +static struct Qdisc_ops __bpf_ops_qdisc_ops = { + .enqueue = Qdisc_ops__enqueue, + .dequeue = Qdisc_ops__dequeue, + .init = Qdisc_ops__init, + .reset = Qdisc_ops__reset, + .destroy = Qdisc_ops__destroy, +}; + +static struct bpf_struct_ops bpf_Qdisc_ops = { + .verifier_ops = &bpf_qdisc_verifier_ops, + .reg = bpf_qdisc_reg, + .unreg = bpf_qdisc_unreg, + .init_member = bpf_qdisc_init_member, + .init = bpf_qdisc_init, + .name = "Qdisc_ops", + .cfi_stubs = &__bpf_ops_qdisc_ops, + .owner = THIS_MODULE, +}; + +BTF_ID_LIST(bpf_sk_buff_dtor_ids) +BTF_ID(func, bpf_kfree_skb) + +static int __init bpf_qdisc_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc skb_kfunc_dtors[] = { + { + .btf_id = bpf_sk_buff_ids[0], + .kfunc_btf_id = bpf_sk_buff_dtor_ids[0] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_qdisc_kfunc_set); + ret = ret ?: register_btf_id_dtor_kfuncs(skb_kfunc_dtors, + ARRAY_SIZE(skb_kfunc_dtors), + THIS_MODULE); + ret = ret ?: register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops); + + return ret; +} +late_initcall(bpf_qdisc_kfunc_init); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f74a097f54ae7..c5e3673aadbe7 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -207,7 +208,7 @@ static struct Qdisc_ops *qdisc_lookup_default(const char *name) for (q = qdisc_base; q; q = q->next) { if (!strcmp(name, q->id)) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -237,7 +238,7 @@ int qdisc_set_default(const char *name) if (ops) { /* Set new default */ - module_put(default_qdisc_ops->owner); + bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner); default_qdisc_ops = ops; } write_unlock(&qdisc_mod_lock); @@ -359,7 +360,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { if (nla_strcmp(kind, q->id) == 0) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -1370,7 +1371,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: - module_put(ops->owner); + bpf_module_put(ops, ops->owner); err_out: *errp = err; return NULL; @@ -1782,7 +1783,7 @@ static void request_qdisc_module(struct nlattr *kind) ops = qdisc_lookup_ops(kind); if (ops) { - module_put(ops->owner); + bpf_module_put(ops, ops->owner); return; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 514b1b6ac6819..08e0e3aff9763 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1001,14 +1002,14 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, { struct Qdisc *sch; - if (!try_module_get(ops->owner)) { + if (!bpf_try_module_get(ops, ops->owner)) { NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { - module_put(ops->owner); + bpf_module_put(ops, ops->owner); return NULL; } sch->parent = parentid; @@ -1078,7 +1079,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc) ops->destroy(qdisc); lockdep_unregister_key(&qdisc->root_lock_key); - module_put(ops->owner); + bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); diff --git a/net/tipc/link.c b/net/tipc/link.c index 18be6ff4c3db0..3ee44d7317004 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2228,7 +2228,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) break; - strncpy(if_name, data, TIPC_MAX_IF_NAME); + strscpy(if_name, data, TIPC_MAX_IF_NAME); /* Update own tolerance if peer indicates a non-zero value */ if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { diff --git a/net/tipc/node.c b/net/tipc/node.c index ccf5e427f43eb..cb43f2016a708 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1581,7 +1581,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { - strncpy(linkname, tipc_link_name(link), len); + strscpy(linkname, tipc_link_name(link), len); err = 0; } tipc_node_read_unlock(node); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f78a2492826f9..2ab20821d6bb2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -950,13 +950,6 @@ static void unix_close(struct sock *sk, long timeout) */ } -static void unix_unhash(struct sock *sk) -{ - /* Nothing to do here, unix socket does not need a ->unhash(). - * This is merely for sockmap. - */ -} - static bool unix_bpf_bypass_getsockopt(int level, int optname) { if (level == SOL_SOCKET) { @@ -987,7 +980,6 @@ struct proto unix_stream_proto = { .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, - .unhash = unix_unhash, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 25a76c5ce0f12..cbf2129e808b3 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -266,13 +266,17 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, void xp_clear_dev(struct xsk_buff_pool *pool) { + struct net_device *netdev = pool->netdev; + if (!pool->netdev) return; + netdev_lock_ops(netdev); xp_disable_drv_zc(pool); xsk_clear_pool_at_qid(pool->netdev, pool->queue_id); - dev_put(pool->netdev); pool->netdev = NULL; + netdev_unlock_ops(netdev); + dev_put(netdev); } static void xp_release_deferred(struct work_struct *work) diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 622445f041d32..cb1e12740c87b 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -952,32 +952,28 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; -static void __net_exit xfrmi_exit_batch_rtnl(struct list_head *net_exit_list, - struct list_head *dev_to_kill) +static void __net_exit xfrmi_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + int i; - ASSERT_RTNL(); - list_for_each_entry(net, net_exit_list, exit_list) { - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - struct xfrm_if __rcu **xip; - struct xfrm_if *xi; - int i; - - for (i = 0; i < XFRMI_HASH_SIZE; i++) { - for (xip = &xfrmn->xfrmi[i]; - (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) - unregister_netdevice_queue(xi->dev, dev_to_kill); - } - xi = rtnl_dereference(xfrmn->collect_md_xfrmi); - if (xi) + for (i = 0; i < XFRMI_HASH_SIZE; i++) { + for (xip = &xfrmn->xfrmi[i]; + (xi = rtnl_net_dereference(net, *xip)) != NULL; + xip = &xi->next) unregister_netdevice_queue(xi->dev, dev_to_kill); } + + xi = rtnl_net_dereference(net, xfrmn->collect_md_xfrmi); + if (xi) + unregister_netdevice_queue(xi->dev, dev_to_kill); } static struct pernet_operations xfrmi_net_ops = { - .exit_batch_rtnl = xfrmi_exit_batch_rtnl, + .exit_rtnl = xfrmi_exit_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c index b7997541f7eef..f93d9145ab8a5 100644 --- a/samples/bpf/sockex2_kern.c +++ b/samples/bpf/sockex2_kern.c @@ -31,7 +31,6 @@ static inline int proto_ports_offset(__u64 proto) switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: - case IPPROTO_DCCP: case IPPROTO_ESP: case IPPROTO_SCTP: case IPPROTO_UDPLITE: diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3d22bf863eec9..298c8f8d77199 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4804,7 +4804,7 @@ sub process { } # do not use BUG() or variants - if ($line =~ /\b(?!AA_|BUILD_|DCCP_|IDA_|KVM_|RWLOCK_|snd_|SPIN_)(?:[a-zA-Z_]*_)?BUG(?:_ON)?(?:_[A-Z_]+)?\s*\(/) { + if ($line =~ /\b(?!AA_|BUILD_|IDA_|KVM_|RWLOCK_|snd_|SPIN_)(?:[a-zA-Z_]*_)?BUG(?:_ON)?(?:_[A-Z_]+)?\s*\(/) { my $msg_level = \&WARN; $msg_level = \&CHK if ($file); &{$msg_level}("AVOID_BUG", diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 1b942b4908a26..7d623b00495c1 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -68,13 +67,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb, ad->u.net->dport = uh->dest; break; } - case IPPROTO_DCCP: { - struct dccp_hdr *dh = dccp_hdr(skb); - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } case IPPROTO_SCTP: { struct sctphdr *sh = sctp_hdr(skb); @@ -140,17 +132,6 @@ int ipv6_skb_to_auditdata(struct sk_buff *skb, ad->u.net->dport = uh->dest; break; } - case IPPROTO_DCCP: { - struct dccp_hdr _dccph, *dh; - - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh == NULL) - break; - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index e7a7dcab81db6..b2695785610ad 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -65,7 +65,6 @@ #include #include #include -#include #include #include #include @@ -1191,8 +1190,6 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc return SECCLASS_ICMP_SOCKET; else return SECCLASS_RAWIP_SOCKET; - case SOCK_DCCP: - return SECCLASS_DCCP_SOCKET; default: return SECCLASS_RAWIP_SOCKET; } @@ -4392,22 +4389,6 @@ static int selinux_parse_skb_ipv4(struct sk_buff *skb, break; } - case IPPROTO_DCCP: { - struct dccp_hdr _dccph, *dh; - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - offset += ihlen; - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh == NULL) - break; - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } - #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; @@ -4486,18 +4467,6 @@ static int selinux_parse_skb_ipv6(struct sk_buff *skb, break; } - case IPPROTO_DCCP: { - struct dccp_hdr _dccph, *dh; - - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh == NULL) - break; - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } - #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; @@ -4849,10 +4818,6 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in node_perm = UDP_SOCKET__NODE_BIND; break; - case SECCLASS_DCCP_SOCKET: - node_perm = DCCP_SOCKET__NODE_BIND; - break; - case SECCLASS_SCTP_SOCKET: node_perm = SCTP_SOCKET__NODE_BIND; break; @@ -4908,11 +4873,10 @@ static int selinux_socket_connect_helper(struct socket *sock, return 0; /* - * If a TCP, DCCP or SCTP socket, check name_connect permission + * If a TCP or SCTP socket, check name_connect permission * for the port. */ if (sksec->sclass == SECCLASS_TCP_SOCKET || - sksec->sclass == SECCLASS_DCCP_SOCKET || sksec->sclass == SECCLASS_SCTP_SOCKET) { struct common_audit_data ad; struct lsm_network_audit net = {0,}; @@ -4957,9 +4921,6 @@ static int selinux_socket_connect_helper(struct socket *sock, case SECCLASS_TCP_SOCKET: perm = TCP_SOCKET__NAME_CONNECT; break; - case SECCLASS_DCCP_SOCKET: - perm = DCCP_SOCKET__NAME_CONNECT; - break; case SECCLASS_SCTP_SOCKET: perm = SCTP_SOCKET__NAME_CONNECT; break; diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 04a9b480885eb..5665aa5e7853e 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -127,8 +127,6 @@ const struct security_class_mapping secclass_map[] = { { "key", { "view", "read", "write", "search", "link", "setattr", "create", NULL } }, - { "dccp_socket", - { COMMON_SOCK_PERMS, "node_bind", "name_connect", NULL } }, { "memprotect", { "mmap_zero", NULL } }, { "peer", { "recv", NULL } }, { "capability2", { COMMON_CAP2_PERMS, NULL } }, diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 3a95986b134f2..2c0b07f9fbbd0 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -98,7 +98,6 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = { { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, - { DCCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, { SOCK_DIAG_BY_FAMILY, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, { SOCK_DESTROY, NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE }, }; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 99833168604ea..fc340a6f0ddea 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -4061,7 +4060,6 @@ static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip) __be16 frag_off; struct tcphdr _tcph, *th; struct udphdr _udph, *uh; - struct dccp_hdr _dccph, *dh; sip->sin6_port = 0; @@ -4090,11 +4088,6 @@ static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip) if (uh != NULL) sip->sin6_port = uh->source; break; - case IPPROTO_DCCP: - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh != NULL) - sip->sin6_port = dh->dccph_sport; - break; } return proto; } @@ -4216,7 +4209,7 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) case PF_INET6: proto = smk_skb_to_addr_ipv6(skb, &sadd); if (proto != IPPROTO_UDP && proto != IPPROTO_UDPLITE && - proto != IPPROTO_TCP && proto != IPPROTO_DCCP) + proto != IPPROTO_TCP) break; #ifdef SMACK_IPV6_SECMARK_LABELING skp = smack_from_skb(skb); diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e0605403f9773..fdcee6a71e0fa 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1283,6 +1283,7 @@ enum bpf_tc_attach_point { BPF_TC_INGRESS = 1 << 0, BPF_TC_EGRESS = 1 << 1, BPF_TC_CUSTOM = 1 << 2, + BPF_TC_QDISC = 1 << 3, }; #define BPF_TC_PARENT(a, b) \ @@ -1297,9 +1298,11 @@ struct bpf_tc_hook { int ifindex; enum bpf_tc_attach_point attach_point; __u32 parent; + __u32 handle; + const char *qdisc; size_t :0; }; -#define bpf_tc_hook__last_field parent +#define bpf_tc_hook__last_field qdisc struct bpf_tc_opts { size_t sz; diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 68a2def171751..c997e69d507fe 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -529,9 +529,9 @@ int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id) } -typedef int (*qdisc_config_t)(struct libbpf_nla_req *req); +typedef int (*qdisc_config_t)(struct libbpf_nla_req *req, const struct bpf_tc_hook *hook); -static int clsact_config(struct libbpf_nla_req *req) +static int clsact_config(struct libbpf_nla_req *req, const struct bpf_tc_hook *hook) { req->tc.tcm_parent = TC_H_CLSACT; req->tc.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0); @@ -539,6 +539,16 @@ static int clsact_config(struct libbpf_nla_req *req) return nlattr_add(req, TCA_KIND, "clsact", sizeof("clsact")); } +static int qdisc_config(struct libbpf_nla_req *req, const struct bpf_tc_hook *hook) +{ + const char *qdisc = OPTS_GET(hook, qdisc, NULL); + + req->tc.tcm_parent = OPTS_GET(hook, parent, TC_H_ROOT); + req->tc.tcm_handle = OPTS_GET(hook, handle, 0); + + return nlattr_add(req, TCA_KIND, qdisc, strlen(qdisc) + 1); +} + static int attach_point_to_config(struct bpf_tc_hook *hook, qdisc_config_t *config) { @@ -552,6 +562,9 @@ static int attach_point_to_config(struct bpf_tc_hook *hook, return 0; case BPF_TC_CUSTOM: return -EOPNOTSUPP; + case BPF_TC_QDISC: + *config = &qdisc_config; + return 0; default: return -EINVAL; } @@ -596,7 +609,7 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) req.tc.tcm_family = AF_UNSPEC; req.tc.tcm_ifindex = OPTS_GET(hook, ifindex, 0); - ret = config(&req); + ret = config(&req, hook); if (ret < 0) return ret; @@ -639,6 +652,7 @@ int bpf_tc_hook_destroy(struct bpf_tc_hook *hook) case BPF_TC_INGRESS: case BPF_TC_EGRESS: return libbpf_err(__bpf_tc_detach(hook, NULL, true)); + case BPF_TC_QDISC: case BPF_TC_INGRESS | BPF_TC_EGRESS: return libbpf_err(tc_qdisc_delete(hook)); case BPF_TC_CUSTOM: diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index f3269ce39e5b7..8b7bf673b686f 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -20,13 +20,18 @@ CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_H,ethtool.h) \ $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) \ $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_GENERATED_H,ethtool_netlink_generated.h) CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h) +CFLAGS_lockd_netlink:=$(call get_hdr_inc,_LINUX_LOCKD_NETLINK_H,lockd_netlink.h) CFLAGS_mptcp_pm:=$(call get_hdr_inc,_LINUX_MPTCP_PM_H,mptcp_pm.h) CFLAGS_net_shaper:=$(call get_hdr_inc,_LINUX_NET_SHAPER_H,net_shaper.h) CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h) CFLAGS_nl80211:=$(call get_hdr_inc,__LINUX_NL802121_H,nl80211.h) CFLAGS_nlctrl:=$(call get_hdr_inc,__LINUX_GENERIC_NETLINK_H,genetlink.h) CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h) +CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN,ovpn.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) +CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ + $(call get_hdr_inc,__LINUX_IF_ADDR_H,if_addr.h) +CFLAGS_rt-route:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) CFLAGS_tcp_metrics:=$(call get_hdr_inc,_LINUX_TCP_METRICS_H,tcp_metrics.h) diff --git a/tools/net/ynl/generated/Makefile b/tools/net/ynl/generated/Makefile index 21f9e299dc759..6603ad8d4ce11 100644 --- a/tools/net/ynl/generated/Makefile +++ b/tools/net/ynl/generated/Makefile @@ -25,7 +25,7 @@ SPECS_DIR:=../../../../Documentation/netlink/specs GENS_PATHS=$(shell grep -nrI --files-without-match \ 'protocol: netlink' \ $(SPECS_DIR)) -GENS=$(patsubst $(SPECS_DIR)/%.yaml,%,${GENS_PATHS}) +GENS=$(patsubst $(SPECS_DIR)/%.yaml,%,${GENS_PATHS}) rt-addr rt-route SRCS=$(patsubst %,%-user.c,${GENS}) HDRS=$(patsubst %,%-user.h,${GENS}) OBJS=$(patsubst %,%-user.o,${GENS}) diff --git a/tools/net/ynl/lib/ynl-priv.h b/tools/net/ynl/lib/ynl-priv.h index 3c09a7bbfba59..634eb16548b93 100644 --- a/tools/net/ynl/lib/ynl-priv.h +++ b/tools/net/ynl/lib/ynl-priv.h @@ -94,6 +94,9 @@ struct ynl_ntf_base_type { unsigned char data[] __attribute__((aligned(8))); }; +struct nlmsghdr *ynl_msg_start_req(struct ynl_sock *ys, __u32 id); +struct nlmsghdr *ynl_msg_start_dump(struct ynl_sock *ys, __u32 id); + struct nlmsghdr * ynl_gemsg_start_req(struct ynl_sock *ys, __u32 id, __u8 cmd, __u8 version); struct nlmsghdr * diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index ce32cb35007d6..70f899a540076 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -451,14 +451,14 @@ ynl_gemsg_start(struct ynl_sock *ys, __u32 id, __u16 flags, return nlh; } -void ynl_msg_start_req(struct ynl_sock *ys, __u32 id) +struct nlmsghdr *ynl_msg_start_req(struct ynl_sock *ys, __u32 id) { - ynl_msg_start(ys, id, NLM_F_REQUEST | NLM_F_ACK); + return ynl_msg_start(ys, id, NLM_F_REQUEST | NLM_F_ACK); } -void ynl_msg_start_dump(struct ynl_sock *ys, __u32 id) +struct nlmsghdr *ynl_msg_start_dump(struct ynl_sock *ys, __u32 id) { - ynl_msg_start(ys, id, NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP); + return ynl_msg_start(ys, id, NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP); } struct nlmsghdr * @@ -663,6 +663,7 @@ ynl_sock_create(const struct ynl_family *yf, struct ynl_error *yse) struct sockaddr_nl addr; struct ynl_sock *ys; socklen_t addrlen; + int sock_type; int one = 1; ys = malloc(sizeof(*ys) + 2 * YNL_SOCKET_BUFFER_SIZE); @@ -675,7 +676,9 @@ ynl_sock_create(const struct ynl_family *yf, struct ynl_error *yse) ys->rx_buf = &ys->raw_buf[YNL_SOCKET_BUFFER_SIZE]; ys->ntf_last_next = &ys->ntf_first; - ys->socket = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + sock_type = yf->is_classic ? yf->classic_id : NETLINK_GENERIC; + + ys->socket = socket(AF_NETLINK, SOCK_RAW, sock_type); if (ys->socket < 0) { __perr(yse, "failed to create a netlink socket"); goto err_free_sock; @@ -708,8 +711,9 @@ ynl_sock_create(const struct ynl_family *yf, struct ynl_error *yse) ys->portid = addr.nl_pid; ys->seq = random(); - - if (ynl_sock_read_family(ys, yf->name)) { + if (yf->is_classic) { + ys->family_id = yf->classic_id; + } else if (ynl_sock_read_family(ys, yf->name)) { if (yse) memcpy(yse, &ys->err, sizeof(*yse)); goto err_close_sock; @@ -791,13 +795,21 @@ static int ynl_ntf_parse(struct ynl_sock *ys, const struct nlmsghdr *nlh) struct ynl_parse_arg yarg = { .ys = ys, }; const struct ynl_ntf_info *info; struct ynl_ntf_base_type *rsp; - struct genlmsghdr *gehdr; + __u32 cmd; int ret; - gehdr = ynl_nlmsg_data(nlh); - if (gehdr->cmd >= ys->family->ntf_info_size) + if (ys->family->is_classic) { + cmd = nlh->nlmsg_type; + } else { + struct genlmsghdr *gehdr; + + gehdr = ynl_nlmsg_data(nlh); + cmd = gehdr->cmd; + } + + if (cmd >= ys->family->ntf_info_size) return YNL_PARSE_CB_ERROR; - info = &ys->family->ntf_info[gehdr->cmd]; + info = &ys->family->ntf_info[cmd]; if (!info->cb) return YNL_PARSE_CB_ERROR; @@ -811,7 +823,7 @@ static int ynl_ntf_parse(struct ynl_sock *ys, const struct nlmsghdr *nlh) goto err_free; rsp->family = nlh->nlmsg_type; - rsp->cmd = gehdr->cmd; + rsp->cmd = cmd; *ys->ntf_last_next = rsp; ys->ntf_last_next = &rsp->next; @@ -863,17 +875,22 @@ int ynl_error_parse(struct ynl_parse_arg *yarg, const char *msg) static int ynl_check_alien(struct ynl_sock *ys, const struct nlmsghdr *nlh, __u32 rsp_cmd) { - struct genlmsghdr *gehdr; + if (ys->family->is_classic) { + if (nlh->nlmsg_type != rsp_cmd) + return ynl_ntf_parse(ys, nlh); + } else { + struct genlmsghdr *gehdr; - if (ynl_nlmsg_data_len(nlh) < sizeof(*gehdr)) { - yerr(ys, YNL_ERROR_INV_RESP, - "Kernel responded with truncated message"); - return -1; - } + if (ynl_nlmsg_data_len(nlh) < sizeof(*gehdr)) { + yerr(ys, YNL_ERROR_INV_RESP, + "Kernel responded with truncated message"); + return -1; + } - gehdr = ynl_nlmsg_data(nlh); - if (gehdr->cmd != rsp_cmd) - return ynl_ntf_parse(ys, nlh); + gehdr = ynl_nlmsg_data(nlh); + if (gehdr->cmd != rsp_cmd) + return ynl_ntf_parse(ys, nlh); + } return 0; } diff --git a/tools/net/ynl/lib/ynl.h b/tools/net/ynl/lib/ynl.h index 6cd570b283eab..59256e258130b 100644 --- a/tools/net/ynl/lib/ynl.h +++ b/tools/net/ynl/lib/ynl.h @@ -2,6 +2,7 @@ #ifndef __YNL_C_H #define __YNL_C_H 1 +#include #include #include #include @@ -48,6 +49,8 @@ struct ynl_family { /* private: */ const char *name; size_t hdr_len; + bool is_classic; + __u16 classic_id; const struct ynl_ntf_info *ntf_info; unsigned int ntf_info_size; }; diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 30c0a34b2784e..0d930c17f9638 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -993,9 +993,6 @@ def __init__(self, file_name, exclude_ops): def resolve(self): self.resolve_up(super()) - if self.yaml.get('protocol', 'genetlink') not in {'genetlink', 'genetlink-c', 'genetlink-legacy'}: - raise Exception("Codegen only supported for genetlink") - self.c_name = c_lower(self.ident_name) if 'name-prefix' in self.yaml['operations']: self.op_prefix = c_upper(self.yaml['operations']['name-prefix']) @@ -1042,6 +1039,9 @@ def new_attr_set(self, elem): def new_operation(self, elem, req_value, rsp_value): return Operation(self, elem, req_value, rsp_value) + def is_classic(self): + return self.proto == 'netlink-raw' + def _mark_notify(self): for op in self.msgs.values(): if 'notify' in op: @@ -1240,6 +1240,7 @@ def __init__(self, cw, family, ku_space, op, op_mode, attr_set=None): # 'do' and 'dump' response parsing is identical self.type_consistent = True + self.type_oneside = False if op_mode != 'do' and 'dump' in op: if 'do' in op: if ('reply' in op['do']) != ('reply' in op["dump"]): @@ -1247,7 +1248,8 @@ def __init__(self, cw, family, ku_space, op, op_mode, attr_set=None): elif 'reply' in op['do'] and op["do"]["reply"] != op["dump"]["reply"]: self.type_consistent = False else: - self.type_consistent = False + self.type_consistent = True + self.type_oneside = True self.attr_set = attr_set if not self.attr_set: @@ -1275,6 +1277,9 @@ def __init__(self, cw, family, ku_space, op, op_mode, attr_set=None): if op_mode == 'event': self.struct['reply'] = Struct(family, self.attr_set, type_list=op['event']['attributes']) + def type_empty(self, key): + return len(self.struct[key].attr_list) == 0 and self.fixed_hdr is None + class CodeWriter: def __init__(self, nlib, out_file=None, overwrite=True): @@ -1541,7 +1546,9 @@ def op_prefix(ri, direction, deref=False): suffix += f"{direction_to_suffix[direction]}" else: if direction == 'request': - suffix += '_req_dump' + suffix += '_req' + if not ri.type_oneside: + suffix += '_dump' else: if ri.type_consistent: if deref: @@ -1747,7 +1754,10 @@ def _multi_parse(ri, struct, init_lines, local_vars): ri.cw.p(f'dst->{arg} = {arg};') if ri.fixed_hdr: - ri.cw.p('hdr = ynl_nlmsg_data_offset(nlh, sizeof(struct genlmsghdr));') + if ri.family.is_classic(): + ri.cw.p('hdr = ynl_nlmsg_data(nlh);') + else: + ri.cw.p('hdr = ynl_nlmsg_data_offset(nlh, sizeof(struct genlmsghdr));') ri.cw.p(f"memcpy(&dst->_hdr, hdr, sizeof({ri.fixed_hdr}));") for anest in sorted(all_multi): aspec = struct[anest] @@ -1899,7 +1909,10 @@ def print_req(ri): ri.cw.block_start() ri.cw.write_func_lvar(local_vars) - ri.cw.p(f"nlh = ynl_gemsg_start_req(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);") + if ri.family.is_classic(): + ri.cw.p(f"nlh = ynl_msg_start_req(ys, {ri.op.enum_name});") + else: + ri.cw.p(f"nlh = ynl_gemsg_start_req(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);") ri.cw.p(f"ys->req_policy = &{ri.struct['request'].render_name}_nest;") if 'reply' in ri.op[ri.op_mode]: @@ -1968,7 +1981,10 @@ def print_dump(ri): else: ri.cw.p(f'yds.rsp_cmd = {ri.op.rsp_value};') ri.cw.nl() - ri.cw.p(f"nlh = ynl_gemsg_start_dump(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);") + if ri.family.is_classic(): + ri.cw.p(f"nlh = ynl_msg_start_dump(ys, {ri.op.enum_name});") + else: + ri.cw.p(f"nlh = ynl_gemsg_start_dump(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);") if ri.fixed_hdr: ri.cw.p("hdr_len = sizeof(req->_hdr);") @@ -2028,7 +2044,7 @@ def _print_type(ri, direction, struct): if not direction and ri.type_name_conflict: suffix += '_' - if ri.op_mode == 'dump': + if ri.op_mode == 'dump' and not ri.type_oneside: suffix += '_dump' ri.cw.block_start(line=f"struct {ri.family.c_name}{suffix}") @@ -2079,7 +2095,7 @@ def print_type_helpers(ri, direction, deref=False): def print_req_type_helpers(ri): - if len(ri.struct["request"].attr_list) == 0: + if ri.type_empty("request"): return print_alloc_wrapper(ri, "request") print_type_helpers(ri, "request") @@ -2102,7 +2118,7 @@ def print_parse_prototype(ri, direction, terminate=True): def print_req_type(ri): - if len(ri.struct["request"].attr_list) == 0: + if ri.type_empty("request"): return print_type(ri, "request") @@ -2755,7 +2771,7 @@ def render_user_family(family, cw, prototype): return if family.ntfs: - cw.block_start(line=f"static const struct ynl_ntf_info {family['name']}_ntf_info[] = ") + cw.block_start(line=f"static const struct ynl_ntf_info {family.c_name}_ntf_info[] = ") for ntf_op_name, ntf_op in family.ntfs.items(): if 'notify' in ntf_op: op = family.ops[ntf_op['notify']] @@ -2775,13 +2791,18 @@ def render_user_family(family, cw, prototype): cw.block_start(f'{symbol} = ') cw.p(f'.name\t\t= "{family.c_name}",') - if family.fixed_header: + if family.is_classic(): + cw.p(f'.is_classic\t= true,') + cw.p(f'.classic_id\t= {family.get("protonum")},') + if family.is_classic(): + cw.p(f'.hdr_len\t= sizeof(struct {c_lower(family.fixed_header)}),') + elif family.fixed_header: cw.p(f'.hdr_len\t= sizeof(struct genlmsghdr) + sizeof(struct {c_lower(family.fixed_header)}),') else: cw.p('.hdr_len\t= sizeof(struct genlmsghdr),') if family.ntfs: - cw.p(f".ntf_info\t= {family['name']}_ntf_info,") - cw.p(f".ntf_info_size\t= YNL_ARRAY_SIZE({family['name']}_ntf_info),") + cw.p(f".ntf_info\t= {family.c_name}_ntf_info,") + cw.p(f".ntf_info_size\t= YNL_ARRAY_SIZE({family.c_name}_ntf_info),") cw.block_end(line=';') @@ -3010,7 +3031,7 @@ def main(): ri = RenderInfo(cw, parsed, args.mode, op, 'dump') print_req_type(ri) print_req_type_helpers(ri) - if not ri.type_consistent: + if not ri.type_consistent or ri.type_oneside: print_rsp_type(ri) print_wrapped_type(ri) print_dump_prototype(ri) @@ -3088,7 +3109,7 @@ def main(): if 'dump' in op: cw.p(f"/* {op.enum_name} - dump */") ri = RenderInfo(cw, parsed, args.mode, op, "dump") - if not ri.type_consistent: + if not ri.type_consistent or ri.type_oneside: parse_rsp_msg(ri, deref=True) print_req_free(ri) print_dump_type_free(ri) diff --git a/tools/net/ynl/samples/.gitignore b/tools/net/ynl/samples/.gitignore index dda6686257a7c..7f9781cf532fd 100644 --- a/tools/net/ynl/samples/.gitignore +++ b/tools/net/ynl/samples/.gitignore @@ -2,4 +2,6 @@ ethtool devlink netdev ovs -page-pool \ No newline at end of file +page-pool +rt-addr +rt-route diff --git a/tools/net/ynl/samples/rt-addr.c b/tools/net/ynl/samples/rt-addr.c new file mode 100644 index 0000000000000..0f4851b4ec57b --- /dev/null +++ b/tools/net/ynl/samples/rt-addr.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include + +#include +#include + +#include "rt-addr-user.h" + +static void rt_addr_print(struct rt_addr_getaddr_rsp *a) +{ + char ifname[IF_NAMESIZE]; + char addr_str[64]; + const char *addr; + const char *name; + + name = if_indextoname(a->_hdr.ifa_index, ifname); + if (name) + printf("%16s: ", name); + + switch (a->_present.address_len) { + case 4: + addr = inet_ntop(AF_INET, a->address, + addr_str, sizeof(addr_str)); + break; + case 16: + addr = inet_ntop(AF_INET6, a->address, + addr_str, sizeof(addr_str)); + break; + default: + addr = NULL; + break; + } + if (addr) + printf("%s", addr); + else + printf("[%d]", a->_present.address_len); + + printf("\n"); +} + +int main(int argc, char **argv) +{ + struct rt_addr_getaddr_list *rsp; + struct rt_addr_getaddr_req *req; + struct ynl_error yerr; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_addr_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return 1; + } + + req = rt_addr_getaddr_req_alloc(); + if (!req) + goto err_destroy; + + rsp = rt_addr_getaddr_dump(ys, req); + rt_addr_getaddr_req_free(req); + if (!rsp) + goto err_close; + + if (ynl_dump_empty(rsp)) + fprintf(stderr, "Error: no addresses reported\n"); + ynl_dump_foreach(rsp, addr) + rt_addr_print(addr); + rt_addr_getaddr_list_free(rsp); + + ynl_sock_destroy(ys); + return 0; + +err_close: + fprintf(stderr, "YNL: %s\n", ys->err.msg); +err_destroy: + ynl_sock_destroy(ys); + return 2; +} diff --git a/tools/net/ynl/samples/rt-route.c b/tools/net/ynl/samples/rt-route.c new file mode 100644 index 0000000000000..9d9c868f88733 --- /dev/null +++ b/tools/net/ynl/samples/rt-route.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include + +#include +#include + +#include "rt-route-user.h" + +static void rt_route_print(struct rt_route_getroute_rsp *r) +{ + char ifname[IF_NAMESIZE]; + char route_str[64]; + const char *route; + const char *name; + + /* Ignore local */ + if (r->_hdr.rtm_table == RT_TABLE_LOCAL) + return; + + if (r->_present.oif) { + name = if_indextoname(r->oif, ifname); + if (name) + printf("oif: %-16s ", name); + } + + if (r->_present.dst_len) { + route = inet_ntop(r->_hdr.rtm_family, r->dst, + route_str, sizeof(route_str)); + printf("dst: %s/%d", route, r->_hdr.rtm_dst_len); + } + + if (r->_present.gateway_len) { + route = inet_ntop(r->_hdr.rtm_family, r->gateway, + route_str, sizeof(route_str)); + printf("gateway: %s ", route); + } + + printf("\n"); +} + +int main(int argc, char **argv) +{ + struct rt_route_getroute_req_dump *req; + struct rt_route_getroute_list *rsp; + struct ynl_error yerr; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_route_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return 1; + } + + req = rt_route_getroute_req_dump_alloc(); + if (!req) + goto err_destroy; + + rsp = rt_route_getroute_dump(ys, req); + rt_route_getroute_req_dump_free(req); + if (!rsp) + goto err_close; + + if (ynl_dump_empty(rsp)) + fprintf(stderr, "Error: no routeesses reported\n"); + ynl_dump_foreach(rsp, route) + rt_route_print(route); + rt_route_getroute_list_free(rsp); + + ynl_sock_destroy(ys); + return 0; + +err_close: + fprintf(stderr, "YNL: %s\n", ys->err.msg); +err_destroy: + ynl_sock_destroy(ys); + return 2; +} diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index c77c8c8e3d9bd..61bb8bf1b5074 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -71,6 +71,7 @@ TARGETS += net/hsr TARGETS += net/mptcp TARGETS += net/netfilter TARGETS += net/openvswitch +TARGETS += net/ovpn TARGETS += net/packetdrill TARGETS += net/rds TARGETS += net/tcp_ao diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index c378d5d07e029..3201a962b3dcd 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -71,8 +71,10 @@ CONFIG_NET_IPGRE=y CONFIG_NET_IPGRE_DEMUX=y CONFIG_NET_IPIP=y CONFIG_NET_MPLS_GSO=y +CONFIG_NET_SCH_BPF=y CONFIG_NET_SCH_FQ=y CONFIG_NET_SCH_INGRESS=y +CONFIG_NET_SCH_HTB=y CONFIG_NET_SCHED=y CONFIG_NETDEVSIM=y CONFIG_NETFILTER=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c new file mode 100644 index 0000000000000..c9a54177c84ed --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#include "network_helpers.h" +#include "bpf_qdisc_fifo.skel.h" +#include "bpf_qdisc_fq.skel.h" + +#define LO_IFINDEX 1 + +static const unsigned int total_bytes = 10 * 1024 * 1024; + +static void do_test(char *qdisc) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = LO_IFINDEX, + .attach_point = BPF_TC_QDISC, + .parent = TC_H_ROOT, + .handle = 0x8000000, + .qdisc = qdisc); + int srv_fd = -1, cli_fd = -1; + int err; + + err = bpf_tc_hook_create(&hook); + if (!ASSERT_OK(err, "attach qdisc")) + return; + + srv_fd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); + if (!ASSERT_OK_FD(srv_fd, "start server")) + goto done; + + cli_fd = connect_to_fd(srv_fd, 0); + if (!ASSERT_OK_FD(cli_fd, "connect to client")) + goto done; + + err = send_recv_data(srv_fd, cli_fd, total_bytes); + ASSERT_OK(err, "send_recv_data"); + +done: + if (srv_fd != -1) + close(srv_fd); + if (cli_fd != -1) + close(cli_fd); + + bpf_tc_hook_destroy(&hook); +} + +static void test_fifo(void) +{ + struct bpf_qdisc_fifo *fifo_skel; + struct bpf_link *link; + + fifo_skel = bpf_qdisc_fifo__open_and_load(); + if (!ASSERT_OK_PTR(fifo_skel, "bpf_qdisc_fifo__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(fifo_skel->maps.fifo); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_qdisc_fifo__destroy(fifo_skel); + return; + } + + do_test("bpf_fifo"); + + bpf_link__destroy(link); + bpf_qdisc_fifo__destroy(fifo_skel); +} + +static void test_fq(void) +{ + struct bpf_qdisc_fq *fq_skel; + struct bpf_link *link; + + fq_skel = bpf_qdisc_fq__open_and_load(); + if (!ASSERT_OK_PTR(fq_skel, "bpf_qdisc_fq__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(fq_skel->maps.fq); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_qdisc_fq__destroy(fq_skel); + return; + } + + do_test("bpf_fq"); + + bpf_link__destroy(link); + bpf_qdisc_fq__destroy(fq_skel); +} + +static void test_qdisc_attach_to_mq(void) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, + .attach_point = BPF_TC_QDISC, + .parent = TC_H_MAKE(1 << 16, 1), + .handle = 0x11 << 16, + .qdisc = "bpf_fifo"); + struct bpf_qdisc_fifo *fifo_skel; + struct bpf_link *link; + int err; + + fifo_skel = bpf_qdisc_fifo__open_and_load(); + if (!ASSERT_OK_PTR(fifo_skel, "bpf_qdisc_fifo__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(fifo_skel->maps.fifo); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_qdisc_fifo__destroy(fifo_skel); + return; + } + + SYS(out, "ip link add veth0 type veth peer veth1"); + hook.ifindex = if_nametoindex("veth0"); + SYS(out, "tc qdisc add dev veth0 root handle 1: mq"); + + err = bpf_tc_hook_create(&hook); + ASSERT_OK(err, "attach qdisc"); + + bpf_tc_hook_destroy(&hook); + + SYS(out, "tc qdisc delete dev veth0 root mq"); +out: + bpf_link__destroy(link); + bpf_qdisc_fifo__destroy(fifo_skel); +} + +static void test_qdisc_attach_to_non_root(void) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = LO_IFINDEX, + .attach_point = BPF_TC_QDISC, + .parent = TC_H_MAKE(1 << 16, 1), + .handle = 0x11 << 16, + .qdisc = "bpf_fifo"); + struct bpf_qdisc_fifo *fifo_skel; + struct bpf_link *link; + int err; + + fifo_skel = bpf_qdisc_fifo__open_and_load(); + if (!ASSERT_OK_PTR(fifo_skel, "bpf_qdisc_fifo__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(fifo_skel->maps.fifo); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_qdisc_fifo__destroy(fifo_skel); + return; + } + + SYS(out, "tc qdisc add dev lo root handle 1: htb"); + SYS(out_del_htb, "tc class add dev lo parent 1: classid 1:1 htb rate 75Kbit"); + + err = bpf_tc_hook_create(&hook); + if (!ASSERT_ERR(err, "attach qdisc")) + bpf_tc_hook_destroy(&hook); + +out_del_htb: + SYS(out, "tc qdisc delete dev lo root htb"); +out: + bpf_link__destroy(link); + bpf_qdisc_fifo__destroy(fifo_skel); +} + +void test_bpf_qdisc(void) +{ + struct netns_obj *netns; + + netns = netns_new("bpf_qdisc_ns", true); + if (!ASSERT_OK_PTR(netns, "netns_new")) + return; + + if (test__start_subtest("fifo")) + test_fifo(); + if (test__start_subtest("fq")) + test_fq(); + if (test__start_subtest("attach to mq")) + test_qdisc_attach_to_mq(); + if (test__start_subtest("attach to non root")) + test_qdisc_attach_to_non_root(); + + netns_free(netns); +} diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index 77d07e0a4a55c..559f45239a83d 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -7,6 +7,7 @@ #include "linked_list.skel.h" #include "linked_list_fail.skel.h" +#include "linked_list_peek.skel.h" static char log_buf[1024 * 1024]; @@ -804,4 +805,5 @@ void test_linked_list(void) test_linked_list_success(LIST_IN_LIST, false); test_linked_list_success(LIST_IN_LIST, true); test_linked_list_success(TEST_ALL, false); + RUN_TESTS(linked_list_peek); } diff --git a/tools/testing/selftests/bpf/prog_tests/rbtree.c b/tools/testing/selftests/bpf/prog_tests/rbtree.c index 9818f06c97c5a..d8f3d7a45fe9f 100644 --- a/tools/testing/selftests/bpf/prog_tests/rbtree.c +++ b/tools/testing/selftests/bpf/prog_tests/rbtree.c @@ -8,6 +8,7 @@ #include "rbtree_fail.skel.h" #include "rbtree_btf_fail__wrong_node_type.skel.h" #include "rbtree_btf_fail__add_wrong_type.skel.h" +#include "rbtree_search.skel.h" static void test_rbtree_add_nodes(void) { @@ -187,3 +188,8 @@ void test_rbtree_fail(void) { RUN_TESTS(rbtree_fail); } + +void test_rbtree_search(void) +{ + RUN_TESTS(rbtree_search); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h b/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h new file mode 100644 index 0000000000000..7e7f2fe04f22f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BPF_QDISC_COMMON_H +#define _BPF_QDISC_COMMON_H + +#define NET_XMIT_SUCCESS 0x00 +#define NET_XMIT_DROP 0x01 /* skb dropped */ +#define NET_XMIT_CN 0x02 /* congestion notification */ + +#define TC_PRIO_CONTROL 7 +#define TC_PRIO_MAX 15 + +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) + +struct bpf_sk_buff_ptr; + +u32 bpf_skb_get_hash(struct sk_buff *p) __ksym; +void bpf_kfree_skb(struct sk_buff *p) __ksym; +void bpf_qdisc_skb_drop(struct sk_buff *p, struct bpf_sk_buff_ptr *to_free) __ksym; +void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns) __ksym; +void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) __ksym; + +static struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) +{ + return (struct qdisc_skb_cb *)skb->cb; +} + +static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) +{ + return qdisc_skb_cb(skb)->pkt_len; +} + +#endif diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c new file mode 100644 index 0000000000000..0c7cfb82dae19 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" + +char _license[] SEC("license") = "GPL"; + +struct skb_node { + struct sk_buff __kptr * skb; + struct bpf_list_node node; +}; + +private(A) struct bpf_spin_lock q_fifo_lock; +private(A) struct bpf_list_head q_fifo __contains(skb_node, node); + +SEC("struct_ops/bpf_fifo_enqueue") +int BPF_PROG(bpf_fifo_enqueue, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct skb_node *skbn; + u32 pkt_len; + + if (sch->q.qlen == sch->limit) + goto drop; + + skbn = bpf_obj_new(typeof(*skbn)); + if (!skbn) + goto drop; + + pkt_len = qdisc_pkt_len(skb); + + sch->q.qlen++; + skb = bpf_kptr_xchg(&skbn->skb, skb); + if (skb) + bpf_qdisc_skb_drop(skb, to_free); + + bpf_spin_lock(&q_fifo_lock); + bpf_list_push_back(&q_fifo, &skbn->node); + bpf_spin_unlock(&q_fifo_lock); + + sch->qstats.backlog += pkt_len; + return NET_XMIT_SUCCESS; +drop: + bpf_qdisc_skb_drop(skb, to_free); + return NET_XMIT_DROP; +} + +SEC("struct_ops/bpf_fifo_dequeue") +struct sk_buff *BPF_PROG(bpf_fifo_dequeue, struct Qdisc *sch) +{ + struct bpf_list_node *node; + struct sk_buff *skb = NULL; + struct skb_node *skbn; + + bpf_spin_lock(&q_fifo_lock); + node = bpf_list_pop_front(&q_fifo); + bpf_spin_unlock(&q_fifo_lock); + if (!node) + return NULL; + + skbn = container_of(node, struct skb_node, node); + skb = bpf_kptr_xchg(&skbn->skb, skb); + bpf_obj_drop(skbn); + if (!skb) + return NULL; + + sch->qstats.backlog -= qdisc_pkt_len(skb); + bpf_qdisc_bstats_update(sch, skb); + sch->q.qlen--; + + return skb; +} + +SEC("struct_ops/bpf_fifo_init") +int BPF_PROG(bpf_fifo_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + sch->limit = 1000; + return 0; +} + +SEC("struct_ops/bpf_fifo_reset") +void BPF_PROG(bpf_fifo_reset, struct Qdisc *sch) +{ + struct bpf_list_node *node; + struct skb_node *skbn; + int i; + + bpf_for(i, 0, sch->q.qlen) { + struct sk_buff *skb = NULL; + + bpf_spin_lock(&q_fifo_lock); + node = bpf_list_pop_front(&q_fifo); + bpf_spin_unlock(&q_fifo_lock); + + if (!node) + break; + + skbn = container_of(node, struct skb_node, node); + skb = bpf_kptr_xchg(&skbn->skb, skb); + if (skb) + bpf_kfree_skb(skb); + bpf_obj_drop(skbn); + } + sch->q.qlen = 0; +} + +SEC(".struct_ops") +struct Qdisc_ops fifo = { + .enqueue = (void *)bpf_fifo_enqueue, + .dequeue = (void *)bpf_fifo_dequeue, + .init = (void *)bpf_fifo_init, + .reset = (void *)bpf_fifo_reset, + .id = "bpf_fifo", +}; + diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c new file mode 100644 index 0000000000000..7c110a1562247 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c @@ -0,0 +1,750 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* bpf_fq is intended for testing the bpf qdisc infrastructure and not a direct + * copy of sch_fq. bpf_fq implements the scheduling algorithm of sch_fq before + * 29f834aa326e ("net_sched: sch_fq: add 3 bands and WRR scheduling") was + * introduced. It gives each flow a fair chance to transmit packets in a + * round-robin fashion. Note that for flow pacing, bpf_fq currently only + * respects skb->tstamp but not skb->sk->sk_pacing_rate. In addition, if there + * are multiple bpf_fq instances, they will have a shared view of flows and + * configuration since some key data structure such as fq_prio_flows, + * fq_nonprio_flows, and fq_bpf_data are global. + * + * To use bpf_fq alone without running selftests, use the following commands. + * + * 1. Register bpf_fq to the kernel + * bpftool struct_ops register bpf_qdisc_fq.bpf.o /sys/fs/bpf + * 2. Add bpf_fq to an interface + * tc qdisc add dev root handle bpf_fq + * 3. Delete bpf_fq attached to the interface + * tc qdisc delete dev root + * 4. Unregister bpf_fq + * bpftool struct_ops unregister name fq + * + * The qdisc name, bpf_fq, used in tc commands is defined by Qdisc_ops.id. + * The struct_ops_map_name, fq, used in the bpftool command is the name of the + * Qdisc_ops. + * + * SEC(".struct_ops") + * struct Qdisc_ops fq = { + * ... + * .id = "bpf_fq", + * }; + */ + +#include +#include +#include +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" + +char _license[] SEC("license") = "GPL"; + +#define NSEC_PER_USEC 1000L +#define NSEC_PER_SEC 1000000000L + +#define NUM_QUEUE (1 << 20) + +struct fq_bpf_data { + u32 quantum; + u32 initial_quantum; + u32 flow_refill_delay; + u32 flow_plimit; + u64 horizon; + u32 orphan_mask; + u32 timer_slack; + u64 time_next_delayed_flow; + u64 unthrottle_latency_ns; + u8 horizon_drop; + u32 new_flow_cnt; + u32 old_flow_cnt; + u64 ktime_cache; +}; + +enum { + CLS_RET_PRIO = 0, + CLS_RET_NONPRIO = 1, + CLS_RET_ERR = 2, +}; + +struct skb_node { + u64 tstamp; + struct sk_buff __kptr * skb; + struct bpf_rb_node node; +}; + +struct fq_flow_node { + int credit; + u32 qlen; + u64 age; + u64 time_next_packet; + struct bpf_list_node list_node; + struct bpf_rb_node rb_node; + struct bpf_rb_root queue __contains(skb_node, node); + struct bpf_spin_lock lock; + struct bpf_refcount refcount; +}; + +struct dequeue_nonprio_ctx { + bool stop_iter; + u64 expire; + u64 now; +}; + +struct remove_flows_ctx { + bool gc_only; + u32 reset_cnt; + u32 reset_max; +}; + +struct unset_throttled_flows_ctx { + bool unset_all; + u64 now; +}; + +struct fq_stashed_flow { + struct fq_flow_node __kptr * flow; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u64); + __type(value, struct fq_stashed_flow); + __uint(max_entries, NUM_QUEUE); +} fq_nonprio_flows SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u64); + __type(value, struct fq_stashed_flow); + __uint(max_entries, 1); +} fq_prio_flows SEC(".maps"); + +private(A) struct bpf_spin_lock fq_delayed_lock; +private(A) struct bpf_rb_root fq_delayed __contains(fq_flow_node, rb_node); + +private(B) struct bpf_spin_lock fq_new_flows_lock; +private(B) struct bpf_list_head fq_new_flows __contains(fq_flow_node, list_node); + +private(C) struct bpf_spin_lock fq_old_flows_lock; +private(C) struct bpf_list_head fq_old_flows __contains(fq_flow_node, list_node); + +private(D) struct fq_bpf_data q; + +/* Wrapper for bpf_kptr_xchg that expects NULL dst */ +static void bpf_kptr_xchg_back(void *map_val, void *ptr) +{ + void *ret; + + ret = bpf_kptr_xchg(map_val, ptr); + if (ret) + bpf_obj_drop(ret); +} + +static bool skbn_tstamp_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + struct skb_node *skbn_a; + struct skb_node *skbn_b; + + skbn_a = container_of(a, struct skb_node, node); + skbn_b = container_of(b, struct skb_node, node); + + return skbn_a->tstamp < skbn_b->tstamp; +} + +static bool fn_time_next_packet_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + struct fq_flow_node *flow_a; + struct fq_flow_node *flow_b; + + flow_a = container_of(a, struct fq_flow_node, rb_node); + flow_b = container_of(b, struct fq_flow_node, rb_node); + + return flow_a->time_next_packet < flow_b->time_next_packet; +} + +static void +fq_flows_add_head(struct bpf_list_head *head, struct bpf_spin_lock *lock, + struct fq_flow_node *flow, u32 *flow_cnt) +{ + bpf_spin_lock(lock); + bpf_list_push_front(head, &flow->list_node); + bpf_spin_unlock(lock); + *flow_cnt += 1; +} + +static void +fq_flows_add_tail(struct bpf_list_head *head, struct bpf_spin_lock *lock, + struct fq_flow_node *flow, u32 *flow_cnt) +{ + bpf_spin_lock(lock); + bpf_list_push_back(head, &flow->list_node); + bpf_spin_unlock(lock); + *flow_cnt += 1; +} + +static void +fq_flows_remove_front(struct bpf_list_head *head, struct bpf_spin_lock *lock, + struct bpf_list_node **node, u32 *flow_cnt) +{ + bpf_spin_lock(lock); + *node = bpf_list_pop_front(head); + bpf_spin_unlock(lock); + *flow_cnt -= 1; +} + +static bool +fq_flows_is_empty(struct bpf_list_head *head, struct bpf_spin_lock *lock) +{ + struct bpf_list_node *node; + + bpf_spin_lock(lock); + node = bpf_list_pop_front(head); + if (node) { + bpf_list_push_front(head, node); + bpf_spin_unlock(lock); + return false; + } + bpf_spin_unlock(lock); + + return true; +} + +/* flow->age is used to denote the state of the flow (not-detached, detached, throttled) + * as well as the timestamp when the flow is detached. + * + * 0: not-detached + * 1 - (~0ULL-1): detached + * ~0ULL: throttled + */ +static void fq_flow_set_detached(struct fq_flow_node *flow) +{ + flow->age = bpf_jiffies64(); +} + +static bool fq_flow_is_detached(struct fq_flow_node *flow) +{ + return flow->age != 0 && flow->age != ~0ULL; +} + +static bool sk_listener(struct sock *sk) +{ + return (1 << sk->__sk_common.skc_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); +} + +static void fq_gc(void); + +static int fq_new_flow(void *flow_map, struct fq_stashed_flow **sflow, u64 hash) +{ + struct fq_stashed_flow tmp = {}; + struct fq_flow_node *flow; + int ret; + + flow = bpf_obj_new(typeof(*flow)); + if (!flow) + return -ENOMEM; + + flow->credit = q.initial_quantum, + flow->qlen = 0, + flow->age = 1, + flow->time_next_packet = 0, + + ret = bpf_map_update_elem(flow_map, &hash, &tmp, 0); + if (ret == -ENOMEM || ret == -E2BIG) { + fq_gc(); + bpf_map_update_elem(&fq_nonprio_flows, &hash, &tmp, 0); + } + + *sflow = bpf_map_lookup_elem(flow_map, &hash); + if (!*sflow) { + bpf_obj_drop(flow); + return -ENOMEM; + } + + bpf_kptr_xchg_back(&(*sflow)->flow, flow); + return 0; +} + +static int +fq_classify(struct sk_buff *skb, struct fq_stashed_flow **sflow) +{ + struct sock *sk = skb->sk; + int ret = CLS_RET_NONPRIO; + u64 hash = 0; + + if ((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL) { + *sflow = bpf_map_lookup_elem(&fq_prio_flows, &hash); + ret = CLS_RET_PRIO; + } else { + if (!sk || sk_listener(sk)) { + hash = bpf_skb_get_hash(skb) & q.orphan_mask; + /* Avoid collision with an existing flow hash, which + * only uses the lower 32 bits of hash, by setting the + * upper half of hash to 1. + */ + hash |= (1ULL << 32); + } else if (sk->__sk_common.skc_state == TCP_CLOSE) { + hash = bpf_skb_get_hash(skb) & q.orphan_mask; + hash |= (1ULL << 32); + } else { + hash = sk->__sk_common.skc_hash; + } + *sflow = bpf_map_lookup_elem(&fq_nonprio_flows, &hash); + } + + if (!*sflow) + ret = fq_new_flow(&fq_nonprio_flows, sflow, hash) < 0 ? + CLS_RET_ERR : CLS_RET_NONPRIO; + + return ret; +} + +static bool fq_packet_beyond_horizon(struct sk_buff *skb) +{ + return (s64)skb->tstamp > (s64)(q.ktime_cache + q.horizon); +} + +SEC("struct_ops/bpf_fq_enqueue") +int BPF_PROG(bpf_fq_enqueue, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct fq_flow_node *flow = NULL, *flow_copy; + struct fq_stashed_flow *sflow; + u64 time_to_send, jiffies; + struct skb_node *skbn; + int ret; + + if (sch->q.qlen >= sch->limit) + goto drop; + + if (!skb->tstamp) { + time_to_send = q.ktime_cache = bpf_ktime_get_ns(); + } else { + if (fq_packet_beyond_horizon(skb)) { + q.ktime_cache = bpf_ktime_get_ns(); + if (fq_packet_beyond_horizon(skb)) { + if (q.horizon_drop) + goto drop; + + skb->tstamp = q.ktime_cache + q.horizon; + } + } + time_to_send = skb->tstamp; + } + + ret = fq_classify(skb, &sflow); + if (ret == CLS_RET_ERR) + goto drop; + + flow = bpf_kptr_xchg(&sflow->flow, flow); + if (!flow) + goto drop; + + if (ret == CLS_RET_NONPRIO) { + if (flow->qlen >= q.flow_plimit) { + bpf_kptr_xchg_back(&sflow->flow, flow); + goto drop; + } + + if (fq_flow_is_detached(flow)) { + flow_copy = bpf_refcount_acquire(flow); + + jiffies = bpf_jiffies64(); + if ((s64)(jiffies - (flow_copy->age + q.flow_refill_delay)) > 0) { + if (flow_copy->credit < q.quantum) + flow_copy->credit = q.quantum; + } + flow_copy->age = 0; + fq_flows_add_tail(&fq_new_flows, &fq_new_flows_lock, flow_copy, + &q.new_flow_cnt); + } + } + + skbn = bpf_obj_new(typeof(*skbn)); + if (!skbn) { + bpf_kptr_xchg_back(&sflow->flow, flow); + goto drop; + } + + skbn->tstamp = skb->tstamp = time_to_send; + + sch->qstats.backlog += qdisc_pkt_len(skb); + + skb = bpf_kptr_xchg(&skbn->skb, skb); + if (skb) + bpf_qdisc_skb_drop(skb, to_free); + + bpf_spin_lock(&flow->lock); + bpf_rbtree_add(&flow->queue, &skbn->node, skbn_tstamp_less); + bpf_spin_unlock(&flow->lock); + + flow->qlen++; + bpf_kptr_xchg_back(&sflow->flow, flow); + + sch->q.qlen++; + return NET_XMIT_SUCCESS; + +drop: + bpf_qdisc_skb_drop(skb, to_free); + sch->qstats.drops++; + return NET_XMIT_DROP; +} + +static int fq_unset_throttled_flows(u32 index, struct unset_throttled_flows_ctx *ctx) +{ + struct bpf_rb_node *node = NULL; + struct fq_flow_node *flow; + + bpf_spin_lock(&fq_delayed_lock); + + node = bpf_rbtree_first(&fq_delayed); + if (!node) { + bpf_spin_unlock(&fq_delayed_lock); + return 1; + } + + flow = container_of(node, struct fq_flow_node, rb_node); + if (!ctx->unset_all && flow->time_next_packet > ctx->now) { + q.time_next_delayed_flow = flow->time_next_packet; + bpf_spin_unlock(&fq_delayed_lock); + return 1; + } + + node = bpf_rbtree_remove(&fq_delayed, &flow->rb_node); + + bpf_spin_unlock(&fq_delayed_lock); + + if (!node) + return 1; + + flow = container_of(node, struct fq_flow_node, rb_node); + flow->age = 0; + fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); + + return 0; +} + +static void fq_flow_set_throttled(struct fq_flow_node *flow) +{ + flow->age = ~0ULL; + + if (q.time_next_delayed_flow > flow->time_next_packet) + q.time_next_delayed_flow = flow->time_next_packet; + + bpf_spin_lock(&fq_delayed_lock); + bpf_rbtree_add(&fq_delayed, &flow->rb_node, fn_time_next_packet_less); + bpf_spin_unlock(&fq_delayed_lock); +} + +static void fq_check_throttled(u64 now) +{ + struct unset_throttled_flows_ctx ctx = { + .unset_all = false, + .now = now, + }; + unsigned long sample; + + if (q.time_next_delayed_flow > now) + return; + + sample = (unsigned long)(now - q.time_next_delayed_flow); + q.unthrottle_latency_ns -= q.unthrottle_latency_ns >> 3; + q.unthrottle_latency_ns += sample >> 3; + + q.time_next_delayed_flow = ~0ULL; + bpf_loop(NUM_QUEUE, fq_unset_throttled_flows, &ctx, 0); +} + +static struct sk_buff* +fq_dequeue_nonprio_flows(u32 index, struct dequeue_nonprio_ctx *ctx) +{ + u64 time_next_packet, time_to_send; + struct bpf_rb_node *rb_node; + struct sk_buff *skb = NULL; + struct bpf_list_head *head; + struct bpf_list_node *node; + struct bpf_spin_lock *lock; + struct fq_flow_node *flow; + struct skb_node *skbn; + bool is_empty; + u32 *cnt; + + if (q.new_flow_cnt) { + head = &fq_new_flows; + lock = &fq_new_flows_lock; + cnt = &q.new_flow_cnt; + } else if (q.old_flow_cnt) { + head = &fq_old_flows; + lock = &fq_old_flows_lock; + cnt = &q.old_flow_cnt; + } else { + if (q.time_next_delayed_flow != ~0ULL) + ctx->expire = q.time_next_delayed_flow; + goto break_loop; + } + + fq_flows_remove_front(head, lock, &node, cnt); + if (!node) + goto break_loop; + + flow = container_of(node, struct fq_flow_node, list_node); + if (flow->credit <= 0) { + flow->credit += q.quantum; + fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); + return NULL; + } + + bpf_spin_lock(&flow->lock); + rb_node = bpf_rbtree_first(&flow->queue); + if (!rb_node) { + bpf_spin_unlock(&flow->lock); + is_empty = fq_flows_is_empty(&fq_old_flows, &fq_old_flows_lock); + if (head == &fq_new_flows && !is_empty) { + fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); + } else { + fq_flow_set_detached(flow); + bpf_obj_drop(flow); + } + return NULL; + } + + skbn = container_of(rb_node, struct skb_node, node); + time_to_send = skbn->tstamp; + + time_next_packet = (time_to_send > flow->time_next_packet) ? + time_to_send : flow->time_next_packet; + if (ctx->now < time_next_packet) { + bpf_spin_unlock(&flow->lock); + flow->time_next_packet = time_next_packet; + fq_flow_set_throttled(flow); + return NULL; + } + + rb_node = bpf_rbtree_remove(&flow->queue, rb_node); + bpf_spin_unlock(&flow->lock); + + if (!rb_node) + goto add_flow_and_break; + + skbn = container_of(rb_node, struct skb_node, node); + skb = bpf_kptr_xchg(&skbn->skb, skb); + bpf_obj_drop(skbn); + + if (!skb) + goto add_flow_and_break; + + flow->credit -= qdisc_skb_cb(skb)->pkt_len; + flow->qlen--; + +add_flow_and_break: + fq_flows_add_head(head, lock, flow, cnt); + +break_loop: + ctx->stop_iter = true; + return skb; +} + +static struct sk_buff *fq_dequeue_prio(void) +{ + struct fq_flow_node *flow = NULL; + struct fq_stashed_flow *sflow; + struct bpf_rb_node *rb_node; + struct sk_buff *skb = NULL; + struct skb_node *skbn; + u64 hash = 0; + + sflow = bpf_map_lookup_elem(&fq_prio_flows, &hash); + if (!sflow) + return NULL; + + flow = bpf_kptr_xchg(&sflow->flow, flow); + if (!flow) + return NULL; + + bpf_spin_lock(&flow->lock); + rb_node = bpf_rbtree_first(&flow->queue); + if (!rb_node) { + bpf_spin_unlock(&flow->lock); + goto out; + } + + skbn = container_of(rb_node, struct skb_node, node); + rb_node = bpf_rbtree_remove(&flow->queue, &skbn->node); + bpf_spin_unlock(&flow->lock); + + if (!rb_node) + goto out; + + skbn = container_of(rb_node, struct skb_node, node); + skb = bpf_kptr_xchg(&skbn->skb, skb); + bpf_obj_drop(skbn); + +out: + bpf_kptr_xchg_back(&sflow->flow, flow); + + return skb; +} + +SEC("struct_ops/bpf_fq_dequeue") +struct sk_buff *BPF_PROG(bpf_fq_dequeue, struct Qdisc *sch) +{ + struct dequeue_nonprio_ctx cb_ctx = {}; + struct sk_buff *skb = NULL; + int i; + + if (!sch->q.qlen) + goto out; + + skb = fq_dequeue_prio(); + if (skb) + goto dequeue; + + q.ktime_cache = cb_ctx.now = bpf_ktime_get_ns(); + fq_check_throttled(q.ktime_cache); + bpf_for(i, 0, sch->limit) { + skb = fq_dequeue_nonprio_flows(i, &cb_ctx); + if (cb_ctx.stop_iter) + break; + }; + + if (skb) { +dequeue: + sch->q.qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + bpf_qdisc_bstats_update(sch, skb); + return skb; + } + + if (cb_ctx.expire) + bpf_qdisc_watchdog_schedule(sch, cb_ctx.expire, q.timer_slack); +out: + return NULL; +} + +static int fq_remove_flows_in_list(u32 index, void *ctx) +{ + struct bpf_list_node *node; + struct fq_flow_node *flow; + + bpf_spin_lock(&fq_new_flows_lock); + node = bpf_list_pop_front(&fq_new_flows); + bpf_spin_unlock(&fq_new_flows_lock); + if (!node) { + bpf_spin_lock(&fq_old_flows_lock); + node = bpf_list_pop_front(&fq_old_flows); + bpf_spin_unlock(&fq_old_flows_lock); + if (!node) + return 1; + } + + flow = container_of(node, struct fq_flow_node, list_node); + bpf_obj_drop(flow); + + return 0; +} + +extern unsigned CONFIG_HZ __kconfig; + +/* limit number of collected flows per round */ +#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*CONFIG_HZ) + +static bool fq_gc_candidate(struct fq_flow_node *flow) +{ + u64 jiffies = bpf_jiffies64(); + + return fq_flow_is_detached(flow) && + ((s64)(jiffies - (flow->age + FQ_GC_AGE)) > 0); +} + +static int +fq_remove_flows(struct bpf_map *flow_map, u64 *hash, + struct fq_stashed_flow *sflow, struct remove_flows_ctx *ctx) +{ + if (sflow->flow && + (!ctx->gc_only || fq_gc_candidate(sflow->flow))) { + bpf_map_delete_elem(flow_map, hash); + ctx->reset_cnt++; + } + + return ctx->reset_cnt < ctx->reset_max ? 0 : 1; +} + +static void fq_gc(void) +{ + struct remove_flows_ctx cb_ctx = { + .gc_only = true, + .reset_cnt = 0, + .reset_max = FQ_GC_MAX, + }; + + bpf_for_each_map_elem(&fq_nonprio_flows, fq_remove_flows, &cb_ctx, 0); +} + +SEC("struct_ops/bpf_fq_reset") +void BPF_PROG(bpf_fq_reset, struct Qdisc *sch) +{ + struct unset_throttled_flows_ctx utf_ctx = { + .unset_all = true, + }; + struct remove_flows_ctx rf_ctx = { + .gc_only = false, + .reset_cnt = 0, + .reset_max = NUM_QUEUE, + }; + struct fq_stashed_flow *sflow; + u64 hash = 0; + + sch->q.qlen = 0; + sch->qstats.backlog = 0; + + bpf_for_each_map_elem(&fq_nonprio_flows, fq_remove_flows, &rf_ctx, 0); + + rf_ctx.reset_cnt = 0; + bpf_for_each_map_elem(&fq_prio_flows, fq_remove_flows, &rf_ctx, 0); + fq_new_flow(&fq_prio_flows, &sflow, hash); + + bpf_loop(NUM_QUEUE, fq_remove_flows_in_list, NULL, 0); + q.new_flow_cnt = 0; + q.old_flow_cnt = 0; + + bpf_loop(NUM_QUEUE, fq_unset_throttled_flows, &utf_ctx, 0); +} + +SEC("struct_ops/bpf_fq_init") +int BPF_PROG(bpf_fq_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = sch->dev_queue->dev; + u32 psched_mtu = dev->mtu + dev->hard_header_len; + struct fq_stashed_flow *sflow; + u64 hash = 0; + + if (fq_new_flow(&fq_prio_flows, &sflow, hash) < 0) + return -ENOMEM; + + sch->limit = 10000; + q.initial_quantum = 10 * psched_mtu; + q.quantum = 2 * psched_mtu; + q.flow_refill_delay = 40; + q.flow_plimit = 100; + q.horizon = 10ULL * NSEC_PER_SEC; + q.horizon_drop = 1; + q.orphan_mask = 1024 - 1; + q.timer_slack = 10 * NSEC_PER_USEC; + q.time_next_delayed_flow = ~0ULL; + q.unthrottle_latency_ns = 0ULL; + q.new_flow_cnt = 0; + q.old_flow_cnt = 0; + + return 0; +} + +SEC(".struct_ops") +struct Qdisc_ops fq = { + .enqueue = (void *)bpf_fq_enqueue, + .dequeue = (void *)bpf_fq_dequeue, + .reset = (void *)bpf_fq_reset, + .init = (void *)bpf_fq_init, + .id = "bpf_fq", +}; diff --git a/tools/testing/selftests/bpf/progs/linked_list_peek.c b/tools/testing/selftests/bpf/progs/linked_list_peek.c new file mode 100644 index 0000000000000..a24ceb34939e3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/linked_list_peek.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "bpf_experimental.h" + +struct node_data { + struct bpf_list_node l; + int key; +}; + +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) +private(A) struct bpf_spin_lock glock; +private(A) struct bpf_list_head ghead __contains(node_data, l); + +#define list_entry(ptr, type, member) container_of(ptr, type, member) +#define NR_NODES 32 + +int zero = 0; + +SEC("syscall") +__retval(0) +long list_peek(void *ctx) +{ + struct bpf_list_node *l_n; + struct node_data *n; + int i, err = 0; + + bpf_spin_lock(&glock); + l_n = bpf_list_front(&ghead); + bpf_spin_unlock(&glock); + if (l_n) + return __LINE__; + + bpf_spin_lock(&glock); + l_n = bpf_list_back(&ghead); + bpf_spin_unlock(&glock); + if (l_n) + return __LINE__; + + for (i = zero; i < NR_NODES && can_loop; i++) { + n = bpf_obj_new(typeof(*n)); + if (!n) + return __LINE__; + n->key = i; + bpf_spin_lock(&glock); + bpf_list_push_back(&ghead, &n->l); + bpf_spin_unlock(&glock); + } + + bpf_spin_lock(&glock); + + l_n = bpf_list_front(&ghead); + if (!l_n) { + err = __LINE__; + goto done; + } + + n = list_entry(l_n, struct node_data, l); + if (n->key != 0) { + err = __LINE__; + goto done; + } + + l_n = bpf_list_back(&ghead); + if (!l_n) { + err = __LINE__; + goto done; + } + + n = list_entry(l_n, struct node_data, l); + if (n->key != NR_NODES - 1) { + err = __LINE__; + goto done; + } + +done: + bpf_spin_unlock(&glock); + return err; +} + +#define TEST_FB(op, dolock) \ +SEC("syscall") \ +__failure __msg(MSG) \ +long test_##op##_spinlock_##dolock(void *ctx) \ +{ \ + struct bpf_list_node *l_n; \ + __u64 jiffies = 0; \ + \ + if (dolock) \ + bpf_spin_lock(&glock); \ + l_n = bpf_list_##op(&ghead); \ + if (l_n) \ + jiffies = bpf_jiffies64(); \ + if (dolock) \ + bpf_spin_unlock(&glock); \ + \ + return !!jiffies; \ +} + +#define MSG "call bpf_list_{{(front|back).+}}; R0{{(_[rwD])?}}=ptr_or_null_node_data(id={{[0-9]+}},non_own_ref" +TEST_FB(front, true) +TEST_FB(back, true) +#undef MSG + +#define MSG "bpf_spin_lock at off=0 must be held for bpf_list_head" +TEST_FB(front, false) +TEST_FB(back, false) +#undef MSG + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index dbd5eee8e25e8..b2e24f018a3f9 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -69,11 +69,11 @@ long rbtree_api_nolock_first(void *ctx) } SEC("?tc") -__failure __msg("rbtree_remove node input must be non-owning ref") +__retval(0) long rbtree_api_remove_unadded_node(void *ctx) { struct node_data *n, *m; - struct bpf_rb_node *res; + struct bpf_rb_node *res_n, *res_m; n = bpf_obj_new(typeof(*n)); if (!n) @@ -89,18 +89,20 @@ long rbtree_api_remove_unadded_node(void *ctx) bpf_rbtree_add(&groot, &n->node, less); /* This remove should pass verifier */ - res = bpf_rbtree_remove(&groot, &n->node); - n = container_of(res, struct node_data, node); + res_n = bpf_rbtree_remove(&groot, &n->node); /* This remove shouldn't, m isn't in an rbtree */ - res = bpf_rbtree_remove(&groot, &m->node); - m = container_of(res, struct node_data, node); + res_m = bpf_rbtree_remove(&groot, &m->node); bpf_spin_unlock(&glock); - if (n) - bpf_obj_drop(n); - if (m) - bpf_obj_drop(m); + bpf_obj_drop(m); + if (res_n) + bpf_obj_drop(container_of(res_n, struct node_data, node)); + if (res_m) { + bpf_obj_drop(container_of(res_m, struct node_data, node)); + return 2; + } + return 0; } @@ -178,7 +180,7 @@ long rbtree_api_use_unchecked_remove_retval(void *ctx) } SEC("?tc") -__failure __msg("rbtree_remove node input must be non-owning ref") +__failure __msg("bpf_rbtree_remove can only take non-owning or refcounted bpf_rb_node pointer") long rbtree_api_add_release_unlock_escape(void *ctx) { struct node_data *n; @@ -202,7 +204,7 @@ long rbtree_api_add_release_unlock_escape(void *ctx) } SEC("?tc") -__failure __msg("rbtree_remove node input must be non-owning ref") +__failure __msg("bpf_rbtree_remove can only take non-owning or refcounted bpf_rb_node pointer") long rbtree_api_first_release_unlock_escape(void *ctx) { struct bpf_rb_node *res; diff --git a/tools/testing/selftests/bpf/progs/rbtree_search.c b/tools/testing/selftests/bpf/progs/rbtree_search.c new file mode 100644 index 0000000000000..e948c1b573d8d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/rbtree_search.c @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" +#include "bpf_experimental.h" + +struct node_data { + struct bpf_refcount ref; + struct bpf_rb_node r0; + struct bpf_rb_node r1; + int key0; + int key1; +}; + +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) +private(A) struct bpf_spin_lock glock0; +private(A) struct bpf_rb_root groot0 __contains(node_data, r0); + +private(B) struct bpf_spin_lock glock1; +private(B) struct bpf_rb_root groot1 __contains(node_data, r1); + +#define rb_entry(ptr, type, member) container_of(ptr, type, member) +#define NR_NODES 16 + +int zero = 0; + +static bool less0(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + struct node_data *node_a; + struct node_data *node_b; + + node_a = rb_entry(a, struct node_data, r0); + node_b = rb_entry(b, struct node_data, r0); + + return node_a->key0 < node_b->key0; +} + +static bool less1(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + struct node_data *node_a; + struct node_data *node_b; + + node_a = rb_entry(a, struct node_data, r1); + node_b = rb_entry(b, struct node_data, r1); + + return node_a->key1 < node_b->key1; +} + +SEC("syscall") +__retval(0) +long rbtree_search(void *ctx) +{ + struct bpf_rb_node *rb_n, *rb_m, *gc_ns[NR_NODES]; + long lookup_key = NR_NODES / 2; + struct node_data *n, *m; + int i, err, nr_gc = 0; + + for (i = zero; i < NR_NODES && can_loop; i++) { + n = bpf_obj_new(typeof(*n)); + if (!n) + return __LINE__; + + m = bpf_refcount_acquire(n); + + n->key0 = i; + m->key1 = i; + + bpf_spin_lock(&glock0); + err = bpf_rbtree_add(&groot0, &n->r0, less0); + bpf_spin_unlock(&glock0); + + bpf_spin_lock(&glock1); + err = bpf_rbtree_add(&groot1, &m->r1, less1); + bpf_spin_unlock(&glock1); + + if (err) + return __LINE__; + } + + n = NULL; + bpf_spin_lock(&glock0); + rb_n = bpf_rbtree_root(&groot0); + while (can_loop) { + if (!rb_n) { + bpf_spin_unlock(&glock0); + return __LINE__; + } + + n = rb_entry(rb_n, struct node_data, r0); + if (lookup_key == n->key0) + break; + if (nr_gc < NR_NODES) + gc_ns[nr_gc++] = rb_n; + if (lookup_key < n->key0) + rb_n = bpf_rbtree_left(&groot0, rb_n); + else + rb_n = bpf_rbtree_right(&groot0, rb_n); + } + + if (!n || lookup_key != n->key0) { + bpf_spin_unlock(&glock0); + return __LINE__; + } + + for (i = 0; i < nr_gc; i++) { + rb_n = gc_ns[i]; + gc_ns[i] = bpf_rbtree_remove(&groot0, rb_n); + } + + m = bpf_refcount_acquire(n); + bpf_spin_unlock(&glock0); + + for (i = 0; i < nr_gc; i++) { + rb_n = gc_ns[i]; + if (rb_n) { + n = rb_entry(rb_n, struct node_data, r0); + bpf_obj_drop(n); + } + } + + if (!m) + return __LINE__; + + bpf_spin_lock(&glock1); + rb_m = bpf_rbtree_remove(&groot1, &m->r1); + bpf_spin_unlock(&glock1); + bpf_obj_drop(m); + if (!rb_m) + return __LINE__; + bpf_obj_drop(rb_entry(rb_m, struct node_data, r1)); + + return 0; +} + +#define TEST_ROOT(dolock) \ +SEC("tc") \ +__failure __msg(MSG) \ +long test_root_spinlock_##dolock(void *ctx) \ +{ \ + struct bpf_rb_node *rb_n; \ + long *cpu_ptr = NULL; \ + \ + if (dolock) \ + bpf_spin_lock(&glock0); \ + rb_n = bpf_rbtree_root(&groot0); \ + if (rb_n) \ + cpu_ptr = bpf_this_cpu_ptr(rb_n); \ + if (dolock) \ + bpf_spin_unlock(&glock0); \ + \ + return !!cpu_ptr; \ +} + +#define TEST_LR(op, dolock) \ +SEC("tc") \ +__failure __msg(MSG) \ +long test_##op##_spinlock_##dolock(void *ctx) \ +{ \ + struct node_data *n = NULL; \ + struct bpf_rb_node *rb_n; \ + __u64 jiffies = 0; \ + \ + bpf_spin_lock(&glock0); \ + rb_n = bpf_rbtree_root(&groot0); \ + if (!rb_n) \ + goto unlock; \ + n = rb_entry(rb_n, struct node_data, r0); \ + n = bpf_refcount_acquire(n); \ + if (!n) \ + goto unlock; \ + bpf_spin_unlock(&glock0); \ + \ + if (dolock) \ + bpf_spin_lock(&glock0); \ + rb_n = bpf_rbtree_##op(&groot0, &n->r0); \ + if (rb_n) \ + jiffies = bpf_jiffies64(); \ + if (dolock) \ +unlock: \ + bpf_spin_unlock(&glock0); \ + \ + return !!jiffies; \ +} + +/* + * Use a spearate MSG macro instead of passing to TEST_XXX(..., MSG) + * to ensure the message itself is not in the bpf prog lineinfo + * which the verifier includes in its log. + * Otherwise, the test_loader will incorrectly match the prog lineinfo + * instead of the log generated by the verifier. + */ +#define MSG "call bpf_rbtree_root{{.+}}; R0{{(_[rwD])?}}=rcu_ptr_or_null_node_data(id={{[0-9]+}},non_own_ref" +TEST_ROOT(true) +#undef MSG +#define MSG "call bpf_rbtree_{{(left|right).+}}; R0{{(_[rwD])?}}=rcu_ptr_or_null_node_data(id={{[0-9]+}},non_own_ref" +TEST_LR(left, true) +TEST_LR(right, true) +#undef MSG + +#define MSG "bpf_spin_lock at off=0 must be held for bpf_rb_root" +TEST_ROOT(false) +TEST_LR(left, false) +TEST_LR(right, false) +#undef MSG + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c index ccde6a4c63196..683306db8594e 100644 --- a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c +++ b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include "xsk_xdp_common.h" struct { @@ -14,6 +16,7 @@ struct { } xsk SEC(".maps"); static unsigned int idx; +int adjust_value = 0; int count = 0; SEC("xdp.frags") int xsk_def_prog(struct xdp_md *xdp) @@ -70,4 +73,51 @@ SEC("xdp") int xsk_xdp_shared_umem(struct xdp_md *xdp) return bpf_redirect_map(&xsk, idx, XDP_DROP); } +SEC("xdp.frags") int xsk_xdp_adjust_tail(struct xdp_md *xdp) +{ + __u32 buff_len, curr_buff_len; + int ret; + + buff_len = bpf_xdp_get_buff_len(xdp); + if (buff_len == 0) + return XDP_DROP; + + ret = bpf_xdp_adjust_tail(xdp, adjust_value); + if (ret < 0) { + /* Handle unsupported cases */ + if (ret == -EOPNOTSUPP) { + /* Set adjust_value to -EOPNOTSUPP to indicate to userspace that this case + * is unsupported + */ + adjust_value = -EOPNOTSUPP; + return bpf_redirect_map(&xsk, 0, XDP_DROP); + } + + return XDP_DROP; + } + + curr_buff_len = bpf_xdp_get_buff_len(xdp); + if (curr_buff_len != buff_len + adjust_value) + return XDP_DROP; + + if (curr_buff_len > buff_len) { + __u32 *pkt_data = (void *)(long)xdp->data; + __u32 len, words_to_end, seq_num; + + len = curr_buff_len - PKT_HDR_ALIGN; + words_to_end = len / sizeof(*pkt_data) - 1; + seq_num = words_to_end; + + /* Convert sequence number to network byte order. Store this in the last 4 bytes of + * the packet. Use 'adjust_value' to determine the position at the end of the + * packet for storing the sequence number. + */ + seq_num = __constant_htonl(words_to_end); + bpf_xdp_store_bytes(xdp, curr_buff_len - sizeof(seq_num), &seq_num, + sizeof(seq_num)); + } + + return bpf_redirect_map(&xsk, 0, XDP_DROP); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/xsk_xdp_common.h b/tools/testing/selftests/bpf/xsk_xdp_common.h index 5a6f36f07383c..45810ff552dae 100644 --- a/tools/testing/selftests/bpf/xsk_xdp_common.h +++ b/tools/testing/selftests/bpf/xsk_xdp_common.h @@ -4,6 +4,7 @@ #define XSK_XDP_COMMON_H_ #define MAX_SOCKETS 2 +#define PKT_HDR_ALIGN (sizeof(struct ethhdr) + 2) /* Just to align the data in the packet */ struct xdp_info { __u64 count; diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 11f047b8af752..0ced4026ee443 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -524,6 +524,8 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, test->nb_sockets = 1; test->fail = false; test->set_ring = false; + test->adjust_tail = false; + test->adjust_tail_support = false; test->mtu = MAX_ETH_PKT_SIZE; test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; @@ -757,14 +759,15 @@ static struct pkt_stream *pkt_stream_clone(struct pkt_stream *pkt_stream) return pkt_stream_generate(pkt_stream->nb_pkts, pkt_stream->pkts[0].len); } -static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) +static void pkt_stream_replace_ifobject(struct ifobject *ifobj, u32 nb_pkts, u32 pkt_len) { - struct pkt_stream *pkt_stream; + ifobj->xsk->pkt_stream = pkt_stream_generate(nb_pkts, pkt_len); +} - pkt_stream = pkt_stream_generate(nb_pkts, pkt_len); - test->ifobj_tx->xsk->pkt_stream = pkt_stream; - pkt_stream = pkt_stream_generate(nb_pkts, pkt_len); - test->ifobj_rx->xsk->pkt_stream = pkt_stream; +static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) +{ + pkt_stream_replace_ifobject(test->ifobj_tx, nb_pkts, pkt_len); + pkt_stream_replace_ifobject(test->ifobj_rx, nb_pkts, pkt_len); } static void __pkt_stream_replace_half(struct ifobject *ifobj, u32 pkt_len, @@ -991,6 +994,31 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) return true; } +static bool is_adjust_tail_supported(struct xsk_xdp_progs *skel_rx) +{ + struct bpf_map *data_map; + int adjust_value = 0; + int key = 0; + int ret; + + data_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss"); + if (!data_map || !bpf_map__is_internal(data_map)) { + ksft_print_msg("Error: could not find bss section of XDP program\n"); + exit_with_error(errno); + } + + ret = bpf_map_lookup_elem(bpf_map__fd(data_map), &key, &adjust_value); + if (ret) { + ksft_print_msg("Error: bpf_map_lookup_elem failed with error %d\n", ret); + exit_with_error(errno); + } + + /* Set the 'adjust_value' variable to -EOPNOTSUPP in the XDP program if the adjust_tail + * helper is not supported. Skip the adjust_tail test case in this scenario. + */ + return adjust_value != -EOPNOTSUPP; +} + static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 expected_pkt_nb, u32 bytes_processed) { @@ -1767,8 +1795,13 @@ static void *worker_testapp_validate_rx(void *arg) if (!err && ifobject->validation_func) err = ifobject->validation_func(ifobject); - if (err) - report_failure(test); + + if (err) { + if (test->adjust_tail && !is_adjust_tail_supported(ifobject->xdp_progs)) + test->adjust_tail_support = false; + else + report_failure(test); + } pthread_exit(NULL); } @@ -2515,6 +2548,71 @@ static int testapp_hw_sw_max_ring_size(struct test_spec *test) return testapp_validate_traffic(test); } +static int testapp_xdp_adjust_tail(struct test_spec *test, int adjust_value) +{ + struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; + struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; + + test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_adjust_tail, + skel_tx->progs.xsk_xdp_adjust_tail, + skel_rx->maps.xsk, skel_tx->maps.xsk); + + skel_rx->bss->adjust_value = adjust_value; + + return testapp_validate_traffic(test); +} + +static int testapp_adjust_tail(struct test_spec *test, u32 value, u32 pkt_len) +{ + int ret; + + test->adjust_tail_support = true; + test->adjust_tail = true; + test->total_steps = 1; + + pkt_stream_replace_ifobject(test->ifobj_tx, DEFAULT_BATCH_SIZE, pkt_len); + pkt_stream_replace_ifobject(test->ifobj_rx, DEFAULT_BATCH_SIZE, pkt_len + value); + + ret = testapp_xdp_adjust_tail(test, value); + if (ret) + return ret; + + if (!test->adjust_tail_support) { + ksft_test_result_skip("%s %sResize pkt with bpf_xdp_adjust_tail() not supported\n", + mode_string(test), busy_poll_string(test)); + return TEST_SKIP; + } + + return 0; +} + +static int testapp_adjust_tail_shrink(struct test_spec *test) +{ + /* Shrink by 4 bytes for testing purpose */ + return testapp_adjust_tail(test, -4, MIN_PKT_SIZE * 2); +} + +static int testapp_adjust_tail_shrink_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + /* Shrink by the frag size */ + return testapp_adjust_tail(test, -XSK_UMEM__MAX_FRAME_SIZE, XSK_UMEM__LARGE_FRAME_SIZE * 2); +} + +static int testapp_adjust_tail_grow(struct test_spec *test) +{ + /* Grow by 4 bytes for testing purpose */ + return testapp_adjust_tail(test, 4, MIN_PKT_SIZE * 2); +} + +static int testapp_adjust_tail_grow_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + /* Grow by (frag_size - last_frag_Size) - 1 to stay inside the last fragment */ + return testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1, + XSK_UMEM__LARGE_FRAME_SIZE * 2); +} + static void run_pkt_test(struct test_spec *test) { int ret; @@ -2621,6 +2719,10 @@ static const struct test_spec tests[] = { {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, + {.name = "XDP_ADJUST_TAIL_SHRINK", .test_func = testapp_adjust_tail_shrink}, + {.name = "XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF", .test_func = testapp_adjust_tail_shrink_mb}, + {.name = "XDP_ADJUST_TAIL_GROW", .test_func = testapp_adjust_tail_grow}, + {.name = "XDP_ADJUST_TAIL_GROW_MULTI_BUFF", .test_func = testapp_adjust_tail_grow_mb}, }; static void print_tests(void) diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index e46e823f6a1aa..67fc44b2813b2 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ b/tools/testing/selftests/bpf/xskxceiver.h @@ -173,6 +173,8 @@ struct test_spec { u16 nb_sockets; bool fail; bool set_ring; + bool adjust_tail; + bool adjust_tail_support; enum test_mode mode; char name[MAX_TEST_NAME_SIZE]; }; diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 130d532b7e67a..3cfef51538230 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -33,7 +33,6 @@ CONFIG_NETFILTER_ADVANCED=y CONFIG_NF_CONNTRACK=m CONFIG_IPV6_MROUTE=y CONFIG_IPV6_SIT=y -CONFIG_IP_DCCP=m CONFIG_NF_NAT=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP_NF_IPTABLES=m diff --git a/tools/testing/selftests/net/forwarding/bridge_igmp.sh b/tools/testing/selftests/net/forwarding/bridge_igmp.sh index e6a3e04fd83f3..d4e7dd6593548 100755 --- a/tools/testing/selftests/net/forwarding/bridge_igmp.sh +++ b/tools/testing/selftests/net/forwarding/bridge_igmp.sh @@ -1,10 +1,24 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="v2reportleave_test v3include_test v3inc_allow_test v3inc_is_include_test \ - v3inc_is_exclude_test v3inc_to_exclude_test v3exc_allow_test v3exc_is_include_test \ - v3exc_is_exclude_test v3exc_to_exclude_test v3inc_block_test v3exc_block_test \ - v3exc_timeout_test v3star_ex_auto_add_test" +ALL_TESTS=" + v2reportleave_test + v3include_test + v3inc_allow_test + v3inc_is_include_test + v3inc_is_exclude_test + v3inc_to_exclude_test + v3exc_allow_test + v3exc_is_include_test + v3exc_is_exclude_test + v3exc_to_exclude_test + v3inc_block_test + v3exc_block_test + v3exc_timeout_test + v3star_ex_auto_add_test + v2per_vlan_snooping_port_stp_test + v2per_vlan_snooping_vlan_stp_test +" NUM_NETIFS=4 CHECK_TC="yes" TEST_GROUP="239.10.10.10" @@ -554,6 +568,64 @@ v3star_ex_auto_add_test() v3cleanup $swp2 $TEST_GROUP } +v2per_vlan_snooping_stp_test() +{ + local is_port=$1 + + local msg="port" + [[ $is_port -ne 1 ]] && msg="vlan" + + ip link set br0 up type bridge vlan_filtering 1 \ + mcast_igmp_version 2 \ + mcast_snooping 1 \ + mcast_vlan_snooping 1 \ + mcast_querier 1 \ + mcast_stats_enabled 1 + bridge vlan global set vid 1 dev br0 \ + mcast_snooping 1 \ + mcast_querier 1 \ + mcast_query_interval 100 \ + mcast_startup_query_count 0 + [[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 0 + [[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 4 + sleep 5 + local tx_s=$(ip -j -p stats show dev $swp1 \ + group xstats_slave subgroup bridge suite mcast \ + | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]') + + [[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 3 + [[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 3 + sleep 5 + local tx_e=$(ip -j -p stats show dev $swp1 \ + group xstats_slave subgroup bridge suite mcast \ + | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]') + + RET=0 + local tx=$(expr $tx_e - $tx_s) + test $tx -gt 0 + check_err $? "No IGMP queries after STP state becomes forwarding" + log_test "per vlan snooping with $msg stp state change" + + # restore settings + bridge vlan global set vid 1 dev br0 \ + mcast_querier 0 \ + mcast_query_interval 12500 \ + mcast_startup_query_count 2 + ip link set br0 up type bridge vlan_filtering 0 \ + mcast_vlan_snooping 0 \ + mcast_stats_enabled 0 +} + +v2per_vlan_snooping_port_stp_test() +{ + v2per_vlan_snooping_stp_test 1 +} + +v2per_vlan_snooping_vlan_stp_test() +{ + v2per_vlan_snooping_stp_test 0 +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/bridge_mld.sh b/tools/testing/selftests/net/forwarding/bridge_mld.sh index f84ab2e657547..4cacef5a813aa 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mld.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mld.sh @@ -1,10 +1,23 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="mldv2include_test mldv2inc_allow_test mldv2inc_is_include_test mldv2inc_is_exclude_test \ - mldv2inc_to_exclude_test mldv2exc_allow_test mldv2exc_is_include_test \ - mldv2exc_is_exclude_test mldv2exc_to_exclude_test mldv2inc_block_test \ - mldv2exc_block_test mldv2exc_timeout_test mldv2star_ex_auto_add_test" +ALL_TESTS=" + mldv2include_test + mldv2inc_allow_test + mldv2inc_is_include_test + mldv2inc_is_exclude_test + mldv2inc_to_exclude_test + mldv2exc_allow_test + mldv2exc_is_include_test + mldv2exc_is_exclude_test + mldv2exc_to_exclude_test + mldv2inc_block_test + mldv2exc_block_test + mldv2exc_timeout_test + mldv2star_ex_auto_add_test + mldv2per_vlan_snooping_port_stp_test + mldv2per_vlan_snooping_vlan_stp_test +" NUM_NETIFS=4 CHECK_TC="yes" TEST_GROUP="ff02::cc" @@ -554,6 +567,66 @@ mldv2star_ex_auto_add_test() mldv2cleanup $swp2 } +mldv2per_vlan_snooping_stp_test() +{ + local is_port=$1 + + local msg="port" + [[ $is_port -ne 1 ]] && msg="vlan" + + ip link set br0 up type bridge vlan_filtering 1 \ + mcast_mld_version 2 \ + mcast_snooping 1 \ + mcast_vlan_snooping 1 \ + mcast_querier 1 \ + mcast_stats_enabled 1 + bridge vlan global set vid 1 dev br0 \ + mcast_mld_version 2 \ + mcast_snooping 1 \ + mcast_querier 1 \ + mcast_query_interval 100 \ + mcast_startup_query_count 0 + + [[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 0 + [[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 4 + sleep 5 + local tx_s=$(ip -j -p stats show dev $swp1 \ + group xstats_slave subgroup bridge suite mcast \ + | jq '.[]["multicast"]["mld_queries"]["tx_v2"]') + [[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 3 + [[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 3 + sleep 5 + local tx_e=$(ip -j -p stats show dev $swp1 \ + group xstats_slave subgroup bridge suite mcast \ + | jq '.[]["multicast"]["mld_queries"]["tx_v2"]') + + RET=0 + local tx=$(expr $tx_e - $tx_s) + test $tx -gt 0 + check_err $? "No MLD queries after STP state becomes forwarding" + log_test "per vlan snooping with $msg stp state change" + + # restore settings + bridge vlan global set vid 1 dev br0 \ + mcast_querier 0 \ + mcast_query_interval 12500 \ + mcast_startup_query_count 2 \ + mcast_mld_version 1 + ip link set br0 up type bridge vlan_filtering 0 \ + mcast_vlan_snooping 0 \ + mcast_stats_enabled 0 +} + +mldv2per_vlan_snooping_port_stp_test() +{ + mldv2per_vlan_snooping_stp_test 1 +} + +mldv2per_vlan_snooping_vlan_stp_test() +{ + mldv2per_vlan_snooping_stp_test 0 +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/config b/tools/testing/selftests/net/forwarding/config index 8d7a1a004b7c3..18fd69d8d937d 100644 --- a/tools/testing/selftests/net/forwarding/config +++ b/tools/testing/selftests/net/forwarding/config @@ -1,6 +1,7 @@ CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_BRIDGE_IGMP_SNOOPING=y CONFIG_NET_L3_MASTER_DEV=y CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_NET_VRF=m diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py index 8986c584cb371..6329ae805abff 100644 --- a/tools/testing/selftests/net/lib/py/ynl.py +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -39,12 +39,12 @@ def __init__(self, recv_size=0): class RtnlFamily(YnlFamily): def __init__(self, recv_size=0): - super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), + super().__init__((SPEC_PATH / Path('rt-link.yaml')).as_posix(), schema='', recv_size=recv_size) class RtnlAddrFamily(YnlFamily): def __init__(self, recv_size=0): - super().__init__((SPEC_PATH / Path('rt_addr.yaml')).as_posix(), + super().__init__((SPEC_PATH / Path('rt-addr.yaml')).as_posix(), schema='', recv_size=recv_size) class NetdevFamily(YnlFamily): diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index c83a8b47bbdfa..ac1349c4b9e54 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -180,13 +180,26 @@ static void xgetnameinfo(const struct sockaddr *addr, socklen_t addrlen, } static void xgetaddrinfo(const char *node, const char *service, - const struct addrinfo *hints, + struct addrinfo *hints, struct addrinfo **res) { +again: int err = getaddrinfo(node, service, hints, res); if (err) { - const char *errstr = getxinfo_strerr(err); + const char *errstr; + + /* glibc starts to support MPTCP since v2.42. + * For older versions, use IPPROTO_TCP to resolve, + * and use TCP/MPTCP to create socket. + * Link: https://sourceware.org/git/?p=glibc.git;a=commit;h=a8e9022e0f82 + */ + if (err == EAI_SOCKTYPE) { + hints->ai_protocol = IPPROTO_TCP; + goto again; + } + + errstr = getxinfo_strerr(err); fprintf(stderr, "Fatal: getaddrinfo(%s:%s): %s\n", node ? node : "", service ? service : "", errstr); @@ -292,7 +305,7 @@ static int sock_listen_mptcp(const char * const listenaddr, { int sock = -1; struct addrinfo hints = { - .ai_protocol = IPPROTO_TCP, + .ai_protocol = IPPROTO_MPTCP, .ai_socktype = SOCK_STREAM, .ai_flags = AI_PASSIVE | AI_NUMERICHOST }; @@ -356,7 +369,7 @@ static int sock_connect_mptcp(const char * const remoteaddr, int infd, struct wstate *winfo) { struct addrinfo hints = { - .ai_protocol = IPPROTO_TCP, + .ai_protocol = IPPROTO_MPTCP, .ai_socktype = SOCK_STREAM, }; struct addrinfo *a, *addr; diff --git a/tools/testing/selftests/net/mptcp/mptcp_diag.c b/tools/testing/selftests/net/mptcp/mptcp_diag.c index 284286c524cfe..37d5015ad08c4 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_diag.c +++ b/tools/testing/selftests/net/mptcp/mptcp_diag.c @@ -185,9 +185,10 @@ static void parse_nlmsg(struct nlmsghdr *nlh) } } -static void recv_nlmsg(int fd, struct nlmsghdr *nlh) +static void recv_nlmsg(int fd) { char rcv_buff[8192]; + struct nlmsghdr *nlh = (struct nlmsghdr *)rcv_buff; struct sockaddr_nl rcv_nladdr = { .nl_family = AF_NETLINK }; @@ -204,7 +205,6 @@ static void recv_nlmsg(int fd, struct nlmsghdr *nlh) int len; len = recvmsg(fd, &rcv_msg, 0); - nlh = (struct nlmsghdr *)rcv_buff; while (NLMSG_OK(nlh, len)) { if (nlh->nlmsg_type == NLMSG_DONE) { @@ -225,7 +225,6 @@ static void recv_nlmsg(int fd, struct nlmsghdr *nlh) static void get_mptcpinfo(__u32 token) { - struct nlmsghdr *nlh = NULL; int fd; fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); @@ -233,7 +232,7 @@ static void get_mptcpinfo(__u32 token) die_perror("Netlink socket"); send_query(fd, token); - recv_nlmsg(fd, nlh); + recv_nlmsg(fd); close(fd); } diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index befa66f5a366b..b8af65373b3ad 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -62,6 +62,7 @@ unset sflags unset fastclose unset fullmesh unset speed +unset join_syn_rej unset join_csum_ns1 unset join_csum_ns2 unset join_fail_nr @@ -1403,6 +1404,7 @@ chk_join_nr() local syn_nr=$1 local syn_ack_nr=$2 local ack_nr=$3 + local syn_rej=${join_syn_rej:-0} local csum_ns1=${join_csum_ns1:-0} local csum_ns2=${join_csum_ns2:-0} local fail_nr=${join_fail_nr:-0} @@ -1468,6 +1470,15 @@ chk_join_nr() fail_test "got $count JOIN[s] ack HMAC failure expected 0" fi + count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinRejected") + if [ -z "$count" ]; then + rc=${KSFT_SKIP} + elif [ "$count" != "$syn_rej" ]; then + rc=${KSFT_FAIL} + print_check "syn rejected" + fail_test "got $count JOIN[s] syn rejected expected $syn_rej" + fi + print_results "join Rx" ${rc} join_syn_tx="${join_syn_tx:-${syn_nr}}" \ @@ -1963,7 +1974,8 @@ subflows_tests() pm_nl_set_limits $ns2 0 1 pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 0 + join_syn_rej=1 \ + chk_join_nr 1 1 0 fi # subflow @@ -1992,7 +2004,8 @@ subflows_tests() pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 2 2 1 + join_syn_rej=1 \ + chk_join_nr 2 2 1 fi # single subflow, dev @@ -3061,7 +3074,8 @@ syncookies_tests() pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 2 1 1 + join_syn_rej=1 \ + chk_join_nr 2 1 1 fi # test signal address with cookies @@ -3545,7 +3559,8 @@ userspace_tests() pm_nl_set_limits $ns2 1 1 pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 0 + join_syn_rej=1 \ + chk_join_nr 1 1 0 fi # userspace pm type does not send join @@ -3568,7 +3583,8 @@ userspace_tests() pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 1 1 0 + join_syn_rej=1 \ + chk_join_nr 1 1 0 chk_prio_nr 0 0 0 0 fi diff --git a/tools/testing/selftests/net/ovpn/.gitignore b/tools/testing/selftests/net/ovpn/.gitignore new file mode 100644 index 0000000000000..ee44c081ca7c0 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0+ +ovpn-cli diff --git a/tools/testing/selftests/net/ovpn/Makefile b/tools/testing/selftests/net/ovpn/Makefile new file mode 100644 index 0000000000000..2d102878cb6dd --- /dev/null +++ b/tools/testing/selftests/net/ovpn/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2020-2025 OpenVPN, Inc. +# +CFLAGS = -pedantic -Wextra -Wall -Wl,--no-as-needed -g -O0 -ggdb $(KHDR_INCLUDES) +VAR_CFLAGS = $(shell pkg-config --cflags libnl-3.0 libnl-genl-3.0 2>/dev/null) +ifeq ($(VAR_CFLAGS),) +VAR_CFLAGS = -I/usr/include/libnl3 +endif +CFLAGS += $(VAR_CFLAGS) + + +LDLIBS = -lmbedtls -lmbedcrypto +VAR_LDLIBS = $(shell pkg-config --libs libnl-3.0 libnl-genl-3.0 2>/dev/null) +ifeq ($(VAR_LDLIBS),) +VAR_LDLIBS = -lnl-genl-3 -lnl-3 +endif +LDLIBS += $(VAR_LDLIBS) + + +TEST_FILES = common.sh + +TEST_PROGS = test.sh \ + test-chachapoly.sh \ + test-tcp.sh \ + test-float.sh \ + test-close-socket.sh \ + test-close-socket-tcp.sh + +TEST_GEN_FILES := ovpn-cli + +include ../../lib.mk diff --git a/tools/testing/selftests/net/ovpn/common.sh b/tools/testing/selftests/net/ovpn/common.sh new file mode 100644 index 0000000000000..7502292a1ee03 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/common.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2020-2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +UDP_PEERS_FILE=${UDP_PEERS_FILE:-udp_peers.txt} +TCP_PEERS_FILE=${TCP_PEERS_FILE:-tcp_peers.txt} +OVPN_CLI=${OVPN_CLI:-./ovpn-cli} +ALG=${ALG:-aes} +PROTO=${PROTO:-UDP} +FLOAT=${FLOAT:-0} + +create_ns() { + ip netns add peer${1} +} + +setup_ns() { + MODE="P2P" + + if [ ${1} -eq 0 ]; then + MODE="MP" + for p in $(seq 1 ${NUM_PEERS}); do + ip link add veth${p} netns peer0 type veth peer name veth${p} netns peer${p} + + ip -n peer0 addr add 10.10.${p}.1/24 dev veth${p} + ip -n peer0 link set veth${p} up + + ip -n peer${p} addr add 10.10.${p}.2/24 dev veth${p} + ip -n peer${p} link set veth${p} up + done + fi + + ip netns exec peer${1} ${OVPN_CLI} new_iface tun${1} $MODE + ip -n peer${1} addr add ${2} dev tun${1} + ip -n peer${1} link set tun${1} up +} + +add_peer() { + if [ "${PROTO}" == "UDP" ]; then + if [ ${1} -eq 0 ]; then + ip netns exec peer0 ${OVPN_CLI} new_multi_peer tun0 1 ${UDP_PEERS_FILE} + + for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 1 0 ${ALG} 0 \ + data64.key + done + else + ip netns exec peer${1} ${OVPN_CLI} new_peer tun${1} ${1} 1 10.10.${1}.1 1 + ip netns exec peer${1} ${OVPN_CLI} new_key tun${1} ${1} 1 0 ${ALG} 1 \ + data64.key + fi + else + if [ ${1} -eq 0 ]; then + (ip netns exec peer0 ${OVPN_CLI} listen tun0 1 ${TCP_PEERS_FILE} && { + for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 1 0 \ + ${ALG} 0 data64.key + done + }) & + sleep 5 + else + ip netns exec peer${1} ${OVPN_CLI} connect tun${1} ${1} 10.10.${1}.1 1 \ + data64.key + fi + fi +} + +cleanup() { + # some ovpn-cli processes sleep in background so they need manual poking + killall $(basename ${OVPN_CLI}) 2>/dev/null || true + + # netns peer0 is deleted without erasing ifaces first + for p in $(seq 1 10); do + ip -n peer${p} link set tun${p} down 2>/dev/null || true + ip netns exec peer${p} ${OVPN_CLI} del_iface tun${p} 2>/dev/null || true + done + for p in $(seq 1 10); do + ip -n peer0 link del veth${p} 2>/dev/null || true + done + for p in $(seq 0 10); do + ip netns del peer${p} 2>/dev/null || true + done +} + +if [ "${PROTO}" == "UDP" ]; then + NUM_PEERS=${NUM_PEERS:-$(wc -l ${UDP_PEERS_FILE} | awk '{print $1}')} +else + NUM_PEERS=${NUM_PEERS:-$(wc -l ${TCP_PEERS_FILE} | awk '{print $1}')} +fi + + diff --git a/tools/testing/selftests/net/ovpn/config b/tools/testing/selftests/net/ovpn/config new file mode 100644 index 0000000000000..71946ba9fa175 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/config @@ -0,0 +1,10 @@ +CONFIG_NET=y +CONFIG_INET=y +CONFIG_STREAM_PARSER=y +CONFIG_NET_UDP_TUNNEL=y +CONFIG_DST_CACHE=y +CONFIG_CRYPTO=y +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=y +CONFIG_OVPN=m diff --git a/tools/testing/selftests/net/ovpn/data64.key b/tools/testing/selftests/net/ovpn/data64.key new file mode 100644 index 0000000000000..a99e88c4e290f --- /dev/null +++ b/tools/testing/selftests/net/ovpn/data64.key @@ -0,0 +1,5 @@ +jRqMACN7d7/aFQNT8S7jkrBD8uwrgHbG5OQZP2eu4R1Y7tfpS2bf5RHv06Vi163CGoaIiTX99R3B +ia9ycAH8Wz1+9PWv51dnBLur9jbShlgZ2QHLtUc4a/gfT7zZwULXuuxdLnvR21DDeMBaTbkgbai9 +uvAa7ne1liIgGFzbv+Bas4HDVrygxIxuAnP5Qgc3648IJkZ0QEXPF+O9f0n5+QIvGCxkAUVx+5K6 +KIs+SoeWXnAopELmoGSjUpFtJbagXK82HfdqpuUxT2Tnuef0/14SzVE/vNleBNu2ZbyrSAaah8tE +BofkPJUBFY+YQcfZNM5Dgrw3i+Bpmpq/gpdg5w== diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c new file mode 100644 index 0000000000000..69e41fc07fbca --- /dev/null +++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c @@ -0,0 +1,2376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel accelerator + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* defines to make checkpatch happy */ +#define strscpy strncpy +#define __always_unused __attribute__((__unused__)) + +/* libnl < 3.5.0 does not set the NLA_F_NESTED on its own, therefore we + * have to explicitly do it to prevent the kernel from failing upon + * parsing of the message + */ +#define nla_nest_start(_msg, _type) \ + nla_nest_start(_msg, (_type) | NLA_F_NESTED) + +/* libnl < 3.11.0 does not implement nla_get_uint() */ +uint64_t ovpn_nla_get_uint(struct nlattr *attr) +{ + if (nla_len(attr) == sizeof(uint32_t)) + return nla_get_u32(attr); + else + return nla_get_u64(attr); +} + +typedef int (*ovpn_nl_cb)(struct nl_msg *msg, void *arg); + +enum ovpn_key_direction { + KEY_DIR_IN = 0, + KEY_DIR_OUT, +}; + +#define KEY_LEN (256 / 8) +#define NONCE_LEN 8 + +#define PEER_ID_UNDEF 0x00FFFFFF +#define MAX_PEERS 10 + +struct nl_ctx { + struct nl_sock *nl_sock; + struct nl_msg *nl_msg; + struct nl_cb *nl_cb; + + int ovpn_dco_id; +}; + +enum ovpn_cmd { + CMD_INVALID, + CMD_NEW_IFACE, + CMD_DEL_IFACE, + CMD_LISTEN, + CMD_CONNECT, + CMD_NEW_PEER, + CMD_NEW_MULTI_PEER, + CMD_SET_PEER, + CMD_DEL_PEER, + CMD_GET_PEER, + CMD_NEW_KEY, + CMD_DEL_KEY, + CMD_GET_KEY, + CMD_SWAP_KEYS, + CMD_LISTEN_MCAST, +}; + +struct ovpn_ctx { + enum ovpn_cmd cmd; + + __u8 key_enc[KEY_LEN]; + __u8 key_dec[KEY_LEN]; + __u8 nonce[NONCE_LEN]; + + enum ovpn_cipher_alg cipher; + + sa_family_t sa_family; + + unsigned long peer_id; + unsigned long lport; + + union { + struct sockaddr_in in4; + struct sockaddr_in6 in6; + } remote; + + union { + struct sockaddr_in in4; + struct sockaddr_in6 in6; + } peer_ip; + + bool peer_ip_set; + + unsigned int ifindex; + char ifname[IFNAMSIZ]; + enum ovpn_mode mode; + bool mode_set; + + int socket; + int cli_sockets[MAX_PEERS]; + + __u32 keepalive_interval; + __u32 keepalive_timeout; + + enum ovpn_key_direction key_dir; + enum ovpn_key_slot key_slot; + int key_id; + + const char *peers_file; +}; + +static int ovpn_nl_recvmsgs(struct nl_ctx *ctx) +{ + int ret; + + ret = nl_recvmsgs(ctx->nl_sock, ctx->nl_cb); + + switch (ret) { + case -NLE_INTR: + fprintf(stderr, + "netlink received interrupt due to signal - ignoring\n"); + break; + case -NLE_NOMEM: + fprintf(stderr, "netlink out of memory error\n"); + break; + case -NLE_AGAIN: + fprintf(stderr, + "netlink reports blocking read - aborting wait\n"); + break; + default: + if (ret) + fprintf(stderr, "netlink reports error (%d): %s\n", + ret, nl_geterror(-ret)); + break; + } + + return ret; +} + +static struct nl_ctx *nl_ctx_alloc_flags(struct ovpn_ctx *ovpn, int cmd, + int flags) +{ + struct nl_ctx *ctx; + int err, ret; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return NULL; + + ctx->nl_sock = nl_socket_alloc(); + if (!ctx->nl_sock) { + fprintf(stderr, "cannot allocate netlink socket\n"); + goto err_free; + } + + nl_socket_set_buffer_size(ctx->nl_sock, 8192, 8192); + + ret = genl_connect(ctx->nl_sock); + if (ret) { + fprintf(stderr, "cannot connect to generic netlink: %s\n", + nl_geterror(ret)); + goto err_sock; + } + + /* enable Extended ACK for detailed error reporting */ + err = 1; + setsockopt(nl_socket_get_fd(ctx->nl_sock), SOL_NETLINK, NETLINK_EXT_ACK, + &err, sizeof(err)); + + ctx->ovpn_dco_id = genl_ctrl_resolve(ctx->nl_sock, OVPN_FAMILY_NAME); + if (ctx->ovpn_dco_id < 0) { + fprintf(stderr, "cannot find ovpn_dco netlink component: %d\n", + ctx->ovpn_dco_id); + goto err_free; + } + + ctx->nl_msg = nlmsg_alloc(); + if (!ctx->nl_msg) { + fprintf(stderr, "cannot allocate netlink message\n"); + goto err_sock; + } + + ctx->nl_cb = nl_cb_alloc(NL_CB_DEFAULT); + if (!ctx->nl_cb) { + fprintf(stderr, "failed to allocate netlink callback\n"); + goto err_msg; + } + + nl_socket_set_cb(ctx->nl_sock, ctx->nl_cb); + + genlmsg_put(ctx->nl_msg, 0, 0, ctx->ovpn_dco_id, 0, flags, cmd, 0); + + if (ovpn->ifindex > 0) + NLA_PUT_U32(ctx->nl_msg, OVPN_A_IFINDEX, ovpn->ifindex); + + return ctx; +nla_put_failure: +err_msg: + nlmsg_free(ctx->nl_msg); +err_sock: + nl_socket_free(ctx->nl_sock); +err_free: + free(ctx); + return NULL; +} + +static struct nl_ctx *nl_ctx_alloc(struct ovpn_ctx *ovpn, int cmd) +{ + return nl_ctx_alloc_flags(ovpn, cmd, 0); +} + +static void nl_ctx_free(struct nl_ctx *ctx) +{ + if (!ctx) + return; + + nl_socket_free(ctx->nl_sock); + nlmsg_free(ctx->nl_msg); + nl_cb_put(ctx->nl_cb); + free(ctx); +} + +static int ovpn_nl_cb_error(struct sockaddr_nl (*nla)__always_unused, + struct nlmsgerr *err, void *arg) +{ + struct nlmsghdr *nlh = (struct nlmsghdr *)err - 1; + struct nlattr *tb_msg[NLMSGERR_ATTR_MAX + 1]; + int len = nlh->nlmsg_len; + struct nlattr *attrs; + int *ret = arg; + int ack_len = sizeof(*nlh) + sizeof(int) + sizeof(*nlh); + + *ret = err->error; + + if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS)) + return NL_STOP; + + if (!(nlh->nlmsg_flags & NLM_F_CAPPED)) + ack_len += err->msg.nlmsg_len - sizeof(*nlh); + + if (len <= ack_len) + return NL_STOP; + + attrs = (void *)((uint8_t *)nlh + ack_len); + len -= ack_len; + + nla_parse(tb_msg, NLMSGERR_ATTR_MAX, attrs, len, NULL); + if (tb_msg[NLMSGERR_ATTR_MSG]) { + len = strnlen((char *)nla_data(tb_msg[NLMSGERR_ATTR_MSG]), + nla_len(tb_msg[NLMSGERR_ATTR_MSG])); + fprintf(stderr, "kernel error: %*s\n", len, + (char *)nla_data(tb_msg[NLMSGERR_ATTR_MSG])); + } + + if (tb_msg[NLMSGERR_ATTR_MISS_NEST]) { + fprintf(stderr, "missing required nesting type %u\n", + nla_get_u32(tb_msg[NLMSGERR_ATTR_MISS_NEST])); + } + + if (tb_msg[NLMSGERR_ATTR_MISS_TYPE]) { + fprintf(stderr, "missing required attribute type %u\n", + nla_get_u32(tb_msg[NLMSGERR_ATTR_MISS_TYPE])); + } + + return NL_STOP; +} + +static int ovpn_nl_cb_finish(struct nl_msg (*msg)__always_unused, + void *arg) +{ + int *status = arg; + + *status = 0; + return NL_SKIP; +} + +static int ovpn_nl_cb_ack(struct nl_msg (*msg)__always_unused, + void *arg) +{ + int *status = arg; + + *status = 0; + return NL_STOP; +} + +static int ovpn_nl_msg_send(struct nl_ctx *ctx, ovpn_nl_cb cb) +{ + int status = 1; + + nl_cb_err(ctx->nl_cb, NL_CB_CUSTOM, ovpn_nl_cb_error, &status); + nl_cb_set(ctx->nl_cb, NL_CB_FINISH, NL_CB_CUSTOM, ovpn_nl_cb_finish, + &status); + nl_cb_set(ctx->nl_cb, NL_CB_ACK, NL_CB_CUSTOM, ovpn_nl_cb_ack, &status); + + if (cb) + nl_cb_set(ctx->nl_cb, NL_CB_VALID, NL_CB_CUSTOM, cb, ctx); + + nl_send_auto_complete(ctx->nl_sock, ctx->nl_msg); + + while (status == 1) + ovpn_nl_recvmsgs(ctx); + + if (status < 0) + fprintf(stderr, "failed to send netlink message: %s (%d)\n", + strerror(-status), status); + + return status; +} + +static int ovpn_parse_key(const char *file, struct ovpn_ctx *ctx) +{ + int idx_enc, idx_dec, ret = -1; + unsigned char *ckey = NULL; + __u8 *bkey = NULL; + size_t olen = 0; + long ckey_len; + FILE *fp; + + fp = fopen(file, "r"); + if (!fp) { + fprintf(stderr, "cannot open: %s\n", file); + return -1; + } + + /* get file size */ + fseek(fp, 0L, SEEK_END); + ckey_len = ftell(fp); + rewind(fp); + + /* if the file is longer, let's just read a portion */ + if (ckey_len > 256) + ckey_len = 256; + + ckey = malloc(ckey_len); + if (!ckey) + goto err; + + ret = fread(ckey, 1, ckey_len, fp); + if (ret != ckey_len) { + fprintf(stderr, + "couldn't read enough data from key file: %dbytes read\n", + ret); + goto err; + } + + olen = 0; + ret = mbedtls_base64_decode(NULL, 0, &olen, ckey, ckey_len); + if (ret != MBEDTLS_ERR_BASE64_BUFFER_TOO_SMALL) { + char buf[256]; + + mbedtls_strerror(ret, buf, sizeof(buf)); + fprintf(stderr, "unexpected base64 error1: %s (%d)\n", buf, + ret); + + goto err; + } + + bkey = malloc(olen); + if (!bkey) { + fprintf(stderr, "cannot allocate binary key buffer\n"); + goto err; + } + + ret = mbedtls_base64_decode(bkey, olen, &olen, ckey, ckey_len); + if (ret) { + char buf[256]; + + mbedtls_strerror(ret, buf, sizeof(buf)); + fprintf(stderr, "unexpected base64 error2: %s (%d)\n", buf, + ret); + + goto err; + } + + if (olen < 2 * KEY_LEN + NONCE_LEN) { + fprintf(stderr, + "not enough data in key file, found %zdB but needs %dB\n", + olen, 2 * KEY_LEN + NONCE_LEN); + goto err; + } + + switch (ctx->key_dir) { + case KEY_DIR_IN: + idx_enc = 0; + idx_dec = 1; + break; + case KEY_DIR_OUT: + idx_enc = 1; + idx_dec = 0; + break; + default: + goto err; + } + + memcpy(ctx->key_enc, bkey + KEY_LEN * idx_enc, KEY_LEN); + memcpy(ctx->key_dec, bkey + KEY_LEN * idx_dec, KEY_LEN); + memcpy(ctx->nonce, bkey + 2 * KEY_LEN, NONCE_LEN); + + ret = 0; + +err: + fclose(fp); + free(bkey); + free(ckey); + + return ret; +} + +static int ovpn_parse_cipher(const char *cipher, struct ovpn_ctx *ctx) +{ + if (strcmp(cipher, "aes") == 0) + ctx->cipher = OVPN_CIPHER_ALG_AES_GCM; + else if (strcmp(cipher, "chachapoly") == 0) + ctx->cipher = OVPN_CIPHER_ALG_CHACHA20_POLY1305; + else if (strcmp(cipher, "none") == 0) + ctx->cipher = OVPN_CIPHER_ALG_NONE; + else + return -ENOTSUP; + + return 0; +} + +static int ovpn_parse_key_direction(const char *dir, struct ovpn_ctx *ctx) +{ + int in_dir; + + in_dir = strtoll(dir, NULL, 10); + switch (in_dir) { + case KEY_DIR_IN: + case KEY_DIR_OUT: + ctx->key_dir = in_dir; + break; + default: + fprintf(stderr, + "invalid key direction provided. Can be 0 or 1 only\n"); + return -1; + } + + return 0; +} + +static int ovpn_socket(struct ovpn_ctx *ctx, sa_family_t family, int proto) +{ + struct sockaddr_storage local_sock = { 0 }; + struct sockaddr_in6 *in6; + struct sockaddr_in *in; + int ret, s, sock_type; + size_t sock_len; + + if (proto == IPPROTO_UDP) + sock_type = SOCK_DGRAM; + else if (proto == IPPROTO_TCP) + sock_type = SOCK_STREAM; + else + return -EINVAL; + + s = socket(family, sock_type, 0); + if (s < 0) { + perror("cannot create socket"); + return -1; + } + + switch (family) { + case AF_INET: + in = (struct sockaddr_in *)&local_sock; + in->sin_family = family; + in->sin_port = htons(ctx->lport); + in->sin_addr.s_addr = htonl(INADDR_ANY); + sock_len = sizeof(*in); + break; + case AF_INET6: + in6 = (struct sockaddr_in6 *)&local_sock; + in6->sin6_family = family; + in6->sin6_port = htons(ctx->lport); + in6->sin6_addr = in6addr_any; + sock_len = sizeof(*in6); + break; + default: + return -1; + } + + int opt = 1; + + ret = setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + if (ret < 0) { + perror("setsockopt for SO_REUSEADDR"); + return ret; + } + + ret = setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)); + if (ret < 0) { + perror("setsockopt for SO_REUSEPORT"); + return ret; + } + + if (family == AF_INET6) { + opt = 0; + if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &opt, + sizeof(opt))) { + perror("failed to set IPV6_V6ONLY"); + return -1; + } + } + + ret = bind(s, (struct sockaddr *)&local_sock, sock_len); + if (ret < 0) { + perror("cannot bind socket"); + goto err_socket; + } + + ctx->socket = s; + ctx->sa_family = family; + return 0; + +err_socket: + close(s); + return -1; +} + +static int ovpn_udp_socket(struct ovpn_ctx *ctx, sa_family_t family) +{ + return ovpn_socket(ctx, family, IPPROTO_UDP); +} + +static int ovpn_listen(struct ovpn_ctx *ctx, sa_family_t family) +{ + int ret; + + ret = ovpn_socket(ctx, family, IPPROTO_TCP); + if (ret < 0) + return ret; + + ret = listen(ctx->socket, 10); + if (ret < 0) { + perror("listen"); + close(ctx->socket); + return -1; + } + + return 0; +} + +static int ovpn_accept(struct ovpn_ctx *ctx) +{ + socklen_t socklen; + int ret; + + socklen = sizeof(ctx->remote); + ret = accept(ctx->socket, (struct sockaddr *)&ctx->remote, &socklen); + if (ret < 0) { + perror("accept"); + goto err; + } + + fprintf(stderr, "Connection received!\n"); + + switch (socklen) { + case sizeof(struct sockaddr_in): + case sizeof(struct sockaddr_in6): + break; + default: + fprintf(stderr, "error: expecting IPv4 or IPv6 connection\n"); + close(ret); + ret = -EINVAL; + goto err; + } + + return ret; +err: + close(ctx->socket); + return ret; +} + +static int ovpn_connect(struct ovpn_ctx *ovpn) +{ + socklen_t socklen; + int s, ret; + + s = socket(ovpn->remote.in4.sin_family, SOCK_STREAM, 0); + if (s < 0) { + perror("cannot create socket"); + return -1; + } + + switch (ovpn->remote.in4.sin_family) { + case AF_INET: + socklen = sizeof(struct sockaddr_in); + break; + case AF_INET6: + socklen = sizeof(struct sockaddr_in6); + break; + default: + return -EOPNOTSUPP; + } + + ret = connect(s, (struct sockaddr *)&ovpn->remote, socklen); + if (ret < 0) { + perror("connect"); + goto err; + } + + fprintf(stderr, "connected\n"); + + ovpn->socket = s; + + return 0; +err: + close(s); + return ret; +} + +static int ovpn_new_peer(struct ovpn_ctx *ovpn, bool is_tcp) +{ + struct nlattr *attr; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_NEW); + if (!ctx) + return -ENOMEM; + + attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_SOCKET, ovpn->socket); + + if (!is_tcp) { + switch (ovpn->remote.in4.sin_family) { + case AF_INET: + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_REMOTE_IPV4, + ovpn->remote.in4.sin_addr.s_addr); + NLA_PUT_U16(ctx->nl_msg, OVPN_A_PEER_REMOTE_PORT, + ovpn->remote.in4.sin_port); + break; + case AF_INET6: + NLA_PUT(ctx->nl_msg, OVPN_A_PEER_REMOTE_IPV6, + sizeof(ovpn->remote.in6.sin6_addr), + &ovpn->remote.in6.sin6_addr); + NLA_PUT_U32(ctx->nl_msg, + OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, + ovpn->remote.in6.sin6_scope_id); + NLA_PUT_U16(ctx->nl_msg, OVPN_A_PEER_REMOTE_PORT, + ovpn->remote.in6.sin6_port); + break; + default: + fprintf(stderr, + "Invalid family for remote socket address\n"); + goto nla_put_failure; + } + } + + if (ovpn->peer_ip_set) { + switch (ovpn->peer_ip.in4.sin_family) { + case AF_INET: + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_VPN_IPV4, + ovpn->peer_ip.in4.sin_addr.s_addr); + break; + case AF_INET6: + NLA_PUT(ctx->nl_msg, OVPN_A_PEER_VPN_IPV6, + sizeof(struct in6_addr), + &ovpn->peer_ip.in6.sin6_addr); + break; + default: + fprintf(stderr, "Invalid family for peer address\n"); + goto nla_put_failure; + } + } + + nla_nest_end(ctx->nl_msg, attr); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_set_peer(struct ovpn_ctx *ovpn) +{ + struct nlattr *attr; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_SET); + if (!ctx) + return -ENOMEM; + + attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_KEEPALIVE_INTERVAL, + ovpn->keepalive_interval); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_KEEPALIVE_TIMEOUT, + ovpn->keepalive_timeout); + nla_nest_end(ctx->nl_msg, attr); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_del_peer(struct ovpn_ctx *ovpn) +{ + struct nlattr *attr; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_DEL); + if (!ctx) + return -ENOMEM; + + attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); + nla_nest_end(ctx->nl_msg, attr); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_handle_peer(struct nl_msg *msg, void (*arg)__always_unused) +{ + struct nlattr *pattrs[OVPN_A_PEER_MAX + 1]; + struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); + struct nlattr *attrs[OVPN_A_MAX + 1]; + __u16 rport = 0, lport = 0; + + nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL); + + if (!attrs[OVPN_A_PEER]) { + fprintf(stderr, "no packet content in netlink message\n"); + return NL_SKIP; + } + + nla_parse(pattrs, OVPN_A_PEER_MAX, nla_data(attrs[OVPN_A_PEER]), + nla_len(attrs[OVPN_A_PEER]), NULL); + + if (pattrs[OVPN_A_PEER_ID]) + fprintf(stderr, "* Peer %u\n", + nla_get_u32(pattrs[OVPN_A_PEER_ID])); + + if (pattrs[OVPN_A_PEER_SOCKET_NETNSID]) + fprintf(stderr, "\tsocket NetNS ID: %d\n", + nla_get_s32(pattrs[OVPN_A_PEER_SOCKET_NETNSID])); + + if (pattrs[OVPN_A_PEER_VPN_IPV4]) { + char buf[INET_ADDRSTRLEN]; + + inet_ntop(AF_INET, nla_data(pattrs[OVPN_A_PEER_VPN_IPV4]), + buf, sizeof(buf)); + fprintf(stderr, "\tVPN IPv4: %s\n", buf); + } + + if (pattrs[OVPN_A_PEER_VPN_IPV6]) { + char buf[INET6_ADDRSTRLEN]; + + inet_ntop(AF_INET6, nla_data(pattrs[OVPN_A_PEER_VPN_IPV6]), + buf, sizeof(buf)); + fprintf(stderr, "\tVPN IPv6: %s\n", buf); + } + + if (pattrs[OVPN_A_PEER_LOCAL_PORT]) + lport = ntohs(nla_get_u16(pattrs[OVPN_A_PEER_LOCAL_PORT])); + + if (pattrs[OVPN_A_PEER_REMOTE_PORT]) + rport = ntohs(nla_get_u16(pattrs[OVPN_A_PEER_REMOTE_PORT])); + + if (pattrs[OVPN_A_PEER_REMOTE_IPV6]) { + void *ip = pattrs[OVPN_A_PEER_REMOTE_IPV6]; + char buf[INET6_ADDRSTRLEN]; + int scope_id = -1; + + if (pattrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID]) { + void *p = pattrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID]; + + scope_id = nla_get_u32(p); + } + + inet_ntop(AF_INET6, nla_data(ip), buf, sizeof(buf)); + fprintf(stderr, "\tRemote: %s:%hu (scope-id: %u)\n", buf, rport, + scope_id); + + if (pattrs[OVPN_A_PEER_LOCAL_IPV6]) { + void *ip = pattrs[OVPN_A_PEER_LOCAL_IPV6]; + + inet_ntop(AF_INET6, nla_data(ip), buf, sizeof(buf)); + fprintf(stderr, "\tLocal: %s:%hu\n", buf, lport); + } + } + + if (pattrs[OVPN_A_PEER_REMOTE_IPV4]) { + void *ip = pattrs[OVPN_A_PEER_REMOTE_IPV4]; + char buf[INET_ADDRSTRLEN]; + + inet_ntop(AF_INET, nla_data(ip), buf, sizeof(buf)); + fprintf(stderr, "\tRemote: %s:%hu\n", buf, rport); + + if (pattrs[OVPN_A_PEER_LOCAL_IPV4]) { + void *p = pattrs[OVPN_A_PEER_LOCAL_IPV4]; + + inet_ntop(AF_INET, nla_data(p), buf, sizeof(buf)); + fprintf(stderr, "\tLocal: %s:%hu\n", buf, lport); + } + } + + if (pattrs[OVPN_A_PEER_KEEPALIVE_INTERVAL]) { + void *p = pattrs[OVPN_A_PEER_KEEPALIVE_INTERVAL]; + + fprintf(stderr, "\tKeepalive interval: %u sec\n", + nla_get_u32(p)); + } + + if (pattrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]) + fprintf(stderr, "\tKeepalive timeout: %u sec\n", + nla_get_u32(pattrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT])); + + if (pattrs[OVPN_A_PEER_VPN_RX_BYTES]) + fprintf(stderr, "\tVPN RX bytes: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_RX_BYTES])); + + if (pattrs[OVPN_A_PEER_VPN_TX_BYTES]) + fprintf(stderr, "\tVPN TX bytes: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_TX_BYTES])); + + if (pattrs[OVPN_A_PEER_VPN_RX_PACKETS]) + fprintf(stderr, "\tVPN RX packets: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_RX_PACKETS])); + + if (pattrs[OVPN_A_PEER_VPN_TX_PACKETS]) + fprintf(stderr, "\tVPN TX packets: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_TX_PACKETS])); + + if (pattrs[OVPN_A_PEER_LINK_RX_BYTES]) + fprintf(stderr, "\tLINK RX bytes: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_RX_BYTES])); + + if (pattrs[OVPN_A_PEER_LINK_TX_BYTES]) + fprintf(stderr, "\tLINK TX bytes: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_TX_BYTES])); + + if (pattrs[OVPN_A_PEER_LINK_RX_PACKETS]) + fprintf(stderr, "\tLINK RX packets: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_RX_PACKETS])); + + if (pattrs[OVPN_A_PEER_LINK_TX_PACKETS]) + fprintf(stderr, "\tLINK TX packets: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_TX_PACKETS])); + + return NL_SKIP; +} + +static int ovpn_get_peer(struct ovpn_ctx *ovpn) +{ + int flags = 0, ret = -1; + struct nlattr *attr; + struct nl_ctx *ctx; + + if (ovpn->peer_id == PEER_ID_UNDEF) + flags = NLM_F_DUMP; + + ctx = nl_ctx_alloc_flags(ovpn, OVPN_CMD_PEER_GET, flags); + if (!ctx) + return -ENOMEM; + + if (ovpn->peer_id != PEER_ID_UNDEF) { + attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); + nla_nest_end(ctx->nl_msg, attr); + } + + ret = ovpn_nl_msg_send(ctx, ovpn_handle_peer); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_new_key(struct ovpn_ctx *ovpn) +{ + struct nlattr *keyconf, *key_dir; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_NEW); + if (!ctx) + return -ENOMEM; + + keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_KEY_ID, ovpn->key_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_CIPHER_ALG, ovpn->cipher); + + key_dir = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF_ENCRYPT_DIR); + NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_CIPHER_KEY, KEY_LEN, ovpn->key_enc); + NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_NONCE_TAIL, NONCE_LEN, ovpn->nonce); + nla_nest_end(ctx->nl_msg, key_dir); + + key_dir = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF_DECRYPT_DIR); + NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_CIPHER_KEY, KEY_LEN, ovpn->key_dec); + NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_NONCE_TAIL, NONCE_LEN, ovpn->nonce); + nla_nest_end(ctx->nl_msg, key_dir); + + nla_nest_end(ctx->nl_msg, keyconf); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_del_key(struct ovpn_ctx *ovpn) +{ + struct nlattr *keyconf; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_DEL); + if (!ctx) + return -ENOMEM; + + keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot); + nla_nest_end(ctx->nl_msg, keyconf); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_handle_key(struct nl_msg *msg, void (*arg)__always_unused) +{ + struct nlattr *kattrs[OVPN_A_KEYCONF_MAX + 1]; + struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); + struct nlattr *attrs[OVPN_A_MAX + 1]; + + nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL); + + if (!attrs[OVPN_A_KEYCONF]) { + fprintf(stderr, "no packet content in netlink message\n"); + return NL_SKIP; + } + + nla_parse(kattrs, OVPN_A_KEYCONF_MAX, nla_data(attrs[OVPN_A_KEYCONF]), + nla_len(attrs[OVPN_A_KEYCONF]), NULL); + + if (kattrs[OVPN_A_KEYCONF_PEER_ID]) + fprintf(stderr, "* Peer %u\n", + nla_get_u32(kattrs[OVPN_A_KEYCONF_PEER_ID])); + if (kattrs[OVPN_A_KEYCONF_SLOT]) { + fprintf(stderr, "\t- Slot: "); + switch (nla_get_u32(kattrs[OVPN_A_KEYCONF_SLOT])) { + case OVPN_KEY_SLOT_PRIMARY: + fprintf(stderr, "primary\n"); + break; + case OVPN_KEY_SLOT_SECONDARY: + fprintf(stderr, "secondary\n"); + break; + default: + fprintf(stderr, "invalid (%u)\n", + nla_get_u32(kattrs[OVPN_A_KEYCONF_SLOT])); + break; + } + } + if (kattrs[OVPN_A_KEYCONF_KEY_ID]) + fprintf(stderr, "\t- Key ID: %u\n", + nla_get_u32(kattrs[OVPN_A_KEYCONF_KEY_ID])); + if (kattrs[OVPN_A_KEYCONF_CIPHER_ALG]) { + fprintf(stderr, "\t- Cipher: "); + switch (nla_get_u32(kattrs[OVPN_A_KEYCONF_CIPHER_ALG])) { + case OVPN_CIPHER_ALG_NONE: + fprintf(stderr, "none\n"); + break; + case OVPN_CIPHER_ALG_AES_GCM: + fprintf(stderr, "aes-gcm\n"); + break; + case OVPN_CIPHER_ALG_CHACHA20_POLY1305: + fprintf(stderr, "chacha20poly1305\n"); + break; + default: + fprintf(stderr, "invalid (%u)\n", + nla_get_u32(kattrs[OVPN_A_KEYCONF_CIPHER_ALG])); + break; + } + } + + return NL_SKIP; +} + +static int ovpn_get_key(struct ovpn_ctx *ovpn) +{ + struct nlattr *keyconf; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_GET); + if (!ctx) + return -ENOMEM; + + keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot); + nla_nest_end(ctx->nl_msg, keyconf); + + ret = ovpn_nl_msg_send(ctx, ovpn_handle_key); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_swap_keys(struct ovpn_ctx *ovpn) +{ + struct nl_ctx *ctx; + struct nlattr *kc; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_SWAP); + if (!ctx) + return -ENOMEM; + + kc = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); + nla_nest_end(ctx->nl_msg, kc); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +/* Helper function used to easily add attributes to a rtnl message */ +static int ovpn_addattr(struct nlmsghdr *n, int maxlen, int type, + const void *data, int alen) +{ + int len = RTA_LENGTH(alen); + struct rtattr *rta; + + if ((int)(NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len)) > maxlen) { + fprintf(stderr, "%s: rtnl: message exceeded bound of %d\n", + __func__, maxlen); + return -EMSGSIZE; + } + + rta = nlmsg_tail(n); + rta->rta_type = type; + rta->rta_len = len; + + if (!data) + memset(RTA_DATA(rta), 0, alen); + else + memcpy(RTA_DATA(rta), data, alen); + + n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len); + + return 0; +} + +static struct rtattr *ovpn_nest_start(struct nlmsghdr *msg, size_t max_size, + int attr) +{ + struct rtattr *nest = nlmsg_tail(msg); + + if (ovpn_addattr(msg, max_size, attr, NULL, 0) < 0) + return NULL; + + return nest; +} + +static void ovpn_nest_end(struct nlmsghdr *msg, struct rtattr *nest) +{ + nest->rta_len = (uint8_t *)nlmsg_tail(msg) - (uint8_t *)nest; +} + +#define RT_SNDBUF_SIZE (1024 * 2) +#define RT_RCVBUF_SIZE (1024 * 4) + +/* Open RTNL socket */ +static int ovpn_rt_socket(void) +{ + int sndbuf = RT_SNDBUF_SIZE, rcvbuf = RT_RCVBUF_SIZE, fd; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (fd < 0) { + fprintf(stderr, "%s: cannot open netlink socket\n", __func__); + return fd; + } + + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, + sizeof(sndbuf)) < 0) { + fprintf(stderr, "%s: SO_SNDBUF\n", __func__); + close(fd); + return -1; + } + + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, + sizeof(rcvbuf)) < 0) { + fprintf(stderr, "%s: SO_RCVBUF\n", __func__); + close(fd); + return -1; + } + + return fd; +} + +/* Bind socket to Netlink subsystem */ +static int ovpn_rt_bind(int fd, uint32_t groups) +{ + struct sockaddr_nl local = { 0 }; + socklen_t addr_len; + + local.nl_family = AF_NETLINK; + local.nl_groups = groups; + + if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) { + fprintf(stderr, "%s: cannot bind netlink socket: %d\n", + __func__, errno); + return -errno; + } + + addr_len = sizeof(local); + if (getsockname(fd, (struct sockaddr *)&local, &addr_len) < 0) { + fprintf(stderr, "%s: cannot getsockname: %d\n", __func__, + errno); + return -errno; + } + + if (addr_len != sizeof(local)) { + fprintf(stderr, "%s: wrong address length %d\n", __func__, + addr_len); + return -EINVAL; + } + + if (local.nl_family != AF_NETLINK) { + fprintf(stderr, "%s: wrong address family %d\n", __func__, + local.nl_family); + return -EINVAL; + } + + return 0; +} + +typedef int (*ovpn_parse_reply_cb)(struct nlmsghdr *msg, void *arg); + +/* Send Netlink message and run callback on reply (if specified) */ +static int ovpn_rt_send(struct nlmsghdr *payload, pid_t peer, + unsigned int groups, ovpn_parse_reply_cb cb, + void *arg_cb) +{ + int len, rem_len, fd, ret, rcv_len; + struct sockaddr_nl nladdr = { 0 }; + struct nlmsgerr *err; + struct nlmsghdr *h; + char buf[1024 * 16]; + struct iovec iov = { + .iov_base = payload, + .iov_len = payload->nlmsg_len, + }; + struct msghdr nlmsg = { + .msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + nladdr.nl_family = AF_NETLINK; + nladdr.nl_pid = peer; + nladdr.nl_groups = groups; + + payload->nlmsg_seq = time(NULL); + + /* no need to send reply */ + if (!cb) + payload->nlmsg_flags |= NLM_F_ACK; + + fd = ovpn_rt_socket(); + if (fd < 0) { + fprintf(stderr, "%s: can't open rtnl socket\n", __func__); + return -errno; + } + + ret = ovpn_rt_bind(fd, 0); + if (ret < 0) { + fprintf(stderr, "%s: can't bind rtnl socket\n", __func__); + ret = -errno; + goto out; + } + + ret = sendmsg(fd, &nlmsg, 0); + if (ret < 0) { + fprintf(stderr, "%s: rtnl: error on sendmsg()\n", __func__); + ret = -errno; + goto out; + } + + /* prepare buffer to store RTNL replies */ + memset(buf, 0, sizeof(buf)); + iov.iov_base = buf; + + while (1) { + /* + * iov_len is modified by recvmsg(), therefore has to be initialized before + * using it again + */ + iov.iov_len = sizeof(buf); + rcv_len = recvmsg(fd, &nlmsg, 0); + if (rcv_len < 0) { + if (errno == EINTR || errno == EAGAIN) { + fprintf(stderr, "%s: interrupted call\n", + __func__); + continue; + } + fprintf(stderr, "%s: rtnl: error on recvmsg()\n", + __func__); + ret = -errno; + goto out; + } + + if (rcv_len == 0) { + fprintf(stderr, + "%s: rtnl: socket reached unexpected EOF\n", + __func__); + ret = -EIO; + goto out; + } + + if (nlmsg.msg_namelen != sizeof(nladdr)) { + fprintf(stderr, + "%s: sender address length: %u (expected %zu)\n", + __func__, nlmsg.msg_namelen, sizeof(nladdr)); + ret = -EIO; + goto out; + } + + h = (struct nlmsghdr *)buf; + while (rcv_len >= (int)sizeof(*h)) { + len = h->nlmsg_len; + rem_len = len - sizeof(*h); + + if (rem_len < 0 || len > rcv_len) { + if (nlmsg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "%s: truncated message\n", + __func__); + ret = -EIO; + goto out; + } + fprintf(stderr, "%s: malformed message: len=%d\n", + __func__, len); + ret = -EIO; + goto out; + } + + if (h->nlmsg_type == NLMSG_DONE) { + ret = 0; + goto out; + } + + if (h->nlmsg_type == NLMSG_ERROR) { + err = (struct nlmsgerr *)NLMSG_DATA(h); + if (rem_len < (int)sizeof(struct nlmsgerr)) { + fprintf(stderr, "%s: ERROR truncated\n", + __func__); + ret = -EIO; + goto out; + } + + if (err->error) { + fprintf(stderr, "%s: (%d) %s\n", + __func__, err->error, + strerror(-err->error)); + ret = err->error; + goto out; + } + + ret = 0; + if (cb) { + int r = cb(h, arg_cb); + + if (r <= 0) + ret = r; + } + goto out; + } + + if (cb) { + int r = cb(h, arg_cb); + + if (r <= 0) { + ret = r; + goto out; + } + } else { + fprintf(stderr, "%s: RTNL: unexpected reply\n", + __func__); + } + + rcv_len -= NLMSG_ALIGN(len); + h = (struct nlmsghdr *)((uint8_t *)h + + NLMSG_ALIGN(len)); + } + + if (nlmsg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "%s: message truncated\n", __func__); + continue; + } + + if (rcv_len) { + fprintf(stderr, "%s: rtnl: %d not parsed bytes\n", + __func__, rcv_len); + ret = -1; + goto out; + } + } +out: + close(fd); + + return ret; +} + +struct ovpn_link_req { + struct nlmsghdr n; + struct ifinfomsg i; + char buf[256]; +}; + +static int ovpn_new_iface(struct ovpn_ctx *ovpn) +{ + struct rtattr *linkinfo, *data; + struct ovpn_link_req req = { 0 }; + int ret = -1; + + fprintf(stdout, "Creating interface %s with mode %u\n", ovpn->ifname, + ovpn->mode); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.i)); + req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + req.n.nlmsg_type = RTM_NEWLINK; + + if (ovpn_addattr(&req.n, sizeof(req), IFLA_IFNAME, ovpn->ifname, + strlen(ovpn->ifname) + 1) < 0) + goto err; + + linkinfo = ovpn_nest_start(&req.n, sizeof(req), IFLA_LINKINFO); + if (!linkinfo) + goto err; + + if (ovpn_addattr(&req.n, sizeof(req), IFLA_INFO_KIND, OVPN_FAMILY_NAME, + strlen(OVPN_FAMILY_NAME) + 1) < 0) + goto err; + + if (ovpn->mode_set) { + data = ovpn_nest_start(&req.n, sizeof(req), IFLA_INFO_DATA); + if (!data) + goto err; + + if (ovpn_addattr(&req.n, sizeof(req), IFLA_OVPN_MODE, + &ovpn->mode, sizeof(uint8_t)) < 0) + goto err; + + ovpn_nest_end(&req.n, data); + } + + ovpn_nest_end(&req.n, linkinfo); + + req.i.ifi_family = AF_PACKET; + + ret = ovpn_rt_send(&req.n, 0, 0, NULL, NULL); +err: + return ret; +} + +static int ovpn_del_iface(struct ovpn_ctx *ovpn) +{ + struct ovpn_link_req req = { 0 }; + + fprintf(stdout, "Deleting interface %s ifindex %u\n", ovpn->ifname, + ovpn->ifindex); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.i)); + req.n.nlmsg_flags = NLM_F_REQUEST; + req.n.nlmsg_type = RTM_DELLINK; + + req.i.ifi_family = AF_PACKET; + req.i.ifi_index = ovpn->ifindex; + + return ovpn_rt_send(&req.n, 0, 0, NULL, NULL); +} + +static int nl_seq_check(struct nl_msg (*msg)__always_unused, + void (*arg)__always_unused) +{ + return NL_OK; +} + +struct mcast_handler_args { + const char *group; + int id; +}; + +static int mcast_family_handler(struct nl_msg *msg, void *arg) +{ + struct mcast_handler_args *grp = arg; + struct nlattr *tb[CTRL_ATTR_MAX + 1]; + struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); + struct nlattr *mcgrp; + int rem_mcgrp; + + nla_parse(tb, CTRL_ATTR_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL); + + if (!tb[CTRL_ATTR_MCAST_GROUPS]) + return NL_SKIP; + + nla_for_each_nested(mcgrp, tb[CTRL_ATTR_MCAST_GROUPS], rem_mcgrp) { + struct nlattr *tb_mcgrp[CTRL_ATTR_MCAST_GRP_MAX + 1]; + + nla_parse(tb_mcgrp, CTRL_ATTR_MCAST_GRP_MAX, + nla_data(mcgrp), nla_len(mcgrp), NULL); + + if (!tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME] || + !tb_mcgrp[CTRL_ATTR_MCAST_GRP_ID]) + continue; + if (strncmp(nla_data(tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME]), + grp->group, nla_len(tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME]))) + continue; + grp->id = nla_get_u32(tb_mcgrp[CTRL_ATTR_MCAST_GRP_ID]); + break; + } + + return NL_SKIP; +} + +static int mcast_error_handler(struct sockaddr_nl (*nla)__always_unused, + struct nlmsgerr *err, void *arg) +{ + int *ret = arg; + + *ret = err->error; + return NL_STOP; +} + +static int mcast_ack_handler(struct nl_msg (*msg)__always_unused, void *arg) +{ + int *ret = arg; + + *ret = 0; + return NL_STOP; +} + +static int ovpn_handle_msg(struct nl_msg *msg, void *arg) +{ + struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); + struct nlattr *attrs[OVPN_A_MAX + 1]; + struct nlmsghdr *nlh = nlmsg_hdr(msg); + char ifname[IF_NAMESIZE]; + int *ret = arg; + __u32 ifindex; + + fprintf(stderr, "received message from ovpn-dco\n"); + + *ret = -1; + + if (!genlmsg_valid_hdr(nlh, 0)) { + fprintf(stderr, "invalid header\n"); + return NL_STOP; + } + + if (nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL)) { + fprintf(stderr, "received bogus data from ovpn-dco\n"); + return NL_STOP; + } + + if (!attrs[OVPN_A_IFINDEX]) { + fprintf(stderr, "no ifindex in this message\n"); + return NL_STOP; + } + + ifindex = nla_get_u32(attrs[OVPN_A_IFINDEX]); + if (!if_indextoname(ifindex, ifname)) { + fprintf(stderr, "cannot resolve ifname for ifindex: %u\n", + ifindex); + return NL_STOP; + } + + switch (gnlh->cmd) { + case OVPN_CMD_PEER_DEL_NTF: + fprintf(stdout, "received CMD_PEER_DEL_NTF\n"); + break; + case OVPN_CMD_KEY_SWAP_NTF: + fprintf(stdout, "received CMD_KEY_SWAP_NTF\n"); + break; + default: + fprintf(stderr, "received unknown command: %d\n", gnlh->cmd); + return NL_STOP; + } + + *ret = 0; + return NL_OK; +} + +static int ovpn_get_mcast_id(struct nl_sock *sock, const char *family, + const char *group) +{ + struct nl_msg *msg; + struct nl_cb *cb; + int ret, ctrlid; + struct mcast_handler_args grp = { + .group = group, + .id = -ENOENT, + }; + + msg = nlmsg_alloc(); + if (!msg) + return -ENOMEM; + + cb = nl_cb_alloc(NL_CB_DEFAULT); + if (!cb) { + ret = -ENOMEM; + goto out_fail_cb; + } + + ctrlid = genl_ctrl_resolve(sock, "nlctrl"); + + genlmsg_put(msg, 0, 0, ctrlid, 0, 0, CTRL_CMD_GETFAMILY, 0); + + ret = -ENOBUFS; + NLA_PUT_STRING(msg, CTRL_ATTR_FAMILY_NAME, family); + + ret = nl_send_auto_complete(sock, msg); + if (ret < 0) + goto nla_put_failure; + + ret = 1; + + nl_cb_err(cb, NL_CB_CUSTOM, mcast_error_handler, &ret); + nl_cb_set(cb, NL_CB_ACK, NL_CB_CUSTOM, mcast_ack_handler, &ret); + nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, mcast_family_handler, &grp); + + while (ret > 0) + nl_recvmsgs(sock, cb); + + if (ret == 0) + ret = grp.id; + nla_put_failure: + nl_cb_put(cb); + out_fail_cb: + nlmsg_free(msg); + return ret; +} + +static int ovpn_listen_mcast(void) +{ + struct nl_sock *sock; + struct nl_cb *cb; + int mcid, ret; + + sock = nl_socket_alloc(); + if (!sock) { + fprintf(stderr, "cannot allocate netlink socket\n"); + goto err_free; + } + + nl_socket_set_buffer_size(sock, 8192, 8192); + + ret = genl_connect(sock); + if (ret < 0) { + fprintf(stderr, "cannot connect to generic netlink: %s\n", + nl_geterror(ret)); + goto err_free; + } + + mcid = ovpn_get_mcast_id(sock, OVPN_FAMILY_NAME, OVPN_MCGRP_PEERS); + if (mcid < 0) { + fprintf(stderr, "cannot get mcast group: %s\n", + nl_geterror(mcid)); + goto err_free; + } + + ret = nl_socket_add_membership(sock, mcid); + if (ret) { + fprintf(stderr, "failed to join mcast group: %d\n", ret); + goto err_free; + } + + ret = 1; + cb = nl_cb_alloc(NL_CB_DEFAULT); + nl_cb_set(cb, NL_CB_SEQ_CHECK, NL_CB_CUSTOM, nl_seq_check, NULL); + nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, ovpn_handle_msg, &ret); + nl_cb_err(cb, NL_CB_CUSTOM, ovpn_nl_cb_error, &ret); + + while (ret == 1) { + int err = nl_recvmsgs(sock, cb); + + if (err < 0) { + fprintf(stderr, + "cannot receive netlink message: (%d) %s\n", + err, nl_geterror(-err)); + ret = -1; + break; + } + } + + nl_cb_put(cb); +err_free: + nl_socket_free(sock); + return ret; +} + +static void usage(const char *cmd) +{ + fprintf(stderr, + "Usage %s [arguments..]\n", + cmd); + fprintf(stderr, "where can be one of the following\n\n"); + + fprintf(stderr, "* new_iface [mode]: create new ovpn interface\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tmode:\n"); + fprintf(stderr, "\t\t- P2P for peer-to-peer mode (i.e. client)\n"); + fprintf(stderr, "\t\t- MP for multi-peer mode (i.e. server)\n"); + + fprintf(stderr, "* del_iface : delete ovpn interface\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + + fprintf(stderr, + "* listen [ipv6]: listen for incoming peer TCP connections\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tlport: TCP port to listen to\n"); + fprintf(stderr, + "\tpeers_file: file containing one peer per line: Line format:\n"); + fprintf(stderr, "\t\t \n"); + fprintf(stderr, + "\tipv6: whether the socket should listen to the IPv6 wildcard address\n"); + + fprintf(stderr, + "* connect [key_file]: start connecting peer of TCP-based VPN session\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the connecting peer\n"); + fprintf(stderr, "\traddr: peer IP address to connect to\n"); + fprintf(stderr, "\trport: peer TCP port to connect to\n"); + fprintf(stderr, + "\tkey_file: file containing the symmetric key for encryption\n"); + + fprintf(stderr, + "* new_peer [vpnaddr]: add new peer\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tlport: local UDP port to bind to\n"); + fprintf(stderr, + "\tpeer_id: peer ID to be used in data packets to/from this peer\n"); + fprintf(stderr, "\traddr: peer IP address\n"); + fprintf(stderr, "\trport: peer UDP port\n"); + fprintf(stderr, "\tvpnaddr: peer VPN IP\n"); + + fprintf(stderr, + "* new_multi_peer : add multiple peers as listed in the file\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tlport: local UDP port to bind to\n"); + fprintf(stderr, + "\tpeers_file: text file containing one peer per line. Line format:\n"); + fprintf(stderr, "\t\t \n"); + + fprintf(stderr, + "* set_peer : set peer attributes\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n"); + fprintf(stderr, + "\tkeepalive_interval: interval for sending ping messages\n"); + fprintf(stderr, + "\tkeepalive_timeout: time after which a peer is timed out\n"); + + fprintf(stderr, "* del_peer : delete peer\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the peer to delete\n"); + + fprintf(stderr, "* get_peer [peer_id]: retrieve peer(s) status\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, + "\tpeer_id: peer ID of the peer to query. All peers are returned if omitted\n"); + + fprintf(stderr, + "* new_key : set data channel key\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, + "\tpeer_id: peer ID of the peer to configure the key for\n"); + fprintf(stderr, "\tslot: either 1 (primary) or 2 (secondary)\n"); + fprintf(stderr, "\tkey_id: an ID from 0 to 7\n"); + fprintf(stderr, + "\tcipher: cipher to use, supported: aes (AES-GCM), chachapoly (CHACHA20POLY1305)\n"); + fprintf(stderr, + "\tkey_dir: key direction, must 0 on one host and 1 on the other\n"); + fprintf(stderr, "\tkey_file: file containing the pre-shared key\n"); + + fprintf(stderr, + "* del_key [slot]: erase existing data channel key\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n"); + fprintf(stderr, "\tslot: slot to erase. PRIMARY if omitted\n"); + + fprintf(stderr, + "* get_key : retrieve non sensible key data\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the peer to query\n"); + fprintf(stderr, "\tslot: either 1 (primary) or 2 (secondary)\n"); + + fprintf(stderr, + "* swap_keys : swap content of primary and secondary key slots\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n"); + + fprintf(stderr, + "* listen_mcast: listen to ovpn netlink multicast messages\n"); +} + +static int ovpn_parse_remote(struct ovpn_ctx *ovpn, const char *host, + const char *service, const char *vpnip) +{ + int ret; + struct addrinfo *result; + struct addrinfo hints = { + .ai_family = ovpn->sa_family, + .ai_socktype = SOCK_DGRAM, + .ai_protocol = IPPROTO_UDP + }; + + if (host) { + ret = getaddrinfo(host, service, &hints, &result); + if (ret == EAI_NONAME || ret == EAI_FAIL) + return -1; + + if (!(result->ai_family == AF_INET && + result->ai_addrlen == sizeof(struct sockaddr_in)) && + !(result->ai_family == AF_INET6 && + result->ai_addrlen == sizeof(struct sockaddr_in6))) { + ret = -EINVAL; + goto out; + } + + memcpy(&ovpn->remote, result->ai_addr, result->ai_addrlen); + } + + if (vpnip) { + ret = getaddrinfo(vpnip, NULL, &hints, &result); + if (ret == EAI_NONAME || ret == EAI_FAIL) + return -1; + + if (!(result->ai_family == AF_INET && + result->ai_addrlen == sizeof(struct sockaddr_in)) && + !(result->ai_family == AF_INET6 && + result->ai_addrlen == sizeof(struct sockaddr_in6))) { + ret = -EINVAL; + goto out; + } + + memcpy(&ovpn->peer_ip, result->ai_addr, result->ai_addrlen); + ovpn->sa_family = result->ai_family; + + ovpn->peer_ip_set = true; + } + + ret = 0; +out: + freeaddrinfo(result); + return ret; +} + +static int ovpn_parse_new_peer(struct ovpn_ctx *ovpn, const char *peer_id, + const char *raddr, const char *rport, + const char *vpnip) +{ + ovpn->peer_id = strtoul(peer_id, NULL, 10); + if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + + return ovpn_parse_remote(ovpn, raddr, rport, vpnip); +} + +static int ovpn_parse_key_slot(const char *arg, struct ovpn_ctx *ovpn) +{ + int slot = strtoul(arg, NULL, 10); + + if (errno == ERANGE || slot < 1 || slot > 2) { + fprintf(stderr, "key slot out of range\n"); + return -1; + } + + switch (slot) { + case 1: + ovpn->key_slot = OVPN_KEY_SLOT_PRIMARY; + break; + case 2: + ovpn->key_slot = OVPN_KEY_SLOT_SECONDARY; + break; + } + + return 0; +} + +static int ovpn_send_tcp_data(int socket) +{ + uint16_t len = htons(1000); + uint8_t buf[1002]; + int ret; + + memcpy(buf, &len, sizeof(len)); + memset(buf + sizeof(len), 0x86, sizeof(buf) - sizeof(len)); + + ret = send(socket, buf, sizeof(buf), MSG_NOSIGNAL); + + fprintf(stdout, "Sent %u bytes over TCP socket\n", ret); + + return ret > 0 ? 0 : ret; +} + +static int ovpn_recv_tcp_data(int socket) +{ + uint8_t buf[1002]; + uint16_t len; + int ret; + + ret = recv(socket, buf, sizeof(buf), MSG_NOSIGNAL); + + if (ret < 2) { + fprintf(stderr, ">>>> Error while reading TCP data: %d\n", ret); + return ret; + } + + memcpy(&len, buf, sizeof(len)); + len = ntohs(len); + + fprintf(stdout, ">>>> Received %u bytes over TCP socket, header: %u\n", + ret, len); + + return 0; +} + +static enum ovpn_cmd ovpn_parse_cmd(const char *cmd) +{ + if (!strcmp(cmd, "new_iface")) + return CMD_NEW_IFACE; + + if (!strcmp(cmd, "del_iface")) + return CMD_DEL_IFACE; + + if (!strcmp(cmd, "listen")) + return CMD_LISTEN; + + if (!strcmp(cmd, "connect")) + return CMD_CONNECT; + + if (!strcmp(cmd, "new_peer")) + return CMD_NEW_PEER; + + if (!strcmp(cmd, "new_multi_peer")) + return CMD_NEW_MULTI_PEER; + + if (!strcmp(cmd, "set_peer")) + return CMD_SET_PEER; + + if (!strcmp(cmd, "del_peer")) + return CMD_DEL_PEER; + + if (!strcmp(cmd, "get_peer")) + return CMD_GET_PEER; + + if (!strcmp(cmd, "new_key")) + return CMD_NEW_KEY; + + if (!strcmp(cmd, "del_key")) + return CMD_DEL_KEY; + + if (!strcmp(cmd, "get_key")) + return CMD_GET_KEY; + + if (!strcmp(cmd, "swap_keys")) + return CMD_SWAP_KEYS; + + if (!strcmp(cmd, "listen_mcast")) + return CMD_LISTEN_MCAST; + + return CMD_INVALID; +} + +/* Send process to background and waits for signal. + * + * This helper is called at the end of commands + * creating sockets, so that the latter stay alive + * along with the process that created them. + * + * A signal is expected to be delivered in order to + * terminate the waiting processes + */ +static void ovpn_waitbg(void) +{ + daemon(1, 1); + pause(); +} + +static int ovpn_run_cmd(struct ovpn_ctx *ovpn) +{ + char peer_id[10], vpnip[INET6_ADDRSTRLEN], raddr[128], rport[10]; + int n, ret; + FILE *fp; + + switch (ovpn->cmd) { + case CMD_NEW_IFACE: + ret = ovpn_new_iface(ovpn); + break; + case CMD_DEL_IFACE: + ret = ovpn_del_iface(ovpn); + break; + case CMD_LISTEN: + ret = ovpn_listen(ovpn, ovpn->sa_family); + if (ret < 0) { + fprintf(stderr, "cannot listen on TCP socket\n"); + return ret; + } + + fp = fopen(ovpn->peers_file, "r"); + if (!fp) { + fprintf(stderr, "cannot open file: %s\n", + ovpn->peers_file); + return -1; + } + + int num_peers = 0; + + while ((n = fscanf(fp, "%s %s\n", peer_id, vpnip)) == 2) { + struct ovpn_ctx peer_ctx = { 0 }; + + if (num_peers == MAX_PEERS) { + fprintf(stderr, "max peers reached!\n"); + return -E2BIG; + } + + peer_ctx.ifindex = ovpn->ifindex; + peer_ctx.sa_family = ovpn->sa_family; + + peer_ctx.socket = ovpn_accept(ovpn); + if (peer_ctx.socket < 0) { + fprintf(stderr, "cannot accept connection!\n"); + return -1; + } + + /* store peer sockets to test TCP I/O */ + ovpn->cli_sockets[num_peers] = peer_ctx.socket; + + ret = ovpn_parse_new_peer(&peer_ctx, peer_id, NULL, + NULL, vpnip); + if (ret < 0) { + fprintf(stderr, "error while parsing line\n"); + return -1; + } + + ret = ovpn_new_peer(&peer_ctx, true); + if (ret < 0) { + fprintf(stderr, + "cannot add peer to VPN: %s %s\n", + peer_id, vpnip); + return ret; + } + num_peers++; + } + + for (int i = 0; i < num_peers; i++) { + ret = ovpn_recv_tcp_data(ovpn->cli_sockets[i]); + if (ret < 0) + break; + } + ovpn_waitbg(); + break; + case CMD_CONNECT: + ret = ovpn_connect(ovpn); + if (ret < 0) { + fprintf(stderr, "cannot connect TCP socket\n"); + return ret; + } + + ret = ovpn_new_peer(ovpn, true); + if (ret < 0) { + fprintf(stderr, "cannot add peer to VPN\n"); + close(ovpn->socket); + return ret; + } + + if (ovpn->cipher != OVPN_CIPHER_ALG_NONE) { + ret = ovpn_new_key(ovpn); + if (ret < 0) { + fprintf(stderr, "cannot set key\n"); + return ret; + } + } + + ret = ovpn_send_tcp_data(ovpn->socket); + ovpn_waitbg(); + break; + case CMD_NEW_PEER: + ret = ovpn_udp_socket(ovpn, AF_INET6); + if (ret < 0) + return ret; + + ret = ovpn_new_peer(ovpn, false); + ovpn_waitbg(); + break; + case CMD_NEW_MULTI_PEER: + ret = ovpn_udp_socket(ovpn, AF_INET6); + if (ret < 0) + return ret; + + fp = fopen(ovpn->peers_file, "r"); + if (!fp) { + fprintf(stderr, "cannot open file: %s\n", + ovpn->peers_file); + return -1; + } + + while ((n = fscanf(fp, "%s %s %s %s\n", peer_id, raddr, rport, + vpnip)) == 4) { + struct ovpn_ctx peer_ctx = { 0 }; + + peer_ctx.ifindex = ovpn->ifindex; + peer_ctx.socket = ovpn->socket; + peer_ctx.sa_family = AF_UNSPEC; + + ret = ovpn_parse_new_peer(&peer_ctx, peer_id, raddr, + rport, vpnip); + if (ret < 0) { + fprintf(stderr, "error while parsing line\n"); + return -1; + } + + ret = ovpn_new_peer(&peer_ctx, false); + if (ret < 0) { + fprintf(stderr, + "cannot add peer to VPN: %s %s %s %s\n", + peer_id, raddr, rport, vpnip); + return ret; + } + } + ovpn_waitbg(); + break; + case CMD_SET_PEER: + ret = ovpn_set_peer(ovpn); + break; + case CMD_DEL_PEER: + ret = ovpn_del_peer(ovpn); + break; + case CMD_GET_PEER: + if (ovpn->peer_id == PEER_ID_UNDEF) + fprintf(stderr, "List of peers connected to: %s\n", + ovpn->ifname); + + ret = ovpn_get_peer(ovpn); + break; + case CMD_NEW_KEY: + ret = ovpn_new_key(ovpn); + break; + case CMD_DEL_KEY: + ret = ovpn_del_key(ovpn); + break; + case CMD_GET_KEY: + ret = ovpn_get_key(ovpn); + break; + case CMD_SWAP_KEYS: + ret = ovpn_swap_keys(ovpn); + break; + case CMD_LISTEN_MCAST: + ret = ovpn_listen_mcast(); + break; + case CMD_INVALID: + break; + } + + return ret; +} + +static int ovpn_parse_cmd_args(struct ovpn_ctx *ovpn, int argc, char *argv[]) +{ + int ret; + + /* no args required for LISTEN_MCAST */ + if (ovpn->cmd == CMD_LISTEN_MCAST) + return 0; + + /* all commands need an ifname */ + if (argc < 3) + return -EINVAL; + + strscpy(ovpn->ifname, argv[2], IFNAMSIZ - 1); + ovpn->ifname[IFNAMSIZ - 1] = '\0'; + + /* all commands, except NEW_IFNAME, needs an ifindex */ + if (ovpn->cmd != CMD_NEW_IFACE) { + ovpn->ifindex = if_nametoindex(ovpn->ifname); + if (!ovpn->ifindex) { + fprintf(stderr, "cannot find interface: %s\n", + strerror(errno)); + return -1; + } + } + + switch (ovpn->cmd) { + case CMD_NEW_IFACE: + if (argc < 4) + break; + + if (!strcmp(argv[3], "P2P")) { + ovpn->mode = OVPN_MODE_P2P; + } else if (!strcmp(argv[3], "MP")) { + ovpn->mode = OVPN_MODE_MP; + } else { + fprintf(stderr, "Cannot parse iface mode: %s\n", + argv[3]); + return -1; + } + ovpn->mode_set = true; + break; + case CMD_DEL_IFACE: + break; + case CMD_LISTEN: + if (argc < 5) + return -EINVAL; + + ovpn->lport = strtoul(argv[3], NULL, 10); + if (errno == ERANGE || ovpn->lport > 65535) { + fprintf(stderr, "lport value out of range\n"); + return -1; + } + + ovpn->peers_file = argv[4]; + + if (argc > 5 && !strcmp(argv[5], "ipv6")) + ovpn->sa_family = AF_INET6; + break; + case CMD_CONNECT: + if (argc < 6) + return -EINVAL; + + ovpn->sa_family = AF_INET; + + ret = ovpn_parse_new_peer(ovpn, argv[3], argv[4], argv[5], + NULL); + if (ret < 0) { + fprintf(stderr, "Cannot parse remote peer data\n"); + return -1; + } + + if (argc > 6) { + ovpn->key_slot = OVPN_KEY_SLOT_PRIMARY; + ovpn->key_id = 0; + ovpn->cipher = OVPN_CIPHER_ALG_AES_GCM; + ovpn->key_dir = KEY_DIR_OUT; + + ret = ovpn_parse_key(argv[6], ovpn); + if (ret) + return -1; + } + break; + case CMD_NEW_PEER: + if (argc < 7) + return -EINVAL; + + ovpn->lport = strtoul(argv[4], NULL, 10); + if (errno == ERANGE || ovpn->lport > 65535) { + fprintf(stderr, "lport value out of range\n"); + return -1; + } + + const char *vpnip = (argc > 7) ? argv[7] : NULL; + + ret = ovpn_parse_new_peer(ovpn, argv[3], argv[5], argv[6], + vpnip); + if (ret < 0) + return -1; + break; + case CMD_NEW_MULTI_PEER: + if (argc < 5) + return -EINVAL; + + ovpn->lport = strtoul(argv[3], NULL, 10); + if (errno == ERANGE || ovpn->lport > 65535) { + fprintf(stderr, "lport value out of range\n"); + return -1; + } + + ovpn->peers_file = argv[4]; + break; + case CMD_SET_PEER: + if (argc < 6) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + + ovpn->keepalive_interval = strtoul(argv[4], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, + "keepalive interval value out of range\n"); + return -1; + } + + ovpn->keepalive_timeout = strtoul(argv[5], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, + "keepalive interval value out of range\n"); + return -1; + } + break; + case CMD_DEL_PEER: + if (argc < 4) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + break; + case CMD_GET_PEER: + ovpn->peer_id = PEER_ID_UNDEF; + if (argc > 3) { + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + } + break; + case CMD_NEW_KEY: + if (argc < 9) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + + ret = ovpn_parse_key_slot(argv[4], ovpn); + if (ret) + return -1; + + ovpn->key_id = strtoul(argv[5], NULL, 10); + if (errno == ERANGE || ovpn->key_id > 2) { + fprintf(stderr, "key ID out of range\n"); + return -1; + } + + ret = ovpn_parse_cipher(argv[6], ovpn); + if (ret < 0) + return -1; + + ret = ovpn_parse_key_direction(argv[7], ovpn); + if (ret < 0) + return -1; + + ret = ovpn_parse_key(argv[8], ovpn); + if (ret) + return -1; + break; + case CMD_DEL_KEY: + if (argc < 4) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + + ret = ovpn_parse_key_slot(argv[4], ovpn); + if (ret) + return ret; + break; + case CMD_GET_KEY: + if (argc < 5) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + + ret = ovpn_parse_key_slot(argv[4], ovpn); + if (ret) + return ret; + break; + case CMD_SWAP_KEYS: + if (argc < 4) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + break; + case CMD_LISTEN_MCAST: + break; + case CMD_INVALID: + break; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + struct ovpn_ctx ovpn; + int ret; + + if (argc < 2) { + usage(argv[0]); + return -1; + } + + memset(&ovpn, 0, sizeof(ovpn)); + ovpn.sa_family = AF_INET; + ovpn.cipher = OVPN_CIPHER_ALG_NONE; + + ovpn.cmd = ovpn_parse_cmd(argv[1]); + if (ovpn.cmd == CMD_INVALID) { + fprintf(stderr, "Error: unknown command.\n\n"); + usage(argv[0]); + return -1; + } + + ret = ovpn_parse_cmd_args(&ovpn, argc, argv); + if (ret < 0) { + fprintf(stderr, "Error: invalid arguments.\n\n"); + if (ret == -EINVAL) + usage(argv[0]); + return ret; + } + + ret = ovpn_run_cmd(&ovpn); + if (ret) + fprintf(stderr, "Cannot execute command: %s (%d)\n", + strerror(-ret), ret); + + return ret; +} diff --git a/tools/testing/selftests/net/ovpn/tcp_peers.txt b/tools/testing/selftests/net/ovpn/tcp_peers.txt new file mode 100644 index 0000000000000..d753eebe8716e --- /dev/null +++ b/tools/testing/selftests/net/ovpn/tcp_peers.txt @@ -0,0 +1,5 @@ +1 5.5.5.2 +2 5.5.5.3 +3 5.5.5.4 +4 5.5.5.5 +5 5.5.5.6 diff --git a/tools/testing/selftests/net/ovpn/test-chachapoly.sh b/tools/testing/selftests/net/ovpn/test-chachapoly.sh new file mode 100755 index 0000000000000..32504079a2b89 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test-chachapoly.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +ALG="chachapoly" + +source test.sh diff --git a/tools/testing/selftests/net/ovpn/test-close-socket-tcp.sh b/tools/testing/selftests/net/ovpn/test-close-socket-tcp.sh new file mode 100755 index 0000000000000..093d44772ffdf --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test-close-socket-tcp.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +PROTO="TCP" + +source test-close-socket.sh diff --git a/tools/testing/selftests/net/ovpn/test-close-socket.sh b/tools/testing/selftests/net/ovpn/test-close-socket.sh new file mode 100755 index 0000000000000..5e48a8b679287 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test-close-socket.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2020-2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +#set -x +set -e + +source ./common.sh + +cleanup + +modprobe -q ovpn || true + +for p in $(seq 0 ${NUM_PEERS}); do + create_ns ${p} +done + +for p in $(seq 0 ${NUM_PEERS}); do + setup_ns ${p} 5.5.5.$((${p} + 1))/24 +done + +for p in $(seq 0 ${NUM_PEERS}); do + add_peer ${p} +done + +for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 60 120 + ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 60 120 +done + +sleep 1 + +for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ping -qfc 500 -w 3 5.5.5.$((${p} + 1)) +done + +ip netns exec peer0 iperf3 -1 -s & +sleep 1 +ip netns exec peer1 iperf3 -Z -t 3 -c 5.5.5.1 + +cleanup + +modprobe -r ovpn || true diff --git a/tools/testing/selftests/net/ovpn/test-float.sh b/tools/testing/selftests/net/ovpn/test-float.sh new file mode 100755 index 0000000000000..ba5d725e18b07 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test-float.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +FLOAT="1" + +source test.sh diff --git a/tools/testing/selftests/net/ovpn/test-tcp.sh b/tools/testing/selftests/net/ovpn/test-tcp.sh new file mode 100755 index 0000000000000..ba3f1f315a349 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test-tcp.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +PROTO="TCP" + +source test.sh diff --git a/tools/testing/selftests/net/ovpn/test.sh b/tools/testing/selftests/net/ovpn/test.sh new file mode 100755 index 0000000000000..7b62897b02408 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2020-2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +#set -x +set -e + +source ./common.sh + +cleanup + +modprobe -q ovpn || true + +for p in $(seq 0 ${NUM_PEERS}); do + create_ns ${p} +done + +for p in $(seq 0 ${NUM_PEERS}); do + setup_ns ${p} 5.5.5.$((${p} + 1))/24 +done + +for p in $(seq 0 ${NUM_PEERS}); do + add_peer ${p} +done + +for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 60 120 + ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 60 120 +done + +sleep 1 + +for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ping -qfc 500 -w 3 5.5.5.$((${p} + 1)) +done + +if [ "$FLOAT" == "1" ]; then + # make clients float.. + for p in $(seq 1 ${NUM_PEERS}); do + ip -n peer${p} addr del 10.10.${p}.2/24 dev veth${p} + ip -n peer${p} addr add 10.10.${p}.3/24 dev veth${p} + done + for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer${p} ping -qfc 500 -w 3 5.5.5.1 + done +fi + +ip netns exec peer0 iperf3 -1 -s & +sleep 1 +ip netns exec peer1 iperf3 -Z -t 3 -c 5.5.5.1 + +echo "Adding secondary key and then swap:" +for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 2 1 ${ALG} 0 data64.key + ip netns exec peer${p} ${OVPN_CLI} new_key tun${p} ${p} 2 1 ${ALG} 1 data64.key + ip netns exec peer${p} ${OVPN_CLI} swap_keys tun${p} ${p} +done + +sleep 1 + +echo "Querying all peers:" +ip netns exec peer0 ${OVPN_CLI} get_peer tun0 +ip netns exec peer1 ${OVPN_CLI} get_peer tun1 + +echo "Querying peer 1:" +ip netns exec peer0 ${OVPN_CLI} get_peer tun0 1 + +echo "Querying non-existent peer 10:" +ip netns exec peer0 ${OVPN_CLI} get_peer tun0 10 || true + +echo "Deleting peer 1:" +ip netns exec peer0 ${OVPN_CLI} del_peer tun0 1 +ip netns exec peer1 ${OVPN_CLI} del_peer tun1 1 + +echo "Querying keys:" +for p in $(seq 2 ${NUM_PEERS}); do + ip netns exec peer${p} ${OVPN_CLI} get_key tun${p} ${p} 1 + ip netns exec peer${p} ${OVPN_CLI} get_key tun${p} ${p} 2 +done + +echo "Deleting peer while sending traffic:" +(ip netns exec peer2 ping -qf -w 4 5.5.5.1)& +sleep 2 +ip netns exec peer0 ${OVPN_CLI} del_peer tun0 2 +# following command fails in TCP mode +# (both ends get conn reset when one peer disconnects) +ip netns exec peer2 ${OVPN_CLI} del_peer tun2 2 || true + +echo "Deleting keys:" +for p in $(seq 3 ${NUM_PEERS}); do + ip netns exec peer${p} ${OVPN_CLI} del_key tun${p} ${p} 1 + ip netns exec peer${p} ${OVPN_CLI} del_key tun${p} ${p} 2 +done + +echo "Setting timeout to 3s MP:" +for p in $(seq 3 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 3 3 || true + ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 0 0 +done +# wait for peers to timeout +sleep 5 + +echo "Setting timeout to 3s P2P:" +for p in $(seq 3 ${NUM_PEERS}); do + ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 3 3 +done +sleep 5 + +cleanup + +modprobe -r ovpn || true diff --git a/tools/testing/selftests/net/ovpn/udp_peers.txt b/tools/testing/selftests/net/ovpn/udp_peers.txt new file mode 100644 index 0000000000000..32f14bd9347a6 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/udp_peers.txt @@ -0,0 +1,5 @@ +1 10.10.1.2 1 5.5.5.2 +2 10.10.2.2 1 5.5.5.3 +3 10.10.3.2 1 5.5.5.4 +4 10.10.4.2 1 5.5.5.5 +5 10.10.5.2 1 5.5.5.6 diff --git a/tools/testing/selftests/net/reuseport_addr_any.c b/tools/testing/selftests/net/reuseport_addr_any.c index b8475cb29be7a..1c43401a1c807 100644 --- a/tools/testing/selftests/net/reuseport_addr_any.c +++ b/tools/testing/selftests/net/reuseport_addr_any.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -21,10 +20,6 @@ #include #include -#ifndef SOL_DCCP -#define SOL_DCCP 269 -#endif - static const char *IP4_ADDR = "127.0.0.1"; static const char *IP6_ADDR = "::1"; static const char *IP4_MAPPED6 = "::ffff:127.0.0.1"; @@ -86,15 +81,6 @@ static void build_rcv_fd(int family, int proto, int *rcv_fds, int count, if (proto == SOCK_STREAM && listen(rcv_fds[i], 10)) error(1, errno, "tcp: failed to listen on receive port"); - else if (proto == SOCK_DCCP) { - if (setsockopt(rcv_fds[i], SOL_DCCP, - DCCP_SOCKOPT_SERVICE, - &(int) {htonl(42)}, sizeof(int))) - error(1, errno, "failed to setsockopt"); - - if (listen(rcv_fds[i], 10)) - error(1, errno, "dccp: failed to listen on receive port"); - } } } @@ -148,11 +134,6 @@ static int connect_and_send(int family, int proto) if (fd < 0) error(1, errno, "failed to create send socket"); - if (proto == SOCK_DCCP && - setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_SERVICE, - &(int){htonl(42)}, sizeof(int))) - error(1, errno, "failed to setsockopt"); - if (bind(fd, saddr, sz)) error(1, errno, "failed to bind send socket"); @@ -175,7 +156,7 @@ static int receive_once(int epfd, int proto) if (i < 0) error(1, errno, "epoll_wait failed"); - if (proto == SOCK_STREAM || proto == SOCK_DCCP) { + if (proto == SOCK_STREAM) { fd = accept(ev.data.fd, NULL, NULL); if (fd < 0) error(1, errno, "failed to accept"); @@ -243,20 +224,6 @@ static void run_one_test(int fam_send, int fam_rcv, int proto, static void test_proto(int proto, const char *proto_str) { - if (proto == SOCK_DCCP) { - int test_fd; - - test_fd = socket(AF_INET, proto, 0); - if (test_fd < 0) { - if (errno == ESOCKTNOSUPPORT) { - fprintf(stderr, "DCCP not supported: skipping DCCP tests\n"); - return; - } else - error(1, errno, "failed to create a DCCP socket"); - } - close(test_fd); - } - fprintf(stderr, "%s IPv4 ... ", proto_str); run_one_test(AF_INET, AF_INET, proto, IP4_ADDR); @@ -271,7 +238,6 @@ int main(void) { test_proto(SOCK_DGRAM, "UDP"); test_proto(SOCK_STREAM, "TCP"); - test_proto(SOCK_DCCP, "DCCP"); fprintf(stderr, "SUCCESS\n"); return 0; diff --git a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh index 02b986c9c247d..9067197c90550 100755 --- a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh +++ b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh @@ -51,7 +51,9 @@ ret=0 # All tests in this script. Can be overridden with -t option. TESTS=" neigh_suppress_arp + neigh_suppress_uc_arp neigh_suppress_ns + neigh_suppress_uc_ns neigh_vlan_suppress_arp neigh_vlan_suppress_ns " @@ -388,6 +390,52 @@ neigh_suppress_arp() neigh_suppress_arp_common $vid $sip $tip } +neigh_suppress_uc_arp_common() +{ + local vid=$1; shift + local sip=$1; shift + local tip=$1; shift + local tmac + + echo + echo "Unicast ARP, per-port ARP suppression - VLAN $vid" + echo "-----------------------------------------------" + + run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on" + + tmac=$(ip -n $h2 -j -p link show eth0.$vid | jq -r '.[]["address"]') + run_cmd "bridge -n $sw1 fdb replace $tmac dev vx0 master static vlan $vid" + run_cmd "ip -n $sw1 neigh replace $tip lladdr $tmac nud permanent dev br0.$vid" + + run_cmd "tc -n $h1 qdisc replace dev eth0.$vid clsact" + run_cmd "tc -n $h1 filter replace dev eth0.$vid ingress pref 1 handle 101 proto arp flower arp_sip $tip arp_op reply action pass" + + run_cmd "tc -n $h2 qdisc replace dev eth0.$vid clsact" + run_cmd "tc -n $h2 filter replace dev eth0.$vid egress pref 1 handle 101 proto arp flower arp_tip $sip arp_op reply action pass" + + run_cmd "ip netns exec $h1 mausezahn eth0.$vid -c 1 -a own -b $tmac -t arp 'request sip=$sip, tip=$tip, tmac=$tmac' -q" + tc_check_packets $h1 "dev eth0.$vid ingress" 101 1 + log_test $? 0 "Unicast ARP, suppression on, h1 filter" + tc_check_packets $h2 "dev eth0.$vid egress" 101 1 + log_test $? 0 "Unicast ARP, suppression on, h2 filter" +} + +neigh_suppress_uc_arp() +{ + local vid=10 + local sip=192.0.2.1 + local tip=192.0.2.2 + + neigh_suppress_uc_arp_common $vid $sip $tip + + vid=20 + sip=192.0.2.17 + tip=192.0.2.18 + neigh_suppress_uc_arp_common $vid $sip $tip +} + neigh_suppress_ns_common() { local vid=$1; shift @@ -494,6 +542,78 @@ neigh_suppress_ns() neigh_suppress_ns_common $vid $saddr $daddr $maddr } +icmpv6_header_get() +{ + local csum=$1; shift + local tip=$1; shift + local type + local p + + # Type 135 (Neighbor Solicitation), hex format + type="87" + p=$(: + )"$type:"$( : ICMPv6.type + )"00:"$( : ICMPv6.code + )"$csum:"$( : ICMPv6.checksum + )"00:00:00:00:"$( : Reserved + )"$tip:"$( : Target Address + ) + echo $p +} + +neigh_suppress_uc_ns_common() +{ + local vid=$1; shift + local sip=$1; shift + local dip=$1; shift + local full_dip=$1; shift + local csum=$1; shift + local tmac + + echo + echo "Unicast NS, per-port NS suppression - VLAN $vid" + echo "---------------------------------------------" + + run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on" + + tmac=$(ip -n $h2 -j -p link show eth0.$vid | jq -r '.[]["address"]') + run_cmd "bridge -n $sw1 fdb replace $tmac dev vx0 master static vlan $vid" + run_cmd "ip -n $sw1 -6 neigh replace $dip lladdr $tmac nud permanent dev br0.$vid" + + run_cmd "tc -n $h1 qdisc replace dev eth0.$vid clsact" + run_cmd "tc -n $h1 filter replace dev eth0.$vid ingress pref 1 handle 101 proto ipv6 flower ip_proto icmpv6 src_ip $dip type 136 code 0 action pass" + + run_cmd "tc -n $h2 qdisc replace dev eth0.$vid clsact" + run_cmd "tc -n $h2 filter replace dev eth0.$vid egress pref 1 handle 101 proto ipv6 flower ip_proto icmpv6 dst_ip $sip type 136 code 0 action pass" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a own -b $tmac -A $sip -B $dip -t ip hop=255,next=58,payload=$(icmpv6_header_get $csum $full_dip) -q" + tc_check_packets $h1 "dev eth0.$vid ingress" 101 1 + log_test $? 0 "Unicast NS, suppression on, h1 filter" + tc_check_packets $h2 "dev eth0.$vid egress" 101 1 + log_test $? 0 "Unicast NS, suppression on, h2 filter" +} + +neigh_suppress_uc_ns() +{ + local vid=10 + local saddr=2001:db8:1::1 + local daddr=2001:db8:1::2 + local full_daddr=20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:02 + local csum="ef:79" + + neigh_suppress_uc_ns_common $vid $saddr $daddr $full_daddr $csum + + vid=20 + saddr=2001:db8:2::1 + daddr=2001:db8:2::2 + full_daddr=20:01:0d:b8:00:02:00:00:00:00:00:00:00:00:00:02 + csum="ef:76" + + neigh_suppress_uc_ns_common $vid $saddr $daddr $full_daddr $csum +} + neigh_vlan_suppress_arp() { local vid1=10 @@ -825,6 +945,11 @@ if [ ! -x "$(command -v jq)" ]; then exit $ksft_skip fi +if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test without mausezahn tool" + exit $ksft_skip +fi + bridge link help 2>&1 | grep -q "neigh_vlan_suppress" if [ $? -ne 0 ]; then echo "SKIP: iproute2 bridge too old, missing per-VLAN neighbor suppression support" diff --git a/tools/testing/selftests/tc-testing/tdc.sh b/tools/testing/selftests/tc-testing/tdc.sh index cddff1772e104..589b18ed758a6 100755 --- a/tools/testing/selftests/tc-testing/tdc.sh +++ b/tools/testing/selftests/tc-testing/tdc.sh @@ -31,6 +31,10 @@ try_modprobe act_skbedit try_modprobe act_skbmod try_modprobe act_tunnel_key try_modprobe act_vlan +try_modprobe act_ife +try_modprobe act_meta_mark +try_modprobe act_meta_skbtcindex +try_modprobe act_meta_skbprio try_modprobe cls_basic try_modprobe cls_bpf try_modprobe cls_cgroup