Source code for polyvers.engrave

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2015-2018 European Commission (JRC);
# Licensed under the EUPL 1.2+ (the 'Licence');
# You may not use this work except in compliance with the Licence.
# You may obtain a copy of the Licence at: http://ec.europa.eu/idabc/eupl
#
"""Search and replace version-ids in files."""

from collections import defaultdict
from pathlib import Path
from typing import List, Tuple, Sequence, Set, Match, Dict
import logging

from . import pvproject
from ._vendor.traitlets.traitlets import (
    Dict as DictTrait, Bool as BoolTrait, Tuple as TupleTrait)
from ._vendor.traitlets.traitlets import Bytes, Instance
from .cmdlet import cmdlets
from .utils import fileutil as fu


# Module-level logger, named after this module.
log = logging.getLogger(__name__)


def _as_glob_pattern_pair(fpath):
    """24
    Add '**' in relative names, eliminate comments and split in positive/negatives

    :return:
        a 2-tuple(positive, negative), one always None
    """
    fpath = fpath.strip()

    ## Remove comments/empty-lines.
    if not fpath or fpath.startswith('#') or fpath.startswith('..'):
        return (None, None)

    if fpath.startswith('!'):
        # raise NotImplementedError('Negative match pattern %r not supported!' %
        #                           fpath)
        (positive, negative) = _as_glob_pattern_pair(fpath[1:])
        return (negative, positive)

    ## TODO: Handle '!' and escaping with '\' like .gitignore
    if fpath.startswith(('./', '/')):
        fpath = fpath.lstrip('./')
        fpath = fpath.lstrip('/')
    else:
        fpath = '**/' + fpath

    return (fpath.replace('\\', ''), None)


def _prepare_glob_pairs(patterns):
    pat_pairs = [_as_glob_pattern_pair(fpat) for fpat in patterns]
    pat_pairs = [pair for pair in pat_pairs if any(pair)]

    return pat_pairs


def _glob_find_files(pattern_pairs: Tuple[str, str], mybase: Path):
    """
    Glob `mybase` with each pattern-pair, in the order given.

    - Hits of positive patterns are accumulated in an insertion-ordered set.
    - Hits of negative patterns are remembered, and any *later* positive hit
      having one of them among its parents is discarded.
    - NOTE(review): a positive hit equal to a negative hit is not discarded,
      since only ``parents`` are checked -- confirm this is intended.

    :param pattern_pairs:
        the (positive, negative) pairs to glob with
    :param mybase:
        the directory to glob in
    :return:
        the accumulated positive hits
    :raise AssertionError:
        if both members of a pair are empty
    """
    from boltons.setutils import IndexedSet as iset

    found = iset()
    excluded = set()  # type: ignore

    def is_excluded(fpath):
        return any(nf in fpath.parents for nf in excluded)

    for positive, negative in pattern_pairs:
        if not (positive or negative):
            raise AssertionError("Both in (positive, negative) pair are None!")

        if positive:
            hits = iset(mybase.glob(positive))
            found.update([f for f in hits if not is_excluded(f)])
        else:
            excluded.update(mybase.glob(negative))

    return found


def _glob_filter_in_mybase(files: pvproject.FPaths,
                           mybase: Path):
    assert all(isinstance(f, Path) for f in files)
    nfiles = []
    for f in files:
        try:
            rpath = f.relative_to(mybase)
            if '..' not in str(rpath):
                nfiles.append(f)
        except ValueError as _:
            "Skip it, outside mybase"

    return nfiles


def _glob_filter_out_other_bases(files: pvproject.FPaths,
                                 other_bases: pvproject.FPaths):
    if not other_bases:
        return files

    assert all(isinstance(f, Path) for f in files)
    assert all(isinstance(f, Path) for f in other_bases)

    nfiles = [f for f in files
              if not any(fu._is_base_or_same(obase, f) in (None, True)
                         for obase in other_bases)]

    return nfiles


def _gitignore_files(files: pvproject.FPaths) -> pvproject.FPaths:
    import subprocess as sbp
    from boltons.setutils import IndexedSet as iset

    if not files:
        return []

    files_lines = '\n'.join('%s' % f for f in files).encode('utf-8')
    p = sbp.run('git check-ignore --stdin'.split(),
                input=files_lines, stdout=sbp.PIPE,
                check=True)

    ignored = p.stdout.decode('utf-8').splitlines()

    return list(iset(files) - set(ignored))


def glob_files(patterns: List[str],
               mybase: pvproject.FLike = '.',
               other_bases: pvproject.FLikeList = None) -> pvproject.FPaths:
    """
    Glob files in `mybase` but not in `other_bases` (unless bases coincide).

    - Supports exclude patterns: ``!foo``.
    - If `mybase` is in `other_bases`, it doesn't change the results.
    - The hits are passed through :func:`_gitignore_files()` before
      being filtered against the bases.

    :param patterns:
        glob-pattern lines (comments & blank lines are skipped)
    :param mybase:
        the directory to glob in
    :param other_bases:
        directories whose contents must be left out of the results
    :return:
        a list of :class:`Path` instances
    """
    pattern_pairs = _prepare_glob_pairs(patterns)

    mybase = Path(mybase)
    files = _glob_find_files(pattern_pairs, mybase)
    files = _gitignore_files(files)

    files = _glob_filter_in_mybase(files, mybase)
    if other_bases:
        ## Keep bases only inside mybase, but
        # Exclude bases coinciding with mybase.
        #
        other_ppaths = [Path(ob) for ob in other_bases]
        other_ppaths = [ob for ob in other_ppaths
                        if not fu._is_same_file(mybase, ob) and
                        fu._is_base_or_same(mybase, ob)]
        files = _glob_filter_out_other_bases(files, other_ppaths)

    assert all(isinstance(f, Path) for f in files)
    return files


# A pair of ints -- presumably a (start, end) span; not referenced in this view.
Range = Tuple[int, int]


def overlapped_matches(matches: Sequence[Match],
                       no_touch=False,
                       ) -> Set[Match]:
    """
    Collect the matches whose span collides with an earlier, surviving match.

    All pairs of `matches` are compared in the order given;  when the spans
    of a pair collide, the later member is marked as overlapped.
    A match already marked is not used as the earlier member of a pair.

    :param matches:
        the matches to compare, by their :meth:`span()`
    :param no_touch:
        if true, all three (0,1), (1,2) (2,3) overlap on 1 and 2.
    :return:
        the set of matches marked as overlapped (empty if none)
    """
    import itertools as itt
    import operator

    ## Inclusive comparison makes spans that merely touch count as colliding.
    op = operator.le if no_touch else operator.lt

    def overlap(a, b) -> bool:
        # from https://stackoverflow.com/a/3269471/548792
        return op(a[0], b[1]) and op(b[0], a[1])

    all_pairs = itt.combinations(matches, 2)
    overlapped: Set[Match] = set()
    for m1, m2 in all_pairs:
        if m1 not in overlapped and overlap(m1.span(), m2.span()):
            overlapped.add(m2)

    return overlapped


# Glob hits: one (project, engrave, hit file-path) triple per file found.
GlobTruples = List[Tuple[pvproject.Project, pvproject.Engrave, Path]]
# Per file-path, the (project, engrave, graft) triples to scan that file with.
GraftsMap = Dict[Path, List[Tuple[pvproject.Project,
                                  pvproject.Engrave,
                                  pvproject.Graft]]]
# A single match, along with the (project, engrave, graft) that produced it.
MatchQruple = Tuple[pvproject.Project,
                    pvproject.Engrave,
                    pvproject.Graft,
                    Match]
# Per file-path, the match-qruples collected for that file.
MatchMap = Dict[Path, List[MatchQruple]]


class FileProcessor(cmdlets.Spec):

    # Cache of file contents, keyed by resolved path:
    # each value is a (current bytes, changed-flag) pair.
    _fpath_bytes: Dict[Path, Tuple[bytes, bool]] = DictTrait(  # type: ignore
        key_trait=Instance(Path),
        value_trait=TupleTrait(Bytes(),
                               BoolTrait()))

    def _set_file_bytes(self, fpath: Path, fbytes: bytes) -> bytes:
        key = fpath.resolve(strict=True)
        if key in self._fpath_bytes:
            orig_fbytes, _changed = self._fpath_bytes[key]
            changed = fbytes != orig_fbytes
        else:
            ## Just read file.
            changed = False
        self._fpath_bytes[key] = (fbytes, changed)

        return fbytes

    def _read_file(self, fpath: Path) -> bytes:
        key = fpath.resolve(strict=True)
        fbytes, _changed = self._fpath_bytes.get(key, (None, None))
        if fbytes is None:
            with self.errlogged(OSError,
                                token='fread',
                                doing="reading file '%s'" % fpath):
                fbytes = self._set_file_bytes(fpath, fpath.read_bytes())
                self.log.debug("Read %i-bytes from file-to-engrave '%s'.",
                               len(fbytes), fpath)

        return fbytes

    def _write_all_files(self):
        for fpath, (fbytes, changed) in self._fpath_bytes.items():
            if not changed:
                self.log.debug("Skipped untouched file '%s'.", fpath)
                continue

            if not self.dry_run:
                with self.errlogged(OSError,
                                    token='fwrite',
                                    doing="writing file '%s'" % fpath):
                    fpath.write_bytes(fbytes)

            self.log.info("Written %i-bytes in engraved file '%s'.",
                          len(fbytes), fpath)

    # Per file-path, the match-qruples stored by `scan_projects()` and
    # consumed by `engrave_matches()`.
    match_map: MatchMap = DictTrait(key_trait=Instance(Path))  # type: ignore
#                                     TupleTrait((Instance(pvproject.Project),
#                                                 Instance(Engrave),
#                                                 Instance(Graft),
#                                                 ListTrait(Instance(Match))))))

    def nmatches(self):
        """Count the match-qruples held in `match_map`, across all files."""
        total = 0
        for mqruples in self.match_map.values():
            total += len(mqruples)

        return total

    def grafted_files(self, all_searched=False) -> List[Path]:
        """
        List the cached file-paths, sorted.

        :param all_searched:
            if true, include all cached files, not only those flagged as changed
        """
        fpaths = []
        for fpath, (_fbytes, changed) in self._fpath_bytes.items():
            if all_searched or changed:
                fpaths.append(fpath)
        fpaths.sort()

        return fpaths

    def _glob_project(self,
                      project: pvproject.Project,
                      other_bases: pvproject.FLikeList = ()
                      ) -> GlobTruples:
        mybase = project.basepath
        glob_truples: GlobTruples = []
        for eng in project.active_engraves():
            with self.errlogged(
                token='glob',
                doing="globbing %.28s%s" % (eng, eng.globs)
            ):
                globs = [project.interp(gs, _escaped_for='glob')
                         for gs in eng.globs
                         if gs is not None]
                hit_fpaths = glob_files(  # type: ignore # (interp may be null)
                    globs, mybase=mybase or '.', other_bases=other_bases)
                glob_truples.extend((project, eng, fp)
                                    for fp in hit_fpaths)

        return glob_truples

    def _reindex_glob_results_on_fpaths(self, gtruples: GlobTruples
                                        ) -> GraftsMap:
        igtruples: GraftsMap = defaultdict(list)
        for prj, eng, fpath in gtruples:
            igtruples[fpath].extend((prj, eng, graft)
                                    for graft in eng.grafts)
        return igtruples or {}

    def _glob_all_projects(self,
                           projects: Sequence[pvproject.Project],
                           all_projects: Sequence[pvproject.Project]
                           ) -> GraftsMap:
        """
        Glob the files of the `projects` and index the hits by file-path.

        :param projects:
            the projects to glob files for
        :param all_projects:
            the projects whose (non-empty) base-paths are given as
            the "other bases" when globbing each project
        :return:
            the hits, as regrouped by :meth:`_reindex_glob_results_on_fpaths()`
        """
        other_bases = []
        for prj in all_projects:
            if prj.basepath:
                other_bases.append(prj.basepath)

        glob_truples = []
        for prj in projects:
            with self.errlogged(token='glob',
                                doing="globbing %.28s" % prj):
                hits = self._glob_project(prj, other_bases)
                glob_truples.extend(hits)

        return self._reindex_glob_results_on_fpaths(glob_truples)

    def _scan_all_grafts(self, grafts_map: GraftsMap) -> MatchMap:
        match_map: MatchMap = defaultdict(list)
        for fpath, graft_truple in grafts_map.items():
            fbytes = self._read_file(fpath)
            for prj, eng, graft in graft_truple:
                with self.errlogged(token='scan',
                                    doing="scanning '%s' for %.28s.%.28s" %
                                    (fpath, prj, eng)):
                    matches = graft.collect_matches(fbytes, prj)
                    self.log.debug(
                        "Scanned %i matches in %i-bytes text of file '%s': "
                        "\n  matches: %s\n  %s\n  %s \n  %s",
                        len(matches), len(fbytes), fpath,
                        '\n    '.join(str(m) for m in [''] + matches),  # type: ignore
                        graft, eng, prj)

                    sliced_matches = graft.sliced_matches(matches)
                    if len(sliced_matches) != len(matches):
                        self.log.debug(
                            "Sliced %i out of %i matches in file '%s' for %s.",
                            len(sliced_matches), len(matches), fpath, graft)

                match_map[fpath].extend((prj, eng, graft, m)
                                        for m in matches)

        return match_map or {}

    def _drop_overlapping_matches(self, match_map: MatchMap) -> MatchMap:
        """Sorts also matches on the starting-points."""
        good_match_map = {}
        for fpath, mqruples in match_map.items():
            mqruples = sorted(mqruples, key=lambda mq: mq[-1].start())
            all_file_matches = [mq[-1] for mq in mqruples]

            bad_matches = overlapped_matches(all_file_matches, no_touch=True)
            if bad_matches:
                self.log.debug(
                    "Found %i out of %i overlapping matches for file '%s'."
                    "\n  Overlaps: %s",
                    len(bad_matches), len(all_file_matches), fpath,
                    ', '.join(str(s) for s in bad_matches))

            good_match_map[fpath] = [mq
                                     for mq in mqruples
                                     if mq[-1] not in bad_matches]

        return good_match_map

    def scan_projects(self,
                      projects: Sequence[pvproject.Project],
                      all_projects: Sequence[pvproject.Project] = None
                      ) -> MatchMap:
        """
        Glob & scan the files of the `projects`, storing the surviving matches.

        :param projects:
            the projects to scan; must not be empty
        :param all_projects:
            the projects providing the "other bases" for globbing;
            `projects` is used if not given (or empty)
        :return:
            the non-overlapping matches per file-path,
            stored also in `match_map`
        """
        assert projects
        bases_projects = all_projects if all_projects else projects

        grafts_map = self._glob_all_projects(projects, bases_projects)
        scanned = self._scan_all_grafts(grafts_map)
        match_map = self._drop_overlapping_matches(scanned)

        self.match_map = match_map

        return match_map

    def _graft_match(self,
                     graft: pvproject.Graft,
                     fbytes: bytes,
                     match: Match,
                     offset: int,
                     project: 'pvproject.Project',
                     ) -> Tuple[bytes, int]:
        """
        :param graft:
            a graft with a non-null :attr:`pvproject.Graft.subst`
        :return:
            the substituted fbytes
        """
        subst = graft.subst_resolved(project)
        if subst is not None:
            mstart, mend = match.span()
            new_text = match.expand(subst)
            head = fbytes[:mstart + offset]
            tail = fbytes[mend + offset:]
            fbytes = head + new_text + tail
            offset += len(new_text) - (mend - mstart)

        return fbytes, offset

    def engrave_matches(self):
        """
        Substitute all stored matches in their files, then write the files.

        For each file in `match_map`, the matches of grafts having a `subst`
        are applied in their stored order, while tracking how much
        the content has shifted;  finally all changed files are written.
        """
        for fpath, mqruples in self.match_map.items():
            if not mqruples:
                continue

            fbytes = self._read_file(fpath)
            offset = 0  # File growth/shrink as substituted?
            for prj, eng, graft, match in mqruples:
                if not graft.subst:
                    continue

                with self.errlogged(token='subst',
                                    doing="subst '%s' with %.28s.%.28s.%.28s.%.28s" %
                                    (fpath, prj, eng, graft, match)):

                    fbytes, offset = self._graft_match(
                        graft, fbytes, match, offset, prj)
                    self.log.debug(
                        "Substituted match in %i(%+i)-bytes file '%s': "
                        "\n  %s\n  %s\n  %s \n  %s",
                        len(fbytes), offset, fpath,
                        match, graft, eng, prj)

            self._set_file_bytes(fpath, fbytes)

        self._write_all_files()