forked from Gitlink/gitea-1120-rc1
Exclude generated files from language statistics (#11653)
* Update go-enry to v2.5.2
This commit is contained in:
parent
e8955173a9
commit
bd2335671f
2
go.mod
2
go.mod
|
@ -37,7 +37,7 @@ require (
|
|||
github.com/facebookgo/subset v0.0.0-20150612182917-8dac2c3c4870 // indirect
|
||||
github.com/gliderlabs/ssh v0.2.2
|
||||
github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a // indirect
|
||||
github.com/go-enry/go-enry/v2 v2.3.0
|
||||
github.com/go-enry/go-enry/v2 v2.5.2
|
||||
github.com/go-git/go-billy/v5 v5.0.0
|
||||
github.com/go-git/go-git/v5 v5.0.0
|
||||
github.com/go-openapi/jsonreference v0.19.3 // indirect
|
||||
|
|
12
go.sum
12
go.sum
|
@ -193,10 +193,10 @@ github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a h1:FQqo
|
|||
github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a/go.mod h1:/20jfyN9Y5QPEAprSgKAUr+glWDY39ZiUEAYOEv5dsE=
|
||||
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31 h1:gclg6gY70GLy3PbkQ1AERPfmLMMagS60DKF78eWwLn8=
|
||||
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31/go.mod h1:Ogl1Tioa0aV7gstGFO7KhffUsb9M4ydbEbbxpcEDc24=
|
||||
github.com/go-enry/go-enry/v2 v2.3.0 h1:o8KwgY6uSplysrIpj+Y42J/xGPp90ogVpxE2Z3s8Unk=
|
||||
github.com/go-enry/go-enry/v2 v2.3.0/go.mod h1:+xFJwbqWi15bvqFHb2ELUWVRKFQtwB61+sDrkvvxxGI=
|
||||
github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs=
|
||||
github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
|
||||
github.com/go-enry/go-enry/v2 v2.5.2 h1:3f3PFAO6JitWkPi1GQ5/m6Xu4gNL1U5soJ8QaYqJ0YQ=
|
||||
github.com/go-enry/go-enry/v2 v2.5.2/go.mod h1:GVzIiAytiS5uT/QiuakK7TF1u4xDab87Y8V5EJRpsIQ=
|
||||
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
|
||||
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
|
||||
github.com/go-git/gcfg v1.5.0 h1:Q5ViNfGF8zFgyJWPqYwA7qGFoMTEiBmdlkcfRmpIMa4=
|
||||
github.com/go-git/gcfg v1.5.0/go.mod h1:5m20vg6GwYabIxaOonVkTdrILxQMpEShl1xiMF4ua+E=
|
||||
github.com/go-git/go-billy/v5 v5.0.0 h1:7NQHvd9FVid8VL4qVUMm8XifBK+2xCoZ2lSk0agRrHM=
|
||||
|
@ -616,8 +616,6 @@ github.com/tinylib/msgp v1.1.0/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDW
|
|||
github.com/tinylib/msgp v1.1.2 h1:gWmO7n0Ys2RBEb7GPYB9Ujq8Mk5p2U08lRnmMcGy6BQ=
|
||||
github.com/tinylib/msgp v1.1.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE=
|
||||
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
|
||||
github.com/toqueteos/trie v1.0.0 h1:8i6pXxNUXNRAqP246iibb7w/pSFquNTQ+uNfriG7vlk=
|
||||
github.com/toqueteos/trie v1.0.0/go.mod h1:Ywk48QhEqhU1+DwhMkJ2x7eeGxDHiGkAdc9+0DYcbsM=
|
||||
github.com/toqueteos/webbrowser v1.2.0 h1:tVP/gpK69Fx+qMJKsLE7TD8LuGWPnEV71wBN9rrstGQ=
|
||||
github.com/toqueteos/webbrowser v1.2.0/go.mod h1:XWoZq4cyp9WeUeak7w7LXRUQf1F1ATJMir8RTqb4ayM=
|
||||
github.com/tstranex/u2f v1.0.0 h1:HhJkSzDDlVSVIVt7pDJwCHQj67k7A5EeBgPmeD+pVsQ=
|
||||
|
@ -876,8 +874,6 @@ gopkg.in/testfixtures.v2 v2.5.0 h1:N08B7l2GzFQenyYbzqthDnKAA+cmb17iAZhhFxr7JHw=
|
|||
gopkg.in/testfixtures.v2 v2.5.0/go.mod h1:vyAq+MYCgNpR29qitQdLZhdbLFf4mR/2MFJRFoQZZ2M=
|
||||
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
|
||||
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
|
||||
gopkg.in/toqueteos/substring.v1 v1.0.2 h1:urLqCeMm6x/eTuQa1oZerNw8N1KNOIp5hD5kGL7lFsE=
|
||||
gopkg.in/toqueteos/substring.v1 v1.0.2/go.mod h1:Eb2Z1UYehlVK8LYW2WBVR2rwbujsz3aX8XDrM1vbNew=
|
||||
gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME=
|
||||
gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI=
|
||||
gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74=
|
||||
|
|
|
@ -10,8 +10,8 @@ import (
|
|||
"github.com/go-enry/go-enry/v2"
|
||||
)
|
||||
|
||||
// GetCodeLanguageWithCallback detects code language based on file name and content using callback
|
||||
func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, error)) string {
|
||||
// GetCodeLanguage detects code language based on file name and content
|
||||
func GetCodeLanguage(filename string, content []byte) string {
|
||||
if language, ok := enry.GetLanguageByExtension(filename); ok {
|
||||
return language
|
||||
}
|
||||
|
@ -20,17 +20,9 @@ func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, er
|
|||
return language
|
||||
}
|
||||
|
||||
content, err := contentFunc()
|
||||
if err != nil {
|
||||
if len(content) == 0 {
|
||||
return enry.OtherLanguage
|
||||
}
|
||||
|
||||
return enry.GetLanguage(filepath.Base(filename), content)
|
||||
}
|
||||
|
||||
// GetCodeLanguage detects code language based on file name and content
|
||||
func GetCodeLanguage(filename string, content []byte) string {
|
||||
return GetCodeLanguageWithCallback(filename, func() ([]byte, error) {
|
||||
return content, nil
|
||||
})
|
||||
}
|
||||
|
|
|
@ -50,11 +50,15 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
|
|||
return nil
|
||||
}
|
||||
|
||||
// If content can not be read just do detection by filename
|
||||
content, _ := readFile(f, fileSizeLimit)
|
||||
if enry.IsGenerated(f.Name, content) {
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: Use .gitattributes file for linguist overrides
|
||||
|
||||
language := analyze.GetCodeLanguageWithCallback(f.Name, func() ([]byte, error) {
|
||||
return readFile(f, fileSizeLimit)
|
||||
})
|
||||
language := analyze.GetCodeLanguage(f.Name, content)
|
||||
if language == enry.OtherLanguage || language == "" {
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -1,26 +1,26 @@
|
|||
# go-enry [![GoDoc](https://godoc.org/github.com/go-enry/go-enry?status.svg)](https://pkg.go.dev/github.com/go-enry/go-enry/v2) [![Test](https://github.com/go-enry/go-enry/workflows/Test/badge.svg)](https://github.com/go-enry/go-enry/actions?query=workflow%3ATest+branch%3Amaster) [![codecov](https://codecov.io/gh/go-enry/go-enry/branch/master/graph/badge.svg)](https://codecov.io/gh/go-enry/go-enry)
|
||||
|
||||
Programming language detector and toolbox to ignore binary or vendored files. *enry* started as a port to _Go_ of the original [Linguist](https://github.com/github/linguist) _Ruby_ library, and has an improved *2x performance*.
|
||||
Programming language detector and toolbox to ignore binary or vendored files. _enry_ started as a port to _Go_ of the original [Linguist](https://github.com/github/linguist) _Ruby_ library, and has an improved _2x performance_.
|
||||
|
||||
* [CLI](#cli)
|
||||
* [Library](#library)
|
||||
* [Use cases](#use-cases)
|
||||
* [By filename](#by-filename)
|
||||
* [By text](#by-text)
|
||||
* [By file](#by-file)
|
||||
* [Filtering](#filtering-vendoring-binaries-etc)
|
||||
* [Coloring](#language-colors-and-groups)
|
||||
* [Languages](#languages)
|
||||
* [Go](#go)
|
||||
* [Java bindings](#java-bindings)
|
||||
* [Python bindings](#python-bindings)
|
||||
* [Divergences from linguist](#divergences-from-linguist)
|
||||
* [Benchmarks](#benchmarks)
|
||||
* [Why Enry?](#why-enry)
|
||||
* [Development](#development)
|
||||
* [Sync with github/linguist upstream](#sync-with-githublinguist-upstream)
|
||||
* [Misc](#misc)
|
||||
* [License](#license)
|
||||
- [CLI](#cli)
|
||||
- [Library](#library)
|
||||
- [Use cases](#use-cases)
|
||||
- [By filename](#by-filename)
|
||||
- [By text](#by-text)
|
||||
- [By file](#by-file)
|
||||
- [Filtering](#filtering-vendoring-binaries-etc)
|
||||
- [Coloring](#language-colors-and-groups)
|
||||
- [Languages](#languages)
|
||||
- [Go](#go)
|
||||
- [Java bindings](#java-bindings)
|
||||
- [Python bindings](#python-bindings)
|
||||
- [Divergences from linguist](#divergences-from-linguist)
|
||||
- [Benchmarks](#benchmarks)
|
||||
- [Why Enry?](#why-enry)
|
||||
- [Development](#development)
|
||||
- [Sync with github/linguist upstream](#sync-with-githublinguist-upstream)
|
||||
- [Misc](#misc)
|
||||
- [License](#license)
|
||||
|
||||
# CLI
|
||||
|
||||
|
@ -28,50 +28,62 @@ The CLI binary is hosted in a separate repository [go-enry/enry](https://github.
|
|||
|
||||
# Library
|
||||
|
||||
*enry* is also a Go library for guessing a programming language that exposes API through FFI to multiple programming environments.
|
||||
_enry_ is also a Go library for guessing a programming language that exposes API through FFI to multiple programming environments.
|
||||
|
||||
## Use cases
|
||||
|
||||
*enry* guesses a programming language using a sequence of matching *strategies* that are
|
||||
applied progressively to narrow down the possible options. Each *strategy* varies on the type
|
||||
_enry_ guesses a programming language using a sequence of matching _strategies_ that are
|
||||
applied progressively to narrow down the possible options. Each _strategy_ varies on the type
|
||||
of input data that it needs to make a decision: file name, extension, the first line of the file, the full content of the file, etc.
|
||||
|
||||
Depending on available input data, enry API can be roughly divided into the next categories or use cases.
|
||||
|
||||
### By filename
|
||||
Next functions require only a name of the file to make a guess:
|
||||
- `GetLanguageByExtension` uses only the file extension (which may be ambiguous)
|
||||
- `GetLanguageByFilename` useful for cases like `.gitignore`, `.bashrc`, etc
|
||||
- all [filtering helpers](#filtering)
|
||||
|
||||
Please note that such guesses are expected not to be very accurate.
|
||||
Next functions require only a name of the file to make a guess:
|
||||
|
||||
- `GetLanguageByExtension` uses only the file extension (which may be ambiguous)
|
||||
- `GetLanguageByFilename` useful for cases like `.gitignore`, `.bashrc`, etc
|
||||
- all [filtering helpers](#filtering)
|
||||
|
||||
Please note that such guesses are expected not to be very accurate.
|
||||
|
||||
### By text
|
||||
To make a guess only based on the content of the file or a text snippet, use
|
||||
- `GetLanguageByShebang` reads only the first line of text to identify the [shebang](https://en.wikipedia.org/wiki/Shebang_(Unix)).
|
||||
- `GetLanguageByModeline` for cases when Vim/Emacs modeline e.g. `/* vim: set ft=cpp: */` may be present at a head or a tail of the text.
|
||||
- `GetLanguageByClassifier` uses a Bayesian classifier trained on all the `./samples/` from Linguist.
|
||||
|
||||
It usually is a last-resort strategy that is used to disambiguate the guess of the previous strategies, and thus it requires a list of "candidate" guesses. One can provide a list of all known languages - keys from the `data.LanguagesLogProbabilities` as possible candidates if more intelligent hypotheses are not available, at the price of possibly suboptimal accuracy.
|
||||
To make a guess only based on the content of the file or a text snippet, use
|
||||
|
||||
- `GetLanguageByShebang` reads only the first line of text to identify the [shebang](<https://en.wikipedia.org/wiki/Shebang_(Unix)>).
|
||||
- `GetLanguageByModeline` for cases when Vim/Emacs modeline e.g. `/* vim: set ft=cpp: */` may be present at a head or a tail of the text.
|
||||
- `GetLanguageByClassifier` uses a Bayesian classifier trained on all the `./samples/` from Linguist.
|
||||
|
||||
It usually is a last-resort strategy that is used to disambiguate the guess of the previous strategies, and thus it requires a list of "candidate" guesses. One can provide a list of all known languages - keys from the `data.LanguagesLogProbabilities` as possible candidates if more intelligent hypotheses are not available, at the price of possibly suboptimal accuracy.
|
||||
|
||||
### By file
|
||||
|
||||
The most accurate guess would be one when both, the file name and the content are available:
|
||||
- `GetLanguagesByContent` only uses file extension and a set of regexp-based content heuristics.
|
||||
- `GetLanguages` uses the full set of matching strategies and is expected to be most accurate.
|
||||
|
||||
- `GetLanguagesByContent` only uses file extension and a set of regexp-based content heuristics.
|
||||
- `GetLanguages` uses the full set of matching strategies and is expected to be most accurate.
|
||||
|
||||
### Filtering: vendoring, binaries, etc
|
||||
*enry* exposes a set of file-level helpers `Is*` to simplify filtering out the files that are less interesting for the purpose of source code analysis:
|
||||
- `IsBinary`
|
||||
- `IsVendor`
|
||||
- `IsConfiguration`
|
||||
- `IsDocumentation`
|
||||
- `IsDotFile`
|
||||
- `IsImage`
|
||||
|
||||
_enry_ exposes a set of file-level helpers `Is*` to simplify filtering out the files that are less interesting for the purpose of source code analysis:
|
||||
|
||||
- `IsBinary`
|
||||
- `IsVendor`
|
||||
- `IsConfiguration`
|
||||
- `IsDocumentation`
|
||||
- `IsDotFile`
|
||||
- `IsImage`
|
||||
- `IsTest`
|
||||
- `IsGenerated`
|
||||
|
||||
### Language colors and groups
|
||||
*enry* exposes function to get language color to use for example in presenting statistics in graphs:
|
||||
- `GetColor`
|
||||
- `GetLanguageGroup` can be used to group similar languages together e.g. for `Less` this function will return `CSS`
|
||||
|
||||
_enry_ exposes function to get language color to use for example in presenting statistics in graphs:
|
||||
|
||||
- `GetColor`
|
||||
- `GetLanguageGroup` can be used to group similar languages together e.g. for `Less` this function will return `CSS`
|
||||
|
||||
## Languages
|
||||
|
||||
|
@ -136,39 +148,36 @@ Generated Python bindings using a C shared library and cffi are WIP under [src-d
|
|||
A library is going to be published on pypi as [enry](https://pypi.org/project/enry/) for
|
||||
macOS and linux platforms. Windows support is planned under [src-d/enry#150](https://github.com/src-d/enry/issues/150).
|
||||
|
||||
Divergences from Linguist
|
||||
------------
|
||||
## Divergences from Linguist
|
||||
|
||||
The `enry` library is based on the data from `github/linguist` version **v7.9.0**.
|
||||
|
||||
Parsing [linguist/samples](https://github.com/github/linguist/tree/master/samples) the following `enry` results are different from the Linguist:
|
||||
|
||||
* [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine.
|
||||
- [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine.
|
||||
|
||||
* [Heuristics for ".rno" extension](https://github.com/github/linguist/blob/3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d/lib/linguist/heuristics.yml#L365) in RUNOFF could not be parsed, due to unsupported lookahead in RE2 regexp engine.
|
||||
- [Heuristics for ".rno" extension](https://github.com/github/linguist/blob/3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d/lib/linguist/heuristics.yml#L365) in RUNOFF could not be parsed, due to unsupported lookahead in RE2 regexp engine.
|
||||
|
||||
* [Heuristics for ".inc" extension](https://github.com/github/linguist/blob/f0e2d0d7f1ce600b2a5acccaef6b149c87d8b99c/lib/linguist/heuristics.yml#L222) in NASL could not be parsed, due to unsupported possessive quantifier in RE2 regexp engine.
|
||||
- [Heuristics for ".inc" extension](https://github.com/github/linguist/blob/f0e2d0d7f1ce600b2a5acccaef6b149c87d8b99c/lib/linguist/heuristics.yml#L222) in NASL could not be parsed, due to unsupported possessive quantifier in RE2 regexp engine.
|
||||
|
||||
* As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).
|
||||
- As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).
|
||||
|
||||
* Bayesian classifier can't distinguish "SQL" from "PLpgSQL". See [#194](https://github.com/src-d/enry/issues/194).
|
||||
- Bayesian classifier can't distinguish "SQL" from "PLpgSQL". See [#194](https://github.com/src-d/enry/issues/194).
|
||||
|
||||
* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
|
||||
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
|
||||
- Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
|
||||
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
|
||||
|
||||
* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
|
||||
- XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
|
||||
|
||||
* Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
|
||||
- Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
|
||||
|
||||
* `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does
|
||||
- `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does
|
||||
|
||||
In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.
|
||||
|
||||
## Benchmarks
|
||||
|
||||
Benchmarks
|
||||
------------
|
||||
|
||||
Enry's language detection has been compared with Linguist's on [*linguist/samples*](https://github.com/github/linguist/tree/master/samples).
|
||||
Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples).
|
||||
|
||||
We got these results:
|
||||
|
||||
|
@ -182,9 +191,7 @@ Go regexp engine being slower than Ruby's on, wich is based on [oniguruma](https
|
|||
|
||||
See [instructions](#misc) for running enry with oniguruma.
|
||||
|
||||
|
||||
Why Enry?
|
||||
------------
|
||||
## Why Enry?
|
||||
|
||||
In the movie [My Fair Lady](https://en.wikipedia.org/wiki/My_Fair_Lady), [Professor Henry Higgins](http://www.imdb.com/character/ch0011719/) is a linguist who at the very beginning of the movie enjoys guessing the origin of people based on their accent.
|
||||
|
||||
|
@ -199,10 +206,9 @@ To run the tests use:
|
|||
Setting `ENRY_TEST_REPO` to the path of an existing checkout of Linguist will avoid cloning it and speed the tests up.
|
||||
Setting `ENRY_DEBUG=1` will provide insight in the Bayesian classifier building done by `make code-generate`.
|
||||
|
||||
|
||||
### Sync with github/linguist upstream
|
||||
|
||||
*enry* re-uses parts of the original [github/linguist](https://github.com/github/linguist) to generate internal data structures.
|
||||
_enry_ re-uses parts of the original [github/linguist](https://github.com/github/linguist) to generate internal data structures.
|
||||
In order to update to the latest release of linguist do:
|
||||
|
||||
```bash
|
||||
|
@ -217,10 +223,10 @@ $ make code-generate
|
|||
|
||||
To stay in sync, enry needs to be updated when a new release of the linguist includes changes to any of the following files:
|
||||
|
||||
* [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml)
|
||||
* [heuristics.yml](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.yml)
|
||||
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
|
||||
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
|
||||
- [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml)
|
||||
- [heuristics.yml](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.yml)
|
||||
- [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
|
||||
- [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
|
||||
|
||||
There is no automation for detecting the changes in the linguist project, so this process above has to be done manually from time to time.
|
||||
|
||||
|
@ -229,8 +235,6 @@ the generated files (in [data](https://github.com/go-enry/go-enry/blob/master/da
|
|||
|
||||
Separating all the necessary "manual" code changes to a different PR that includes some background description and an update to the documentation on ["divergences from linguist"](#divergences-from-linguist) is very much appreciated as it simplifies the maintenance (review/release notes/etc).
|
||||
|
||||
|
||||
|
||||
## Misc
|
||||
|
||||
<details>
|
||||
|
@ -238,19 +242,20 @@ Separating all the necessary "manual" code changes to a different PR that includ
|
|||
|
||||
### Benchmark
|
||||
|
||||
All benchmark scripts are in [*benchmarks*](https://github.com/go-enry/go-enry/blob/master/benchmarks) directory.
|
||||
|
||||
All benchmark scripts are in [_benchmarks_](https://github.com/go-enry/go-enry/blob/master/benchmarks) directory.
|
||||
|
||||
#### Dependencies
|
||||
As benchmarks depend on Ruby and the Github-Linguist gem, make sure you have:
|
||||
- Ruby (e.g using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed
|
||||
- Docker
|
||||
- [native dependencies](https://github.com/github/linguist/#dependencies) installed
|
||||
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
|
||||
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`
|
||||
|
||||
As benchmarks depend on Ruby and the Github-Linguist gem, make sure you have:
|
||||
|
||||
- Ruby (e.g using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed
|
||||
- Docker
|
||||
- [native dependencies](https://github.com/github/linguist/#dependencies) installed
|
||||
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
|
||||
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`
|
||||
|
||||
#### Quick benchmark
|
||||
|
||||
To run quicker benchmarks
|
||||
|
||||
make benchmarks
|
||||
|
@ -259,19 +264,20 @@ to get average times for the primary detection function and strategies for the w
|
|||
|
||||
make benchmarks-samples
|
||||
|
||||
|
||||
#### Full benchmark
|
||||
|
||||
If you want to reproduce the same benchmarks as reported above:
|
||||
- Make sure all [dependencies](#benchmark-dependencies) are installed
|
||||
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
|
||||
- Run `ENRY_TEST_REPO="$PWD/.linguist" benchmarks/run.sh` (takes ~15h)
|
||||
|
||||
- Make sure all [dependencies](#benchmark-dependencies) are installed
|
||||
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
|
||||
- Run `ENRY_TEST_REPO="$PWD/.linguist" benchmarks/run.sh` (takes ~15h)
|
||||
|
||||
It will run the benchmarks for enry and Linguist, parse the output, create csv files and plot the histogram.
|
||||
|
||||
### Faster regexp engine (optional)
|
||||
|
||||
[Oniguruma](https://github.com/kkos/oniguruma) is CRuby's regular expression engine.
|
||||
It is very fast and performs better than the one built into Go runtime. *enry* supports swapping
|
||||
It is very fast and performs better than the one built into Go runtime. _enry_ supports swapping
|
||||
between those two engines thanks to [rubex](https://github.com/moovweb/rubex) project.
|
||||
The typical overall speedup from using Oniguruma is 1.5-2x. However, it requires CGo and the external shared library.
|
||||
On macOS with [Homebrew](https://brew.sh/), it is:
|
||||
|
@ -296,8 +302,6 @@ and then rebuild the project.
|
|||
|
||||
</details>
|
||||
|
||||
|
||||
License
|
||||
------------
|
||||
## License
|
||||
|
||||
Apache License, Version 2.0. See [LICENSE](LICENSE)
|
||||
|
|
|
@ -328,15 +328,13 @@ func getInterpreter(data []byte) (interpreter string) {
|
|||
return
|
||||
}
|
||||
|
||||
func getFirstLine(data []byte) []byte {
|
||||
buf := bufio.NewScanner(bytes.NewReader(data))
|
||||
buf.Scan()
|
||||
line := buf.Bytes()
|
||||
if err := buf.Err(); err != nil {
|
||||
return nil
|
||||
func getFirstLine(content []byte) []byte {
|
||||
nlpos := bytes.IndexByte(content, '\n')
|
||||
if nlpos < 0 {
|
||||
return content
|
||||
}
|
||||
|
||||
return line
|
||||
return content[:nlpos]
|
||||
}
|
||||
|
||||
func hasShebang(line []byte) bool {
|
||||
|
|
|
@ -3,24 +3,24 @@
|
|||
|
||||
package data
|
||||
|
||||
import "gopkg.in/toqueteos/substring.v1"
|
||||
import "github.com/go-enry/go-enry/v2/regex"
|
||||
|
||||
var DocumentationMatchers = substring.Or(
|
||||
substring.Regexp(`^[Dd]ocs?/`),
|
||||
substring.Regexp(`(^|/)[Dd]ocumentation/`),
|
||||
substring.Regexp(`(^|/)[Gg]roovydoc/`),
|
||||
substring.Regexp(`(^|/)[Jj]avadoc/`),
|
||||
substring.Regexp(`^[Mm]an/`),
|
||||
substring.Regexp(`^[Ee]xamples/`),
|
||||
substring.Regexp(`^[Dd]emos?/`),
|
||||
substring.Regexp(`(^|/)inst/doc/`),
|
||||
substring.Regexp(`(^|/)CHANGE(S|LOG)?(\.|$)`),
|
||||
substring.Regexp(`(^|/)CONTRIBUTING(\.|$)`),
|
||||
substring.Regexp(`(^|/)COPYING(\.|$)`),
|
||||
substring.Regexp(`(^|/)INSTALL(\.|$)`),
|
||||
substring.Regexp(`(^|/)LICEN[CS]E(\.|$)`),
|
||||
substring.Regexp(`(^|/)[Ll]icen[cs]e(\.|$)`),
|
||||
substring.Regexp(`(^|/)README(\.|$)`),
|
||||
substring.Regexp(`(^|/)[Rr]eadme(\.|$)`),
|
||||
substring.Regexp(`^[Ss]amples?/`),
|
||||
)
|
||||
var DocumentationMatchers = []regex.EnryRegexp{
|
||||
regex.MustCompile(`^[Dd]ocs?/`),
|
||||
regex.MustCompile(`(^|/)[Dd]ocumentation/`),
|
||||
regex.MustCompile(`(^|/)[Gg]roovydoc/`),
|
||||
regex.MustCompile(`(^|/)[Jj]avadoc/`),
|
||||
regex.MustCompile(`^[Mm]an/`),
|
||||
regex.MustCompile(`^[Ee]xamples/`),
|
||||
regex.MustCompile(`^[Dd]emos?/`),
|
||||
regex.MustCompile(`(^|/)inst/doc/`),
|
||||
regex.MustCompile(`(^|/)CHANGE(S|LOG)?(\.|$)`),
|
||||
regex.MustCompile(`(^|/)CONTRIBUTING(\.|$)`),
|
||||
regex.MustCompile(`(^|/)COPYING(\.|$)`),
|
||||
regex.MustCompile(`(^|/)INSTALL(\.|$)`),
|
||||
regex.MustCompile(`(^|/)LICEN[CS]E(\.|$)`),
|
||||
regex.MustCompile(`(^|/)[Ll]icen[cs]e(\.|$)`),
|
||||
regex.MustCompile(`(^|/)README(\.|$)`),
|
||||
regex.MustCompile(`(^|/)[Rr]eadme(\.|$)`),
|
||||
regex.MustCompile(`^[Ss]amples?/`),
|
||||
}
|
||||
|
|
|
@ -0,0 +1,823 @@
|
|||
package data
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
|
||||
"github.com/go-enry/go-enry/v2/regex"
|
||||
)
|
||||
|
||||
// GeneratedCodeExtensions contains all extensions that belong to generated
|
||||
// files for sure.
|
||||
var GeneratedCodeExtensions = map[string]struct{}{
|
||||
// XCode files
|
||||
".nib": {},
|
||||
".xcworkspacedata": {},
|
||||
".xcuserstate": {},
|
||||
}
|
||||
|
||||
// GeneratedCodeNameMatcher is a function that tells whether the file with the
|
||||
// given name is generated.
|
||||
type GeneratedCodeNameMatcher func(string) bool
|
||||
|
||||
// nameMatches builds a GeneratedCodeNameMatcher from a regular expression.
// The returned matcher reports whether the given file name matches pattern.
// NOTE(review): regex.MustCompile presumably panics on an invalid pattern, as
// Must-style helpers do by convention — confirm against the regex package.
func nameMatches(pattern string) GeneratedCodeNameMatcher {
|
||||
// Compile outside the closure so the expression is built once per matcher,
// not on every call.
r := regex.MustCompile(pattern)
|
||||
return func(name string) bool {
|
||||
return r.MatchString(name)
|
||||
}
|
||||
}
|
||||
|
||||
// nameContains builds a GeneratedCodeNameMatcher that reports whether the
// file name contains pattern as a literal substring (strings.Contains); the
// pattern is not interpreted as a regular expression.
func nameContains(pattern string) GeneratedCodeNameMatcher {
|
||||
return func(name string) bool {
|
||||
return strings.Contains(name, pattern)
|
||||
}
|
||||
}
|
||||
|
||||
// nameEndsWith builds a GeneratedCodeNameMatcher that reports whether the
// file name ends with pattern as a literal suffix (strings.HasSuffix); the
// pattern is not interpreted as a regular expression.
func nameEndsWith(pattern string) GeneratedCodeNameMatcher {
|
||||
return func(name string) bool {
|
||||
return strings.HasSuffix(name, pattern)
|
||||
}
|
||||
}
|
||||
|
||||
// GeneratedCodeNameMatchers are all the matchers that check whether the code
|
||||
// is generated based only on the file name.
|
||||
var GeneratedCodeNameMatchers = []GeneratedCodeNameMatcher{
|
||||
// Cocoa pods
|
||||
nameMatches(`(^Pods|\/Pods)\/`),
|
||||
|
||||
// Carthage build
|
||||
nameMatches(`(^|\/)Carthage\/Build\/`),
|
||||
|
||||
// NET designer file
|
||||
nameMatches(`(?i)\.designer\.(cs|vb)$`),
|
||||
|
||||
// Generated NET specflow feature file
|
||||
nameEndsWith(".feature.cs"),
|
||||
|
||||
// Node modules
|
||||
nameContains("node_modules/"),
|
||||
|
||||
// Go vendor
|
||||
nameMatches(`vendor\/([-0-9A-Za-z]+\.)+(com|edu|gov|in|me|net|org|fm|io)`),
|
||||
|
||||
// Go lock
|
||||
nameEndsWith("Gopkg.lock"),
|
||||
nameEndsWith("glide.lock"),
|
||||
|
||||
// Esy lock
|
||||
nameMatches(`(^|\/)(\w+\.)?esy.lock$`),
|
||||
|
||||
// NPM shrinkwrap
|
||||
nameEndsWith("npm-shrinkwrap.json"),
|
||||
|
||||
// NPM package lock
|
||||
nameEndsWith("package-lock.json"),
|
||||
|
||||
// Yarn plugnplay
|
||||
nameMatches(`(^|\/)\.pnp\.(c|m)?js$`),
|
||||
|
||||
// Godeps
|
||||
nameContains("Godeps/"),
|
||||
|
||||
// Composer lock
|
||||
nameEndsWith("composer.lock"),
|
||||
|
||||
// Generated by zephir
|
||||
nameMatches(`.\.zep\.(?:c|h|php)$`),
|
||||
|
||||
// Cargo lock
|
||||
nameEndsWith("Cargo.lock"),
|
||||
|
||||
// Pipenv lock
|
||||
nameEndsWith("Pipfile.lock"),
|
||||
|
||||
// GraphQL relay
|
||||
nameContains("__generated__/"),
|
||||
}
|
||||
|
||||
// GeneratedCodeMatcher checks whether the file with the given data is
|
||||
// generated code.
|
||||
type GeneratedCodeMatcher func(path, ext string, content []byte) bool
|
||||
|
||||
// GeneratedCodeMatchers is the list of all generated code matchers that
|
||||
// rely on checking the content of the file to make the guess.
|
||||
var GeneratedCodeMatchers = []GeneratedCodeMatcher{
|
||||
isMinifiedFile,
|
||||
hasSourceMapReference,
|
||||
isSourceMap,
|
||||
isCompiledCoffeeScript,
|
||||
isGeneratedNetDocfile,
|
||||
isGeneratedJavaScriptPEGParser,
|
||||
isGeneratedPostScript,
|
||||
isGeneratedGo,
|
||||
isGeneratedProtobuf,
|
||||
isGeneratedJavaScriptProtocolBuffer,
|
||||
isGeneratedApacheThrift,
|
||||
isGeneratedJNIHeader,
|
||||
isVCRCassette,
|
||||
isCompiledCythonFile,
|
||||
isGeneratedModule,
|
||||
isGeneratedUnity3DMeta,
|
||||
isGeneratedRacc,
|
||||
isGeneratedJFlex,
|
||||
isGeneratedGrammarKit,
|
||||
isGeneratedRoxygen2,
|
||||
isGeneratedJison,
|
||||
isGeneratedGRPCCpp,
|
||||
isGeneratedDart,
|
||||
isGeneratedPerlPPPortHeader,
|
||||
isGeneratedGameMakerStudio,
|
||||
isGeneratedGimp,
|
||||
isGeneratedVisualStudio6,
|
||||
isGeneratedHaxe,
|
||||
isGeneratedHTML,
|
||||
isGeneratedJooq,
|
||||
}
|
||||
|
||||
// canBeMinified reports whether ext is one of the extensions checked for
// minification: exactly ".js" or ".css". The comparison is case-sensitive and
// expects the leading dot.
func canBeMinified(ext string) bool {
|
||||
return ext == ".js" || ext == ".css"
|
||||
}
|
||||
|
||||
// isMinifiedFile reports whether the file looks minified.
|
||||
// A file is considered minified when its extension is ".js" or ".css" and its
|
||||
// average number of bytes per line is more than 110; path is not consulted.
|
||||
func isMinifiedFile(path, ext string, content []byte) bool {
|
||||
if !canBeMinified(ext) {
|
||||
return false
|
||||
}
|
||||
|
||||
// Accumulate the total line length and the line count in a single pass.
// NOTE(review): whether each line includes its trailing newline depends on
// forEachLine, which is defined elsewhere — confirm.
var chars, lines uint64
|
||||
forEachLine(content, func(line []byte) {
|
||||
chars += uint64(len(line))
|
||||
lines++
|
||||
})
|
||||
|
||||
// No lines were visited: not minified, and this avoids dividing by zero.
if lines == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// Integer division truncates, so the average must reach at least 111.
return chars/lines > 110
|
||||
}
|
||||
|
||||
var sourceMapRegex = regex.MustCompile(`^\/[*\/][\#@] source(?:Mapping)?URL|sourceURL=`)
|
||||
|
||||
// hasSourceMapReference returns whether the file contains a reference to a
|
||||
// source-map file.
|
||||
func hasSourceMapReference(_ string, ext string, content []byte) bool {
|
||||
if !canBeMinified(ext) {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, line := range getLines(content, -2) {
|
||||
if sourceMapRegex.Match(line) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
var sourceMapRegexps = []regex.EnryRegexp{
|
||||
regex.MustCompile(`^{"version":\d+,`),
|
||||
regex.MustCompile(`^\/\*\* Begin line maps\. \*\*\/{`),
|
||||
}
|
||||
|
||||
// isSourceMap returns whether the file itself is a source map.
|
||||
func isSourceMap(path, _ string, content []byte) bool {
|
||||
if strings.HasSuffix(path, ".js.map") || strings.HasSuffix(path, ".css.map") {
|
||||
return true
|
||||
}
|
||||
|
||||
firstLine := getFirstLine(content)
|
||||
if len(firstLine) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, r := range sourceMapRegexps {
|
||||
if r.Match(firstLine) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isCompiledCoffeeScript(path, ext string, content []byte) bool {
|
||||
if ext != ".js" {
|
||||
return false
|
||||
}
|
||||
|
||||
firstLine := getFirstLine(content)
|
||||
lastLines := getLines(content, -2)
|
||||
if len(lastLines) < 2 {
|
||||
return false
|
||||
}
|
||||
|
||||
if string(firstLine) == "(function() {" &&
|
||||
string(lastLines[1]) == "}).call(this);" &&
|
||||
string(lastLines[0]) == "" {
|
||||
score := 0
|
||||
|
||||
forEachLine(content, func(line []byte) {
|
||||
if bytes.Contains(line, []byte("var ")) {
|
||||
// Underscored temp vars are likely to be Coffee
|
||||
score += 1 * countAppearancesInLine(line, "_fn", "_i", "_len", "_ref", "_results")
|
||||
|
||||
// bind and extend functions are very Coffee specific
|
||||
score += 3 * countAppearancesInLine(line, "__bind", "__extends", "__hasProp", "__indexOf", "__slice")
|
||||
}
|
||||
})
|
||||
|
||||
// Require a score of 3. This is fairly abritrary. Consider tweaking later.
|
||||
// See: https://github.com/github/linguist/blob/master/lib/linguist/generated.rb#L176-L213
|
||||
return score >= 3
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// isGeneratedNetDocfile reports whether an .xml file has the layout checked
// here: more than three '\n'-separated segments, "<doc>" in the second,
// "<assembly>" in the third and "</doc>" in the second-to-last one.
func isGeneratedNetDocfile(_, ext string, content []byte) bool {
	if ext != ".xml" {
		return false
	}

	rows := bytes.Split(content, []byte("\n"))
	if len(rows) < 4 {
		return false
	}

	if !bytes.Contains(rows[1], []byte("<doc>")) {
		return false
	}
	if !bytes.Contains(rows[2], []byte("<assembly>")) {
		return false
	}

	return bytes.Contains(rows[len(rows)-2], []byte("</doc>"))
}
|
||||
|
||||
var pegJavaScriptGeneratedRegex = regex.MustCompile(`^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js`)
|
||||
|
||||
func isGeneratedJavaScriptPEGParser(_, ext string, content []byte) bool {
|
||||
if ext != ".js" {
|
||||
return false
|
||||
}
|
||||
|
||||
// PEG.js-generated parsers include a comment near the top of the file
|
||||
// that marks them as such.
|
||||
return pegJavaScriptGeneratedRegex.Match(bytes.Join(getLines(content, 5), []byte("")))
|
||||
}
|
||||
|
||||
var postScriptType1And42Regex = regex.MustCompile(`(\n|\r\n|\r)\s*(?:currentfile eexec\s+|\/sfnts\s+\[)`)
|
||||
|
||||
var postScriptRegexes = []regex.EnryRegexp{
|
||||
regex.MustCompile(`[0-9]|draw|mpage|ImageMagick|inkscape|MATLAB`),
|
||||
regex.MustCompile(`PCBNEW|pnmtops|\(Unknown\)|Serif Affinity|Filterimage -tops`),
|
||||
}
|
||||
|
||||
func isGeneratedPostScript(_, ext string, content []byte) bool {
|
||||
if ext != ".ps" && ext != ".eps" && ext != ".pfa" {
|
||||
return false
|
||||
}
|
||||
|
||||
// Type 1 and Type 42 fonts converted to PostScript are stored as hex-encoded byte streams; these
|
||||
// streams are always preceded the `eexec` operator (if Type 1), or the `/sfnts` key (if Type 42).
|
||||
if postScriptType1And42Regex.Match(content) {
|
||||
return true
|
||||
}
|
||||
|
||||
// We analyze the "%%Creator:" comment, which contains the author/generator
|
||||
// of the file. If there is one, it should be in one of the first few lines.
|
||||
var creator []byte
|
||||
for _, line := range getLines(content, 10) {
|
||||
if bytes.HasPrefix(line, []byte("%%Creator: ")) {
|
||||
creator = line
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if len(creator) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// EAGLE doesn't include a version number when it generates PostScript.
|
||||
// However, it does prepend its name to the document's "%%Title" field.
|
||||
if bytes.Contains(creator, []byte("EAGLE")) {
|
||||
for _, line := range getLines(content, 5) {
|
||||
if bytes.HasPrefix(line, []byte("%%Title: EAGLE Drawing ")) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Most generators write their version number, while human authors' or companies'
|
||||
// names don't contain numbers. So look if the line contains digits. Also
|
||||
// look for some special cases without version numbers.
|
||||
for _, r := range postScriptRegexes {
|
||||
if r.Match(creator) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isGeneratedGo(_, ext string, content []byte) bool {
|
||||
if ext != ".go" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 40)
|
||||
if len(lines) <= 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, line := range lines {
|
||||
if bytes.Contains(line, []byte("Code generated by")) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
var protoExtensions = map[string]struct{}{
|
||||
".py": {},
|
||||
".java": {},
|
||||
".h": {},
|
||||
".cc": {},
|
||||
".cpp": {},
|
||||
".m": {},
|
||||
".rb": {},
|
||||
".php": {},
|
||||
}
|
||||
|
||||
func isGeneratedProtobuf(_, ext string, content []byte) bool {
|
||||
if _, ok := protoExtensions[ext]; !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 3)
|
||||
if len(lines) <= 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, line := range lines {
|
||||
if bytes.Contains(line, []byte("Generated by the protocol buffer compiler. DO NOT EDIT!")) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isGeneratedJavaScriptProtocolBuffer(_, ext string, content []byte) bool {
|
||||
if ext != ".js" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 6)
|
||||
if len(lines) < 6 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[5], []byte("GENERATED CODE -- DO NOT EDIT!"))
|
||||
}
|
||||
|
||||
var apacheThriftExtensions = map[string]struct{}{
|
||||
".rb": {},
|
||||
".py": {},
|
||||
".go": {},
|
||||
".js": {},
|
||||
".m": {},
|
||||
".java": {},
|
||||
".h": {},
|
||||
".cc": {},
|
||||
".cpp": {},
|
||||
".php": {},
|
||||
}
|
||||
|
||||
func isGeneratedApacheThrift(_, ext string, content []byte) bool {
|
||||
if _, ok := apacheThriftExtensions[ext]; !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, line := range getLines(content, 6) {
|
||||
if bytes.Contains(line, []byte("Autogenerated by Thrift Compiler")) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isGeneratedJNIHeader(_, ext string, content []byte) bool {
|
||||
if ext != ".h" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 2)
|
||||
if len(lines) < 2 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[0], []byte("/* DO NOT EDIT THIS FILE - it is machine generated */")) &&
|
||||
bytes.Contains(lines[1], []byte("#include <jni.h>"))
|
||||
}
|
||||
|
||||
func isVCRCassette(_, ext string, content []byte) bool {
|
||||
if ext != ".yml" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, -2)
|
||||
if len(lines) < 2 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[1], []byte("recorded_with: VCR"))
|
||||
}
|
||||
|
||||
func isCompiledCythonFile(_, ext string, content []byte) bool {
|
||||
if ext != ".c" && ext != ".cpp" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[0], []byte("Generated by Cython"))
|
||||
}
|
||||
|
||||
func isGeneratedModule(_, ext string, content []byte) bool {
|
||||
if ext != ".mod" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[0], []byte("PCBNEW-LibModule-V")) ||
|
||||
bytes.Contains(lines[0], []byte("GFORTRAN module version '"))
|
||||
}
|
||||
|
||||
func isGeneratedUnity3DMeta(_, ext string, content []byte) bool {
|
||||
if ext != ".meta" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[0], []byte("fileFormatVersion: "))
|
||||
}
|
||||
|
||||
func isGeneratedRacc(_, ext string, content []byte) bool {
|
||||
if ext != ".rb" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 3)
|
||||
if len(lines) < 3 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.HasPrefix(lines[2], []byte("# This file is automatically generated by Racc"))
|
||||
}
|
||||
|
||||
func isGeneratedJFlex(_, ext string, content []byte) bool {
|
||||
if ext != ".java" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.HasPrefix(lines[0], []byte("/* The following code was generated by JFlex "))
|
||||
}
|
||||
|
||||
func isGeneratedGrammarKit(_, ext string, content []byte) bool {
|
||||
if ext != ".java" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[0], []byte("// This is a generated file. Not intended for manual editing."))
|
||||
}
|
||||
|
||||
func isGeneratedRoxygen2(_, ext string, content []byte) bool {
|
||||
if ext != ".rd" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[0], []byte("% Generated by roxygen2: do not edit by hand"))
|
||||
}
|
||||
|
||||
func isGeneratedJison(_, ext string, content []byte) bool {
|
||||
if ext != ".js" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[0], []byte("/* parser generated by jison ")) ||
|
||||
bytes.Contains(lines[0], []byte("/* generated by jison-lex "))
|
||||
}
|
||||
|
||||
func isGeneratedGRPCCpp(_, ext string, content []byte) bool {
|
||||
switch ext {
|
||||
case ".cpp", ".hpp", ".h", ".cc":
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[0], []byte("// Generated by the gRPC"))
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
var dartRegex = regex.MustCompile(`generated code\W{2,3}do not modify`)
|
||||
|
||||
func isGeneratedDart(_, ext string, content []byte) bool {
|
||||
if ext != ".dart" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
return dartRegex.Match(bytes.ToLower(lines[0]))
|
||||
}
|
||||
|
||||
func isGeneratedPerlPPPortHeader(name, _ string, content []byte) bool {
|
||||
if !strings.HasSuffix(name, "ppport.h") {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 10)
|
||||
if len(lines) < 10 {
|
||||
return false
|
||||
}
|
||||
|
||||
return bytes.Contains(lines[8], []byte("Automatically created by Devel::PPPort"))
|
||||
}
|
||||
|
||||
var (
|
||||
gameMakerStudioFirstLineRegex = regex.MustCompile(`^\d\.\d\.\d.+\|\{`)
|
||||
gameMakerStudioThirdLineRegex = regex.MustCompile(`\"modelName\"\:\s*\"GM`)
|
||||
)
|
||||
|
||||
func isGeneratedGameMakerStudio(_, ext string, content []byte) bool {
|
||||
if ext != ".yy" && ext != ".yyp" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 3)
|
||||
if len(lines) < 3 {
|
||||
return false
|
||||
}
|
||||
|
||||
return gameMakerStudioThirdLineRegex.Match(lines[2]) ||
|
||||
gameMakerStudioFirstLineRegex.Match(lines[0])
|
||||
}
|
||||
|
||||
var gimpRegexes = []regex.EnryRegexp{
|
||||
regex.MustCompile(`\/\* GIMP [a-zA-Z0-9\- ]+ C\-Source image dump \(.+?\.c\) \*\/`),
|
||||
regex.MustCompile(`\/\* GIMP header image file format \([a-zA-Z0-9\- ]+\)\: .+?\.h \*\/`),
|
||||
}
|
||||
|
||||
func isGeneratedGimp(_, ext string, content []byte) bool {
|
||||
if ext != ".c" && ext != ".h" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) < 1 {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, r := range gimpRegexes {
|
||||
if r.Match(lines[0]) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isGeneratedVisualStudio6(_, ext string, content []byte) bool {
|
||||
if ext != ".dsp" {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, l := range getLines(content, 3) {
|
||||
if bytes.Contains(l, []byte("# Microsoft Developer Studio Generated Build File")) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
var haxeExtensions = map[string]struct{}{
|
||||
".js": {},
|
||||
".py": {},
|
||||
".lua": {},
|
||||
".cpp": {},
|
||||
".h": {},
|
||||
".java": {},
|
||||
".cs": {},
|
||||
".php": {},
|
||||
}
|
||||
|
||||
func isGeneratedHaxe(_, ext string, content []byte) bool {
|
||||
if _, ok := haxeExtensions[ext]; !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, l := range getLines(content, 3) {
|
||||
if bytes.Contains(l, []byte("Generated by Haxe")) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
var (
|
||||
doxygenRegex = regex.MustCompile(`<!--\s+Generated by Doxygen\s+[.0-9]+\s*-->`)
|
||||
htmlMetaRegex = regex.MustCompile(`<meta(\s+[^>]+)>`)
|
||||
htmlMetaContentRegex = regex.MustCompile(`\s+(name|content|value)\s*=\s*("[^"]+"|'[^']+'|[^\s"']+)`)
|
||||
orgModeMetaRegex = regex.MustCompile(`org\s+mode`)
|
||||
)
|
||||
|
||||
func isGeneratedHTML(_, ext string, content []byte) bool {
|
||||
if ext != ".html" && ext != ".htm" && ext != ".xhtml" {
|
||||
return false
|
||||
}
|
||||
|
||||
lines := getLines(content, 30)
|
||||
|
||||
// Pkgdown
|
||||
if len(lines) >= 2 {
|
||||
for _, l := range lines[:2] {
|
||||
if bytes.Contains(l, []byte("<!-- Generated by pkgdown: do not edit by hand -->")) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Mandoc
|
||||
if len(lines) > 2 &&
|
||||
bytes.HasPrefix(lines[2], []byte("<!-- This is an automatically generated file.")) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Doxygen
|
||||
for _, l := range lines {
|
||||
if doxygenRegex.Match(l) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// HTML tag: <meta name="generator" content="" />
|
||||
part := bytes.ToLower(bytes.Join(lines, []byte{' '}))
|
||||
part = bytes.ReplaceAll(part, []byte{'\n'}, []byte{})
|
||||
part = bytes.ReplaceAll(part, []byte{'\r'}, []byte{})
|
||||
matches := htmlMetaRegex.FindAll(part, -1)
|
||||
if len(matches) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, m := range matches {
|
||||
var name, value, content string
|
||||
ms := htmlMetaContentRegex.FindAllStringSubmatch(string(m), -1)
|
||||
for _, m := range ms {
|
||||
switch m[1] {
|
||||
case "name":
|
||||
name = m[2]
|
||||
case "value":
|
||||
value = m[2]
|
||||
case "content":
|
||||
content = m[2]
|
||||
}
|
||||
}
|
||||
|
||||
var val = value
|
||||
if val == "" {
|
||||
val = content
|
||||
}
|
||||
|
||||
name = strings.Trim(name, `"'`)
|
||||
val = strings.Trim(val, `"'`)
|
||||
|
||||
if name != "generator" || val == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.Contains(val, "jlatex2html") ||
|
||||
strings.Contains(val, "latex2html") ||
|
||||
strings.Contains(val, "groff") ||
|
||||
strings.Contains(val, "makeinfo") ||
|
||||
strings.Contains(val, "texi2html") ||
|
||||
strings.Contains(val, "ronn") ||
|
||||
orgModeMetaRegex.MatchString(val) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isGeneratedJooq(_, ext string, content []byte) bool {
|
||||
if ext != ".java" {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, l := range getLines(content, 2) {
|
||||
if bytes.Contains(l, []byte("This file is generated by jOOQ.")) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func getFirstLine(content []byte) []byte {
|
||||
lines := getLines(content, 1)
|
||||
if len(lines) > 0 {
|
||||
return lines[0]
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// getLines returns up to n lines of content, split on '\n'. The newline byte
// is never part of a returned line.
//
// For n >= 0 the first n lines are returned in file order. For n < 0 up to
// the last -n lines are returned in reverse order (last line first). When
// scanning backwards, an empty line that starts at one of the final two byte
// offsets of content (the empty remainder after a terminating newline, or a
// blank line right before that newline) is skipped rather than reported, and
// an empty very first line is not reported either.
//
// The returned slices alias content; nothing is copied.
func getLines(content []byte, n int) [][]byte {
	var result [][]byte

	if n >= 0 {
		for pos := 0; pos < len(content) && len(result) < n; {
			end := len(content)
			if i := bytes.IndexByte(content[pos:], '\n'); i >= 0 {
				end = pos + i
			}

			result = append(result, content[pos:end])
			pos = end + 1
		}
		return result
	}

	for pos := len(content); pos > 0 && len(result) < -n; {
		nlpos := bytes.LastIndexByte(content[:pos], '\n')
		start := nlpos + 1

		// Trailing blank lines are dropped, but a non-empty line is always
		// kept. Checking only start < len(content)-1 used to lose a final
		// one-byte line without a terminating newline (e.g. the "b" of
		// "a\nb").
		if start < pos || start < len(content)-1 {
			result = append(result, content[start:pos])
		}
		pos = nlpos
	}

	return result
}
|
||||
|
||||
// forEachLine calls cb once for every '\n'-separated line of content, in
// order and without the newline byte. A terminating newline does not produce
// an extra empty line, and cb is never called for empty content. The slices
// passed to cb alias content.
func forEachLine(content []byte, cb func([]byte)) {
	for start := 0; start < len(content); {
		end := len(content)
		if i := bytes.IndexByte(content[start:], '\n'); i >= 0 {
			end = start + i
		}

		cb(content[start:end])
		start = end + 1
	}
}
|
||||
|
||||
// countAppearancesInLine returns the sum, over all targets, of the number of
// non-overlapping occurrences of the target in line.
func countAppearancesInLine(line []byte, targets ...string) int {
	total := 0
	for i := range targets {
		total += bytes.Count(line, []byte(targets[i]))
	}
	return total
}
|
|
@ -0,0 +1,17 @@
|
|||
package data
|
||||
|
||||
import "github.com/go-enry/go-enry/v2/regex"
|
||||
|
||||
// TestMatchers is hand made collection of regexp used by the function `enry.IsTest`
|
||||
// to identify test files in different languages.
|
||||
var TestMatchers = []regex.EnryRegexp{
|
||||
regex.MustCompile(`(^|/)tests/.*Test\.php$`),
|
||||
regex.MustCompile(`(^|/)test/.*Test(s?)\.java$`),
|
||||
regex.MustCompile(`(^|/)test(/|/.*/)Test.*\.java$`),
|
||||
regex.MustCompile(`(^|/)test/.*(Test(s?)|Spec(s?))\.scala$`),
|
||||
regex.MustCompile(`(^|/)test_.*\.py$`),
|
||||
regex.MustCompile(`(^|/).*_test\.go$`),
|
||||
regex.MustCompile(`(^|/).*_(test|spec)\.rb$`),
|
||||
regex.MustCompile(`(^|/).*Test(s?)\.cs$`),
|
||||
regex.MustCompile(`(^|/).*\.(test|spec)\.(ts|tsx|js)$`),
|
||||
}
|
|
@ -3,167 +3,167 @@
|
|||
|
||||
package data
|
||||
|
||||
import "gopkg.in/toqueteos/substring.v1"
|
||||
import "github.com/go-enry/go-enry/v2/regex"
|
||||
|
||||
var VendorMatchers = substring.Or(
|
||||
substring.Regexp(`(^|/)cache/`),
|
||||
substring.Regexp(`^[Dd]ependencies/`),
|
||||
substring.Regexp(`(^|/)dist/`),
|
||||
substring.Regexp(`^deps/`),
|
||||
substring.Regexp(`(^|/)configure$`),
|
||||
substring.Regexp(`(^|/)config.guess$`),
|
||||
substring.Regexp(`(^|/)config.sub$`),
|
||||
substring.Regexp(`(^|/)aclocal.m4`),
|
||||
substring.Regexp(`(^|/)libtool.m4`),
|
||||
substring.Regexp(`(^|/)ltoptions.m4`),
|
||||
substring.Regexp(`(^|/)ltsugar.m4`),
|
||||
substring.Regexp(`(^|/)ltversion.m4`),
|
||||
substring.Regexp(`(^|/)lt~obsolete.m4`),
|
||||
substring.Regexp(`dotnet-install\.(ps1|sh)$`),
|
||||
substring.Regexp(`cpplint.py`),
|
||||
substring.Regexp(`node_modules/`),
|
||||
substring.Regexp(`(^|/)\.yarn/releases/`),
|
||||
substring.Regexp(`(^|/)_esy$`),
|
||||
substring.Regexp(`bower_components/`),
|
||||
substring.Regexp(`^rebar$`),
|
||||
substring.Regexp(`erlang.mk`),
|
||||
substring.Regexp(`Godeps/_workspace/`),
|
||||
substring.Regexp(`(^|/)testdata/`),
|
||||
substring.Regexp(`.indent.pro`),
|
||||
substring.Regexp(`(\.|-)min\.(js|css)$`),
|
||||
substring.Regexp(`([^\s]*)import\.(css|less|scss|styl)$`),
|
||||
substring.Regexp(`(^|/)bootstrap([^.]*)\.(js|css|less|scss|styl)$`),
|
||||
substring.Regexp(`(^|/)custom\.bootstrap([^\s]*)(js|css|less|scss|styl)$`),
|
||||
substring.Regexp(`(^|/)font-?awesome\.(css|less|scss|styl)$`),
|
||||
substring.Regexp(`(^|/)font-?awesome/.*\.(css|less|scss|styl)$`),
|
||||
substring.Regexp(`(^|/)foundation\.(css|less|scss|styl)$`),
|
||||
substring.Regexp(`(^|/)normalize\.(css|less|scss|styl)$`),
|
||||
substring.Regexp(`(^|/)skeleton\.(css|less|scss|styl)$`),
|
||||
substring.Regexp(`(^|/)[Bb]ourbon/.*\.(css|less|scss|styl)$`),
|
||||
substring.Regexp(`(^|/)animate\.(css|less|scss|styl)$`),
|
||||
substring.Regexp(`(^|/)materialize\.(css|less|scss|styl|js)$`),
|
||||
substring.Regexp(`(^|/)select2/.*\.(css|scss|js)$`),
|
||||
substring.Regexp(`(^|/)bulma\.(css|sass|scss)$`),
|
||||
substring.Regexp(`(3rd|[Tt]hird)[-_]?[Pp]arty/`),
|
||||
substring.Regexp(`vendors?/`),
|
||||
substring.Regexp(`extern(al)?/`),
|
||||
substring.Regexp(`(^|/)[Vv]+endor/`),
|
||||
substring.Regexp(`^debian/`),
|
||||
substring.Regexp(`run.n$`),
|
||||
substring.Regexp(`bootstrap-datepicker/`),
|
||||
substring.Regexp(`(^|/)jquery([^.]*)\.js$`),
|
||||
substring.Regexp(`(^|/)jquery\-\d\.\d+(\.\d+)?\.js$`),
|
||||
substring.Regexp(`(^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?\.(js|css)$`),
|
||||
substring.Regexp(`(^|/)jquery\.(ui|effects)\.([^.]*)\.(js|css)$`),
|
||||
substring.Regexp(`jquery.fn.gantt.js`),
|
||||
substring.Regexp(`jquery.fancybox.(js|css)`),
|
||||
substring.Regexp(`fuelux.js`),
|
||||
substring.Regexp(`(^|/)jquery\.fileupload(-\w+)?\.js$`),
|
||||
substring.Regexp(`jquery.dataTables.js`),
|
||||
substring.Regexp(`bootbox.js`),
|
||||
substring.Regexp(`pdf.worker.js`),
|
||||
substring.Regexp(`(^|/)slick\.\w+.js$`),
|
||||
substring.Regexp(`(^|/)Leaflet\.Coordinates-\d+\.\d+\.\d+\.src\.js$`),
|
||||
substring.Regexp(`leaflet.draw-src.js`),
|
||||
substring.Regexp(`leaflet.draw.css`),
|
||||
substring.Regexp(`Control.FullScreen.css`),
|
||||
substring.Regexp(`Control.FullScreen.js`),
|
||||
substring.Regexp(`leaflet.spin.js`),
|
||||
substring.Regexp(`wicket-leaflet.js`),
|
||||
substring.Regexp(`.sublime-project`),
|
||||
substring.Regexp(`.sublime-workspace`),
|
||||
substring.Regexp(`.vscode`),
|
||||
substring.Regexp(`(^|/)prototype(.*)\.js$`),
|
||||
substring.Regexp(`(^|/)effects\.js$`),
|
||||
substring.Regexp(`(^|/)controls\.js$`),
|
||||
substring.Regexp(`(^|/)dragdrop\.js$`),
|
||||
substring.Regexp(`(.*?)\.d\.ts$`),
|
||||
substring.Regexp(`(^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$`),
|
||||
substring.Regexp(`(^|/)dojo\.js$`),
|
||||
substring.Regexp(`(^|/)MochiKit\.js$`),
|
||||
substring.Regexp(`(^|/)yahoo-([^.]*)\.js$`),
|
||||
substring.Regexp(`(^|/)yui([^.]*)\.js$`),
|
||||
substring.Regexp(`(^|/)ckeditor\.js$`),
|
||||
substring.Regexp(`(^|/)tiny_mce([^.]*)\.js$`),
|
||||
substring.Regexp(`(^|/)tiny_mce/(langs|plugins|themes|utils)`),
|
||||
substring.Regexp(`(^|/)ace-builds/`),
|
||||
substring.Regexp(`(^|/)fontello(.*?)\.css$`),
|
||||
substring.Regexp(`(^|/)MathJax/`),
|
||||
substring.Regexp(`(^|/)Chart\.js$`),
|
||||
substring.Regexp(`(^|/)[Cc]ode[Mm]irror/(\d+\.\d+/)?(lib|mode|theme|addon|keymap|demo)`),
|
||||
substring.Regexp(`(^|/)shBrush([^.]*)\.js$`),
|
||||
substring.Regexp(`(^|/)shCore\.js$`),
|
||||
substring.Regexp(`(^|/)shLegacy\.js$`),
|
||||
substring.Regexp(`(^|/)angular([^.]*)\.js$`),
|
||||
substring.Regexp(`(^|\/)d3(\.v\d+)?([^.]*)\.js$`),
|
||||
substring.Regexp(`(^|/)react(-[^.]*)?\.js$`),
|
||||
substring.Regexp(`(^|/)flow-typed/.*\.js$`),
|
||||
substring.Regexp(`(^|/)modernizr\-\d\.\d+(\.\d+)?\.js$`),
|
||||
substring.Regexp(`(^|/)modernizr\.custom\.\d+\.js$`),
|
||||
substring.Regexp(`(^|/)knockout-(\d+\.){3}(debug\.)?js$`),
|
||||
substring.Regexp(`(^|/)docs?/_?(build|themes?|templates?|static)/`),
|
||||
substring.Regexp(`(^|/)admin_media/`),
|
||||
substring.Regexp(`(^|/)env/`),
|
||||
substring.Regexp(`^fabfile\.py$`),
|
||||
substring.Regexp(`^waf$`),
|
||||
substring.Regexp(`^.osx$`),
|
||||
substring.Regexp(`\.xctemplate/`),
|
||||
substring.Regexp(`\.imageset/`),
|
||||
substring.Regexp(`(^|/)Carthage/`),
|
||||
substring.Regexp(`(^|/)Sparkle/`),
|
||||
substring.Regexp(`Crashlytics.framework/`),
|
||||
substring.Regexp(`Fabric.framework/`),
|
||||
substring.Regexp(`BuddyBuildSDK.framework/`),
|
||||
substring.Regexp(`Realm.framework`),
|
||||
substring.Regexp(`RealmSwift.framework`),
|
||||
substring.Regexp(`gitattributes$`),
|
||||
substring.Regexp(`gitignore$`),
|
||||
substring.Regexp(`gitmodules$`),
|
||||
substring.Regexp(`(^|/)gradlew$`),
|
||||
substring.Regexp(`(^|/)gradlew\.bat$`),
|
||||
substring.Regexp(`(^|/)gradle/wrapper/`),
|
||||
substring.Regexp(`(^|/)mvnw$`),
|
||||
substring.Regexp(`(^|/)mvnw\.cmd$`),
|
||||
substring.Regexp(`(^|/)\.mvn/wrapper/`),
|
||||
substring.Regexp(`-vsdoc\.js$`),
|
||||
substring.Regexp(`\.intellisense\.js$`),
|
||||
substring.Regexp(`(^|/)jquery([^.]*)\.validate(\.unobtrusive)?\.js$`),
|
||||
substring.Regexp(`(^|/)jquery([^.]*)\.unobtrusive\-ajax\.js$`),
|
||||
substring.Regexp(`(^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$`),
|
||||
substring.Regexp(`^[Pp]ackages\/.+\.\d+\/`),
|
||||
substring.Regexp(`(^|/)extjs/.*?\.js$`),
|
||||
substring.Regexp(`(^|/)extjs/.*?\.xml$`),
|
||||
substring.Regexp(`(^|/)extjs/.*?\.txt$`),
|
||||
substring.Regexp(`(^|/)extjs/.*?\.html$`),
|
||||
substring.Regexp(`(^|/)extjs/.*?\.properties$`),
|
||||
substring.Regexp(`(^|/)extjs/.sencha/`),
|
||||
substring.Regexp(`(^|/)extjs/docs/`),
|
||||
substring.Regexp(`(^|/)extjs/builds/`),
|
||||
substring.Regexp(`(^|/)extjs/cmd/`),
|
||||
substring.Regexp(`(^|/)extjs/examples/`),
|
||||
substring.Regexp(`(^|/)extjs/locale/`),
|
||||
substring.Regexp(`(^|/)extjs/packages/`),
|
||||
substring.Regexp(`(^|/)extjs/plugins/`),
|
||||
substring.Regexp(`(^|/)extjs/resources/`),
|
||||
substring.Regexp(`(^|/)extjs/src/`),
|
||||
substring.Regexp(`(^|/)extjs/welcome/`),
|
||||
substring.Regexp(`(^|/)html5shiv\.js$`),
|
||||
substring.Regexp(`^[Tt]ests?/fixtures/`),
|
||||
substring.Regexp(`^[Ss]pecs?/fixtures/`),
|
||||
substring.Regexp(`(^|/)cordova([^.]*)\.js$`),
|
||||
substring.Regexp(`(^|/)cordova\-\d\.\d(\.\d)?\.js$`),
|
||||
substring.Regexp(`foundation(\..*)?\.js$`),
|
||||
substring.Regexp(`^Vagrantfile$`),
|
||||
substring.Regexp(`.[Dd][Ss]_[Ss]tore$`),
|
||||
substring.Regexp(`^vignettes/`),
|
||||
substring.Regexp(`^inst/extdata/`),
|
||||
substring.Regexp(`octicons.css`),
|
||||
substring.Regexp(`sprockets-octicons.scss`),
|
||||
substring.Regexp(`(^|/)activator$`),
|
||||
substring.Regexp(`(^|/)activator\.bat$`),
|
||||
substring.Regexp(`proguard.pro`),
|
||||
substring.Regexp(`proguard-rules.pro`),
|
||||
substring.Regexp(`^puphpet/`),
|
||||
substring.Regexp(`(^|/)\.google_apis/`),
|
||||
substring.Regexp(`^Jenkinsfile$`),
|
||||
)
|
||||
var VendorMatchers = []regex.EnryRegexp{
|
||||
regex.MustCompile(`(^|/)cache/`),
|
||||
regex.MustCompile(`^[Dd]ependencies/`),
|
||||
regex.MustCompile(`(^|/)dist/`),
|
||||
regex.MustCompile(`^deps/`),
|
||||
regex.MustCompile(`(^|/)configure$`),
|
||||
regex.MustCompile(`(^|/)config.guess$`),
|
||||
regex.MustCompile(`(^|/)config.sub$`),
|
||||
regex.MustCompile(`(^|/)aclocal.m4`),
|
||||
regex.MustCompile(`(^|/)libtool.m4`),
|
||||
regex.MustCompile(`(^|/)ltoptions.m4`),
|
||||
regex.MustCompile(`(^|/)ltsugar.m4`),
|
||||
regex.MustCompile(`(^|/)ltversion.m4`),
|
||||
regex.MustCompile(`(^|/)lt~obsolete.m4`),
|
||||
regex.MustCompile(`dotnet-install\.(ps1|sh)$`),
|
||||
regex.MustCompile(`cpplint.py`),
|
||||
regex.MustCompile(`node_modules/`),
|
||||
regex.MustCompile(`(^|/)\.yarn/releases/`),
|
||||
regex.MustCompile(`(^|/)_esy$`),
|
||||
regex.MustCompile(`bower_components/`),
|
||||
regex.MustCompile(`^rebar$`),
|
||||
regex.MustCompile(`erlang.mk`),
|
||||
regex.MustCompile(`Godeps/_workspace/`),
|
||||
regex.MustCompile(`(^|/)testdata/`),
|
||||
regex.MustCompile(`.indent.pro`),
|
||||
regex.MustCompile(`(\.|-)min\.(js|css)$`),
|
||||
regex.MustCompile(`([^\s]*)import\.(css|less|scss|styl)$`),
|
||||
regex.MustCompile(`(^|/)bootstrap([^.]*)\.(js|css|less|scss|styl)$`),
|
||||
regex.MustCompile(`(^|/)custom\.bootstrap([^\s]*)(js|css|less|scss|styl)$`),
|
||||
regex.MustCompile(`(^|/)font-?awesome\.(css|less|scss|styl)$`),
|
||||
regex.MustCompile(`(^|/)font-?awesome/.*\.(css|less|scss|styl)$`),
|
||||
regex.MustCompile(`(^|/)foundation\.(css|less|scss|styl)$`),
|
||||
regex.MustCompile(`(^|/)normalize\.(css|less|scss|styl)$`),
|
||||
regex.MustCompile(`(^|/)skeleton\.(css|less|scss|styl)$`),
|
||||
regex.MustCompile(`(^|/)[Bb]ourbon/.*\.(css|less|scss|styl)$`),
|
||||
regex.MustCompile(`(^|/)animate\.(css|less|scss|styl)$`),
|
||||
regex.MustCompile(`(^|/)materialize\.(css|less|scss|styl|js)$`),
|
||||
regex.MustCompile(`(^|/)select2/.*\.(css|scss|js)$`),
|
||||
regex.MustCompile(`(^|/)bulma\.(css|sass|scss)$`),
|
||||
regex.MustCompile(`(3rd|[Tt]hird)[-_]?[Pp]arty/`),
|
||||
regex.MustCompile(`vendors?/`),
|
||||
regex.MustCompile(`extern(al)?/`),
|
||||
regex.MustCompile(`(^|/)[Vv]+endor/`),
|
||||
regex.MustCompile(`^debian/`),
|
||||
regex.MustCompile(`run.n$`),
|
||||
regex.MustCompile(`bootstrap-datepicker/`),
|
||||
regex.MustCompile(`(^|/)jquery([^.]*)\.js$`),
|
||||
regex.MustCompile(`(^|/)jquery\-\d\.\d+(\.\d+)?\.js$`),
|
||||
regex.MustCompile(`(^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?\.(js|css)$`),
|
||||
regex.MustCompile(`(^|/)jquery\.(ui|effects)\.([^.]*)\.(js|css)$`),
|
||||
regex.MustCompile(`jquery.fn.gantt.js`),
|
||||
regex.MustCompile(`jquery.fancybox.(js|css)`),
|
||||
regex.MustCompile(`fuelux.js`),
|
||||
regex.MustCompile(`(^|/)jquery\.fileupload(-\w+)?\.js$`),
|
||||
regex.MustCompile(`jquery.dataTables.js`),
|
||||
regex.MustCompile(`bootbox.js`),
|
||||
regex.MustCompile(`pdf.worker.js`),
|
||||
regex.MustCompile(`(^|/)slick\.\w+.js$`),
|
||||
regex.MustCompile(`(^|/)Leaflet\.Coordinates-\d+\.\d+\.\d+\.src\.js$`),
|
||||
regex.MustCompile(`leaflet.draw-src.js`),
|
||||
regex.MustCompile(`leaflet.draw.css`),
|
||||
regex.MustCompile(`Control.FullScreen.css`),
|
||||
regex.MustCompile(`Control.FullScreen.js`),
|
||||
regex.MustCompile(`leaflet.spin.js`),
|
||||
regex.MustCompile(`wicket-leaflet.js`),
|
||||
regex.MustCompile(`.sublime-project`),
|
||||
regex.MustCompile(`.sublime-workspace`),
|
||||
regex.MustCompile(`.vscode`),
|
||||
regex.MustCompile(`(^|/)prototype(.*)\.js$`),
|
||||
regex.MustCompile(`(^|/)effects\.js$`),
|
||||
regex.MustCompile(`(^|/)controls\.js$`),
|
||||
regex.MustCompile(`(^|/)dragdrop\.js$`),
|
||||
regex.MustCompile(`(.*?)\.d\.ts$`),
|
||||
regex.MustCompile(`(^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$`),
|
||||
regex.MustCompile(`(^|/)dojo\.js$`),
|
||||
regex.MustCompile(`(^|/)MochiKit\.js$`),
|
||||
regex.MustCompile(`(^|/)yahoo-([^.]*)\.js$`),
|
||||
regex.MustCompile(`(^|/)yui([^.]*)\.js$`),
|
||||
regex.MustCompile(`(^|/)ckeditor\.js$`),
|
||||
regex.MustCompile(`(^|/)tiny_mce([^.]*)\.js$`),
|
||||
regex.MustCompile(`(^|/)tiny_mce/(langs|plugins|themes|utils)`),
|
||||
regex.MustCompile(`(^|/)ace-builds/`),
|
||||
regex.MustCompile(`(^|/)fontello(.*?)\.css$`),
|
||||
regex.MustCompile(`(^|/)MathJax/`),
|
||||
regex.MustCompile(`(^|/)Chart\.js$`),
|
||||
regex.MustCompile(`(^|/)[Cc]ode[Mm]irror/(\d+\.\d+/)?(lib|mode|theme|addon|keymap|demo)`),
|
||||
regex.MustCompile(`(^|/)shBrush([^.]*)\.js$`),
|
||||
regex.MustCompile(`(^|/)shCore\.js$`),
|
||||
regex.MustCompile(`(^|/)shLegacy\.js$`),
|
||||
regex.MustCompile(`(^|/)angular([^.]*)\.js$`),
|
||||
regex.MustCompile(`(^|\/)d3(\.v\d+)?([^.]*)\.js$`),
|
||||
regex.MustCompile(`(^|/)react(-[^.]*)?\.js$`),
|
||||
regex.MustCompile(`(^|/)flow-typed/.*\.js$`),
|
||||
regex.MustCompile(`(^|/)modernizr\-\d\.\d+(\.\d+)?\.js$`),
|
||||
regex.MustCompile(`(^|/)modernizr\.custom\.\d+\.js$`),
|
||||
regex.MustCompile(`(^|/)knockout-(\d+\.){3}(debug\.)?js$`),
|
||||
regex.MustCompile(`(^|/)docs?/_?(build|themes?|templates?|static)/`),
|
||||
regex.MustCompile(`(^|/)admin_media/`),
|
||||
regex.MustCompile(`(^|/)env/`),
|
||||
regex.MustCompile(`^fabfile\.py$`),
|
||||
regex.MustCompile(`^waf$`),
|
||||
regex.MustCompile(`^.osx$`),
|
||||
regex.MustCompile(`\.xctemplate/`),
|
||||
regex.MustCompile(`\.imageset/`),
|
||||
regex.MustCompile(`(^|/)Carthage/`),
|
||||
regex.MustCompile(`(^|/)Sparkle/`),
|
||||
regex.MustCompile(`Crashlytics.framework/`),
|
||||
regex.MustCompile(`Fabric.framework/`),
|
||||
regex.MustCompile(`BuddyBuildSDK.framework/`),
|
||||
regex.MustCompile(`Realm.framework`),
|
||||
regex.MustCompile(`RealmSwift.framework`),
|
||||
regex.MustCompile(`gitattributes$`),
|
||||
regex.MustCompile(`gitignore$`),
|
||||
regex.MustCompile(`gitmodules$`),
|
||||
regex.MustCompile(`(^|/)gradlew$`),
|
||||
regex.MustCompile(`(^|/)gradlew\.bat$`),
|
||||
regex.MustCompile(`(^|/)gradle/wrapper/`),
|
||||
regex.MustCompile(`(^|/)mvnw$`),
|
||||
regex.MustCompile(`(^|/)mvnw\.cmd$`),
|
||||
regex.MustCompile(`(^|/)\.mvn/wrapper/`),
|
||||
regex.MustCompile(`-vsdoc\.js$`),
|
||||
regex.MustCompile(`\.intellisense\.js$`),
|
||||
regex.MustCompile(`(^|/)jquery([^.]*)\.validate(\.unobtrusive)?\.js$`),
|
||||
regex.MustCompile(`(^|/)jquery([^.]*)\.unobtrusive\-ajax\.js$`),
|
||||
regex.MustCompile(`(^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$`),
|
||||
regex.MustCompile(`^[Pp]ackages\/.+\.\d+\/`),
|
||||
regex.MustCompile(`(^|/)extjs/.*?\.js$`),
|
||||
regex.MustCompile(`(^|/)extjs/.*?\.xml$`),
|
||||
regex.MustCompile(`(^|/)extjs/.*?\.txt$`),
|
||||
regex.MustCompile(`(^|/)extjs/.*?\.html$`),
|
||||
regex.MustCompile(`(^|/)extjs/.*?\.properties$`),
|
||||
regex.MustCompile(`(^|/)extjs/.sencha/`),
|
||||
regex.MustCompile(`(^|/)extjs/docs/`),
|
||||
regex.MustCompile(`(^|/)extjs/builds/`),
|
||||
regex.MustCompile(`(^|/)extjs/cmd/`),
|
||||
regex.MustCompile(`(^|/)extjs/examples/`),
|
||||
regex.MustCompile(`(^|/)extjs/locale/`),
|
||||
regex.MustCompile(`(^|/)extjs/packages/`),
|
||||
regex.MustCompile(`(^|/)extjs/plugins/`),
|
||||
regex.MustCompile(`(^|/)extjs/resources/`),
|
||||
regex.MustCompile(`(^|/)extjs/src/`),
|
||||
regex.MustCompile(`(^|/)extjs/welcome/`),
|
||||
regex.MustCompile(`(^|/)html5shiv\.js$`),
|
||||
regex.MustCompile(`^[Tt]ests?/fixtures/`),
|
||||
regex.MustCompile(`^[Ss]pecs?/fixtures/`),
|
||||
regex.MustCompile(`(^|/)cordova([^.]*)\.js$`),
|
||||
regex.MustCompile(`(^|/)cordova\-\d\.\d(\.\d)?\.js$`),
|
||||
regex.MustCompile(`foundation(\..*)?\.js$`),
|
||||
regex.MustCompile(`^Vagrantfile$`),
|
||||
regex.MustCompile(`.[Dd][Ss]_[Ss]tore$`),
|
||||
regex.MustCompile(`^vignettes/`),
|
||||
regex.MustCompile(`^inst/extdata/`),
|
||||
regex.MustCompile(`octicons.css`),
|
||||
regex.MustCompile(`sprockets-octicons.scss`),
|
||||
regex.MustCompile(`(^|/)activator$`),
|
||||
regex.MustCompile(`(^|/)activator\.bat$`),
|
||||
regex.MustCompile(`proguard.pro`),
|
||||
regex.MustCompile(`proguard-rules.pro`),
|
||||
regex.MustCompile(`^puphpet/`),
|
||||
regex.MustCompile(`(^|/)\.google_apis/`),
|
||||
regex.MustCompile(`^Jenkinsfile$`),
|
||||
}
|
||||
|
|
|
@ -3,9 +3,7 @@ module github.com/go-enry/go-enry/v2
|
|||
go 1.14
|
||||
|
||||
require (
|
||||
github.com/go-enry/go-oniguruma v1.2.0
|
||||
github.com/go-enry/go-oniguruma v1.2.1
|
||||
github.com/stretchr/testify v1.3.0
|
||||
github.com/toqueteos/trie v1.0.0 // indirect
|
||||
gopkg.in/toqueteos/substring.v1 v1.0.2
|
||||
gopkg.in/yaml.v2 v2.2.8
|
||||
)
|
||||
|
|
|
@ -2,17 +2,15 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8
|
|||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs=
|
||||
github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
|
||||
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
|
||||
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/toqueteos/trie v1.0.0 h1:8i6pXxNUXNRAqP246iibb7w/pSFquNTQ+uNfriG7vlk=
|
||||
github.com/toqueteos/trie v1.0.0/go.mod h1:Ywk48QhEqhU1+DwhMkJ2x7eeGxDHiGkAdc9+0DYcbsM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/toqueteos/substring.v1 v1.0.2 h1:urLqCeMm6x/eTuQa1oZerNw8N1KNOIp5hD5kGL7lFsE=
|
||||
gopkg.in/toqueteos/substring.v1 v1.0.2/go.mod h1:Eb2Z1UYehlVK8LYW2WBVR2rwbujsz3aX8XDrM1vbNew=
|
||||
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
|
||||
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
|
|
|
@ -6,12 +6,18 @@ import (
|
|||
"strings"
|
||||
|
||||
"github.com/go-enry/go-enry/v2/data"
|
||||
"github.com/go-enry/go-enry/v2/regex"
|
||||
)
|
||||
|
||||
const binSniffLen = 8000
|
||||
|
||||
var configurationLanguages = map[string]bool{
|
||||
"XML": true, "JSON": true, "TOML": true, "YAML": true, "INI": true, "SQL": true,
|
||||
var configurationLanguages = map[string]struct{}{
|
||||
"XML": {},
|
||||
"JSON": {},
|
||||
"TOML": {},
|
||||
"YAML": {},
|
||||
"INI": {},
|
||||
"SQL": {},
|
||||
}
|
||||
|
||||
// IsConfiguration tells if filename is in one of the configuration languages.
|
||||
|
@ -46,7 +52,7 @@ func GetMIMEType(path string, language string) string {
|
|||
|
||||
// IsDocumentation returns whether or not path is a documentation path.
|
||||
func IsDocumentation(path string) bool {
|
||||
return data.DocumentationMatchers.Match(path)
|
||||
return matchRegexSlice(data.DocumentationMatchers, path)
|
||||
}
|
||||
|
||||
// IsDotFile returns whether or not path has dot as a prefix.
|
||||
|
@ -57,7 +63,12 @@ func IsDotFile(path string) bool {
|
|||
|
||||
// IsVendor returns whether or not path is a vendor path.
|
||||
func IsVendor(path string) bool {
|
||||
return data.VendorMatchers.Match(path)
|
||||
return matchRegexSlice(data.VendorMatchers, path)
|
||||
}
|
||||
|
||||
// IsTest returns whether or not path is a test path.
|
||||
func IsTest(path string) bool {
|
||||
return matchRegexSlice(data.TestMatchers, path)
|
||||
}
|
||||
|
||||
// IsBinary detects if data is a binary value based on:
|
||||
|
@ -86,3 +97,37 @@ func GetColor(language string) string {
|
|||
|
||||
return "#cccccc"
|
||||
}
|
||||
|
||||
func matchRegexSlice(exprs []regex.EnryRegexp, str string) bool {
|
||||
for _, expr := range exprs {
|
||||
if expr.MatchString(str) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// IsGenerated returns whether the file with the given path and content is a
|
||||
// generated file.
|
||||
func IsGenerated(path string, content []byte) bool {
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
if _, ok := data.GeneratedCodeExtensions[ext]; ok {
|
||||
return true
|
||||
}
|
||||
|
||||
for _, m := range data.GeneratedCodeNameMatchers {
|
||||
if m(path) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
path = strings.ToLower(path)
|
||||
for _, m := range data.GeneratedCodeMatchers {
|
||||
if m(path, ext, content) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
#include "chelper.h"
|
||||
|
||||
int NewOnigRegex( char *pattern, int pattern_length, int option,
|
||||
OnigRegex *regex, OnigRegion **region, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer) {
|
||||
OnigRegex *regex, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer) {
|
||||
int ret = ONIG_NORMAL;
|
||||
int error_msg_len = 0;
|
||||
|
||||
|
@ -23,8 +23,6 @@ int NewOnigRegex( char *pattern, int pattern_length, int option,
|
|||
|
||||
memset(*error_buffer, 0, ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char));
|
||||
|
||||
*region = onig_region_new();
|
||||
|
||||
ret = onig_new(regex, pattern_start, pattern_end, (OnigOptionType)(option), *encoding, OnigDefaultSyntax, *error_info);
|
||||
|
||||
if (ret != ONIG_NORMAL) {
|
||||
|
@ -38,9 +36,10 @@ int NewOnigRegex( char *pattern, int pattern_length, int option,
|
|||
}
|
||||
|
||||
int SearchOnigRegex( void *str, int str_length, int offset, int option,
|
||||
OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures) {
|
||||
OnigRegex regex, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures) {
|
||||
int ret = ONIG_MISMATCH;
|
||||
int error_msg_len = 0;
|
||||
OnigRegion *region;
|
||||
#ifdef BENCHMARK_CHELP
|
||||
struct timeval tim1, tim2;
|
||||
long t;
|
||||
|
@ -55,6 +54,8 @@ int SearchOnigRegex( void *str, int str_length, int offset, int option,
|
|||
gettimeofday(&tim1, NULL);
|
||||
#endif
|
||||
|
||||
region = onig_region_new();
|
||||
|
||||
ret = onig_search(regex, str_start, str_end, search_start, search_end, region, option);
|
||||
if (ret < 0 && error_buffer != NULL) {
|
||||
error_msg_len = onig_error_code_to_str((unsigned char*)(error_buffer), ret, error_info);
|
||||
|
@ -74,6 +75,8 @@ int SearchOnigRegex( void *str, int str_length, int offset, int option,
|
|||
*numCaptures = count;
|
||||
}
|
||||
|
||||
onig_region_free(region, 1);
|
||||
|
||||
#ifdef BENCHMARK_CHELP
|
||||
gettimeofday(&tim2, NULL);
|
||||
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec;
|
||||
|
@ -83,9 +86,10 @@ int SearchOnigRegex( void *str, int str_length, int offset, int option,
|
|||
}
|
||||
|
||||
int MatchOnigRegex(void *str, int str_length, int offset, int option,
|
||||
OnigRegex regex, OnigRegion *region) {
|
||||
OnigRegex regex) {
|
||||
int ret = ONIG_MISMATCH;
|
||||
int error_msg_len = 0;
|
||||
OnigRegion *region;
|
||||
#ifdef BENCHMARK_CHELP
|
||||
struct timeval tim1, tim2;
|
||||
long t;
|
||||
|
@ -98,7 +102,9 @@ int MatchOnigRegex(void *str, int str_length, int offset, int option,
|
|||
#ifdef BENCHMARK_CHELP
|
||||
gettimeofday(&tim1, NULL);
|
||||
#endif
|
||||
region = onig_region_new();
|
||||
ret = onig_match(regex, str_start, str_end, search_start, region, option);
|
||||
onig_region_free(region, 1);
|
||||
#ifdef BENCHMARK_CHELP
|
||||
gettimeofday(&tim2, NULL);
|
||||
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec;
|
||||
|
@ -108,8 +114,9 @@ int MatchOnigRegex(void *str, int str_length, int offset, int option,
|
|||
}
|
||||
|
||||
int LookupOnigCaptureByName(char *name, int name_length,
|
||||
OnigRegex regex, OnigRegion *region) {
|
||||
OnigRegex regex) {
|
||||
int ret = ONIGERR_UNDEFINED_NAME_REFERENCE;
|
||||
OnigRegion *region;
|
||||
#ifdef BENCHMARK_CHELP
|
||||
struct timeval tim1, tim2;
|
||||
long t;
|
||||
|
@ -119,7 +126,9 @@ int LookupOnigCaptureByName(char *name, int name_length,
|
|||
#ifdef BENCHMARK_CHELP
|
||||
gettimeofday(&tim1, NULL);
|
||||
#endif
|
||||
region = onig_region_new();
|
||||
ret = onig_name_to_backref_number(regex, name_start, name_end, region);
|
||||
onig_region_free(region, 1);
|
||||
#ifdef BENCHMARK_CHELP
|
||||
gettimeofday(&tim2, NULL);
|
||||
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec;
|
||||
|
@ -181,4 +190,3 @@ int GetCaptureNames(OnigRegex reg, void *buffer, int bufferSize, int* groupNumbe
|
|||
onig_foreach_name(reg, name_callback, (void* )&groupInfo);
|
||||
return groupInfo.bufferOffset;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
#include <oniguruma.h>
|
||||
|
||||
extern int NewOnigRegex( char *pattern, int pattern_length, int option,
|
||||
OnigRegex *regex, OnigRegion **region, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer);
|
||||
OnigRegex *regex, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer);
|
||||
|
||||
extern int SearchOnigRegex( void *str, int str_length, int offset, int option,
|
||||
OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures);
|
||||
OnigRegex regex, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures);
|
||||
|
||||
extern int MatchOnigRegex( void *str, int str_length, int offset, int option,
|
||||
OnigRegex regex, OnigRegion *region);
|
||||
OnigRegex regex);
|
||||
|
||||
extern int LookupOnigCaptureByName(char *name, int name_length, OnigRegex regex, OnigRegion *region);
|
||||
extern int LookupOnigCaptureByName(char *name, int name_length, OnigRegex regex);
|
||||
|
||||
extern int GetCaptureNames(OnigRegex regex, void *buffer, int bufferSize, int* groupNumbers);
|
||||
|
|
|
@ -14,7 +14,6 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"sync"
|
||||
|
@ -22,62 +21,52 @@ import (
|
|||
"unsafe"
|
||||
)
|
||||
|
||||
type strRange []int
|
||||
|
||||
const numMatchStartSize = 4
|
||||
const numReadBufferStartSize = 256
|
||||
|
||||
var mutex sync.Mutex
|
||||
|
||||
type MatchData struct {
|
||||
count int
|
||||
indexes [][]int32
|
||||
}
|
||||
|
||||
type NamedGroupInfo map[string]int
|
||||
|
||||
type Regexp struct {
|
||||
pattern string
|
||||
regex C.OnigRegex
|
||||
region *C.OnigRegion
|
||||
encoding C.OnigEncoding
|
||||
errorInfo *C.OnigErrorInfo
|
||||
errorBuf *C.char
|
||||
matchData *MatchData
|
||||
pattern string
|
||||
regex C.OnigRegex
|
||||
encoding C.OnigEncoding
|
||||
errorInfo *C.OnigErrorInfo
|
||||
errorBuf *C.char
|
||||
|
||||
numCaptures int32
|
||||
namedGroupInfo NamedGroupInfo
|
||||
}
|
||||
|
||||
// NewRegexp creates and initializes a new Regexp with the given pattern and option.
|
||||
func NewRegexp(pattern string, option int) (re *Regexp, err error) {
|
||||
func NewRegexp(pattern string, option int) (*Regexp, error) {
|
||||
return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option)
|
||||
}
|
||||
|
||||
// NewRegexpASCII is equivalent to NewRegexp, but with the encoding restricted to ASCII.
|
||||
func NewRegexpASCII(pattern string, option int) (re *Regexp, err error) {
|
||||
func NewRegexpASCII(pattern string, option int) (*Regexp, error) {
|
||||
return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII}, option)
|
||||
}
|
||||
|
||||
func initRegexp(re *Regexp, option int) (*Regexp, error) {
|
||||
var err error
|
||||
patternCharPtr := C.CString(re.pattern)
|
||||
defer C.free(unsafe.Pointer(patternCharPtr))
|
||||
|
||||
mutex.Lock()
|
||||
defer mutex.Unlock()
|
||||
errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf)
|
||||
|
||||
errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.encoding, &re.errorInfo, &re.errorBuf)
|
||||
if errorCode != C.ONIG_NORMAL {
|
||||
err = errors.New(C.GoString(re.errorBuf))
|
||||
} else {
|
||||
err = nil
|
||||
numCapturesInPattern := int(C.onig_number_of_captures(re.regex)) + 1
|
||||
re.matchData = &MatchData{}
|
||||
re.matchData.indexes = make([][]int32, numMatchStartSize)
|
||||
for i := 0; i < numMatchStartSize; i++ {
|
||||
re.matchData.indexes[i] = make([]int32, numCapturesInPattern*2)
|
||||
}
|
||||
re.namedGroupInfo = re.getNamedGroupInfo()
|
||||
runtime.SetFinalizer(re, (*Regexp).Free)
|
||||
return re, errors.New(C.GoString(re.errorBuf))
|
||||
}
|
||||
return re, err
|
||||
|
||||
re.numCaptures = int32(C.onig_number_of_captures(re.regex)) + 1
|
||||
re.namedGroupInfo = re.getNamedGroupInfo()
|
||||
|
||||
runtime.SetFinalizer(re, (*Regexp).Free)
|
||||
|
||||
return re, nil
|
||||
}
|
||||
|
||||
func Compile(str string) (*Regexp, error) {
|
||||
|
@ -89,6 +78,7 @@ func MustCompile(str string) *Regexp {
|
|||
if error != nil {
|
||||
panic("regexp: compiling " + str + ": " + error.Error())
|
||||
}
|
||||
|
||||
return regexp
|
||||
}
|
||||
|
||||
|
@ -101,6 +91,7 @@ func MustCompileWithOption(str string, option int) *Regexp {
|
|||
if error != nil {
|
||||
panic("regexp: compiling " + str + ": " + error.Error())
|
||||
}
|
||||
|
||||
return regexp
|
||||
}
|
||||
|
||||
|
@ -110,6 +101,7 @@ func MustCompileASCII(str string) *Regexp {
|
|||
if error != nil {
|
||||
panic("regexp: compiling " + str + ": " + error.Error())
|
||||
}
|
||||
|
||||
return regexp
|
||||
}
|
||||
|
||||
|
@ -119,10 +111,6 @@ func (re *Regexp) Free() {
|
|||
C.onig_free(re.regex)
|
||||
re.regex = nil
|
||||
}
|
||||
if re.region != nil {
|
||||
C.onig_region_free(re.region, 1)
|
||||
re.region = nil
|
||||
}
|
||||
mutex.Unlock()
|
||||
if re.errorInfo != nil {
|
||||
C.free(unsafe.Pointer(re.errorInfo))
|
||||
|
@ -134,149 +122,149 @@ func (re *Regexp) Free() {
|
|||
}
|
||||
}
|
||||
|
||||
func (re *Regexp) getNamedGroupInfo() (namedGroupInfo NamedGroupInfo) {
|
||||
func (re *Regexp) getNamedGroupInfo() NamedGroupInfo {
|
||||
numNamedGroups := int(C.onig_number_of_names(re.regex))
|
||||
//when any named capture exisits, there is no numbered capture even if there are unnamed captures
|
||||
if numNamedGroups > 0 {
|
||||
namedGroupInfo = make(map[string]int)
|
||||
//try to get the names
|
||||
bufferSize := len(re.pattern) * 2
|
||||
nameBuffer := make([]byte, bufferSize)
|
||||
groupNumbers := make([]int32, numNamedGroups)
|
||||
bufferPtr := unsafe.Pointer(&nameBuffer[0])
|
||||
numbersPtr := unsafe.Pointer(&groupNumbers[0])
|
||||
length := int(C.GetCaptureNames(re.regex, bufferPtr, (C.int)(bufferSize), (*C.int)(numbersPtr)))
|
||||
if length > 0 {
|
||||
namesAsBytes := bytes.Split(nameBuffer[:length], ([]byte)(";"))
|
||||
if len(namesAsBytes) != numNamedGroups {
|
||||
log.Fatalf("the number of named groups (%d) does not match the number names found (%d)\n", numNamedGroups, len(namesAsBytes))
|
||||
}
|
||||
for i, nameAsBytes := range namesAsBytes {
|
||||
name := string(nameAsBytes)
|
||||
namedGroupInfo[name] = int(groupNumbers[i])
|
||||
}
|
||||
} else {
|
||||
log.Fatalf("could not get the capture group names from %q", re.String())
|
||||
}
|
||||
// when any named capture exists, there is no numbered capture even if
|
||||
// there are unnamed captures.
|
||||
if numNamedGroups == 0 {
|
||||
return nil
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (re *Regexp) groupNameToId(name string) (id int) {
|
||||
if re.namedGroupInfo == nil {
|
||||
id = ONIGERR_UNDEFINED_NAME_REFERENCE
|
||||
} else {
|
||||
id = re.namedGroupInfo[name]
|
||||
namedGroupInfo := make(map[string]int)
|
||||
|
||||
//try to get the names
|
||||
bufferSize := len(re.pattern) * 2
|
||||
nameBuffer := make([]byte, bufferSize)
|
||||
groupNumbers := make([]int32, numNamedGroups)
|
||||
bufferPtr := unsafe.Pointer(&nameBuffer[0])
|
||||
numbersPtr := unsafe.Pointer(&groupNumbers[0])
|
||||
|
||||
length := int(C.GetCaptureNames(re.regex, bufferPtr, (C.int)(bufferSize), (*C.int)(numbersPtr)))
|
||||
if length == 0 {
|
||||
panic(fmt.Errorf("could not get the capture group names from %q", re.String()))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (re *Regexp) processMatch(numCaptures int) (match []int32) {
|
||||
if numCaptures <= 0 {
|
||||
panic("cannot have 0 captures when processing a match")
|
||||
namesAsBytes := bytes.Split(nameBuffer[:length], ([]byte)(";"))
|
||||
if len(namesAsBytes) != numNamedGroups {
|
||||
panic(fmt.Errorf(
|
||||
"the number of named groups (%d) does not match the number names found (%d)",
|
||||
numNamedGroups, len(namesAsBytes),
|
||||
))
|
||||
}
|
||||
matchData := re.matchData
|
||||
return matchData.indexes[matchData.count][:numCaptures*2]
|
||||
|
||||
for i, nameAsBytes := range namesAsBytes {
|
||||
name := string(nameAsBytes)
|
||||
namedGroupInfo[name] = int(groupNumbers[i])
|
||||
}
|
||||
|
||||
return namedGroupInfo
|
||||
}
|
||||
|
||||
func (re *Regexp) ClearMatchData() {
|
||||
matchData := re.matchData
|
||||
matchData.count = 0
|
||||
}
|
||||
func (re *Regexp) find(b []byte, n int, offset int) []int {
|
||||
match := make([]int, re.numCaptures*2)
|
||||
|
||||
func (re *Regexp) find(b []byte, n int, offset int) (match []int) {
|
||||
if n == 0 {
|
||||
b = []byte{0}
|
||||
}
|
||||
ptr := unsafe.Pointer(&b[0])
|
||||
matchData := re.matchData
|
||||
capturesPtr := unsafe.Pointer(&(matchData.indexes[matchData.count][0]))
|
||||
numCaptures := int32(0)
|
||||
|
||||
bytesPtr := unsafe.Pointer(&b[0])
|
||||
|
||||
// captures contains two pairs of ints, start and end, so we need list
|
||||
// twice the size of the capture groups.
|
||||
captures := make([]C.int, re.numCaptures*2)
|
||||
capturesPtr := unsafe.Pointer(&captures[0])
|
||||
|
||||
var numCaptures int32
|
||||
numCapturesPtr := unsafe.Pointer(&numCaptures)
|
||||
pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(capturesPtr), (*C.int)(numCapturesPtr)))
|
||||
if pos >= 0 {
|
||||
if numCaptures <= 0 {
|
||||
panic("cannot have 0 captures when processing a match")
|
||||
}
|
||||
match2 := matchData.indexes[matchData.count][:numCaptures*2]
|
||||
match = make([]int, len(match2))
|
||||
for i := range match2 {
|
||||
match[i] = int(match2[i])
|
||||
}
|
||||
numCapturesInPattern := int32(C.onig_number_of_captures(re.regex)) + 1
|
||||
if numCapturesInPattern != numCaptures {
|
||||
log.Fatalf("expected %d captures but got %d\n", numCapturesInPattern, numCaptures)
|
||||
}
|
||||
|
||||
pos := int(C.SearchOnigRegex(
|
||||
bytesPtr, C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT),
|
||||
re.regex, re.errorInfo, (*C.char)(nil), (*C.int)(capturesPtr), (*C.int)(numCapturesPtr),
|
||||
))
|
||||
|
||||
if pos < 0 {
|
||||
return nil
|
||||
}
|
||||
return
|
||||
|
||||
if numCaptures <= 0 {
|
||||
panic("cannot have 0 captures when processing a match")
|
||||
}
|
||||
|
||||
if re.numCaptures != numCaptures {
|
||||
panic(fmt.Errorf("expected %d captures but got %d", re.numCaptures, numCaptures))
|
||||
}
|
||||
|
||||
for i := range captures {
|
||||
match[i] = int(captures[i])
|
||||
}
|
||||
|
||||
return match
|
||||
}
|
||||
|
||||
func getCapture(b []byte, beg int, end int) []byte {
|
||||
if beg < 0 || end < 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
return b[beg:end]
|
||||
}
|
||||
|
||||
func (re *Regexp) match(b []byte, n int, offset int) bool {
|
||||
re.ClearMatchData()
|
||||
if n == 0 {
|
||||
b = []byte{0}
|
||||
}
|
||||
ptr := unsafe.Pointer(&b[0])
|
||||
pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(nil), (*C.int)(nil)))
|
||||
|
||||
bytesPtr := unsafe.Pointer(&b[0])
|
||||
pos := int(C.SearchOnigRegex(
|
||||
bytesPtr, C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT),
|
||||
re.regex, re.errorInfo, nil, nil, nil,
|
||||
))
|
||||
|
||||
return pos >= 0
|
||||
}
|
||||
|
||||
func (re *Regexp) findAll(b []byte, n int) (matches [][]int) {
|
||||
re.ClearMatchData()
|
||||
|
||||
func (re *Regexp) findAll(b []byte, n int) [][]int {
|
||||
if n < 0 {
|
||||
n = len(b)
|
||||
}
|
||||
matchData := re.matchData
|
||||
offset := 0
|
||||
|
||||
capture := make([][]int, 0, numMatchStartSize)
|
||||
var offset int
|
||||
for offset <= n {
|
||||
if matchData.count >= len(matchData.indexes) {
|
||||
length := len(matchData.indexes[0])
|
||||
matchData.indexes = append(matchData.indexes, make([]int32, length))
|
||||
}
|
||||
if match := re.find(b, n, offset); len(match) > 0 {
|
||||
matchData.count += 1
|
||||
//move offset to the ending index of the current match and prepare to find the next non-overlapping match
|
||||
offset = match[1]
|
||||
//if match[0] == match[1], it means the current match does not advance the search. we need to exit the loop to avoid getting stuck here.
|
||||
if match[0] == match[1] {
|
||||
if offset < n && offset >= 0 {
|
||||
//there are more bytes, so move offset by a word
|
||||
_, width := utf8.DecodeRune(b[offset:])
|
||||
offset += width
|
||||
} else {
|
||||
//search is over, exit loop
|
||||
break
|
||||
}
|
||||
}
|
||||
} else {
|
||||
match := re.find(b, n, offset)
|
||||
if match == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
matches2 := matchData.indexes[:matchData.count]
|
||||
matches = make([][]int, len(matches2))
|
||||
for i, v := range matches2 {
|
||||
matches[i] = make([]int, len(v))
|
||||
for j, v2 := range v {
|
||||
matches[i][j] = int(v2)
|
||||
|
||||
capture = append(capture, match)
|
||||
|
||||
// move offset to the ending index of the current match and prepare to
|
||||
// find the next non-overlapping match.
|
||||
offset = match[1]
|
||||
|
||||
// if match[0] == match[1], it means the current match does not advance
|
||||
// the search. we need to exit the loop to avoid getting stuck here.
|
||||
if match[0] == match[1] {
|
||||
if offset < n && offset >= 0 {
|
||||
//there are more bytes, so move offset by a word
|
||||
_, width := utf8.DecodeRune(b[offset:])
|
||||
offset += width
|
||||
} else {
|
||||
//search is over, exit loop
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
|
||||
return capture
|
||||
}
|
||||
|
||||
func (re *Regexp) FindIndex(b []byte) []int {
|
||||
re.ClearMatchData()
|
||||
match := re.find(b, len(b), 0)
|
||||
if len(match) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
return match[:2]
|
||||
}
|
||||
|
||||
|
@ -285,21 +273,21 @@ func (re *Regexp) Find(b []byte) []byte {
|
|||
if loc == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return getCapture(b, loc[0], loc[1])
|
||||
}
|
||||
|
||||
func (re *Regexp) FindString(s string) string {
|
||||
b := []byte(s)
|
||||
mb := re.Find(b)
|
||||
mb := re.Find([]byte(s))
|
||||
if mb == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
return string(mb)
|
||||
}
|
||||
|
||||
func (re *Regexp) FindStringIndex(s string) []int {
|
||||
b := []byte(s)
|
||||
return re.FindIndex(b)
|
||||
return re.FindIndex([]byte(s))
|
||||
}
|
||||
|
||||
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
|
||||
|
@ -307,6 +295,7 @@ func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
|
|||
if len(matches) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
return matches
|
||||
}
|
||||
|
||||
|
@ -315,10 +304,12 @@ func (re *Regexp) FindAll(b []byte, n int) [][]byte {
|
|||
if matches == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
matchBytes := make([][]byte, 0, len(matches))
|
||||
for _, match := range matches {
|
||||
matchBytes = append(matchBytes, getCapture(b, match[0], match[1]))
|
||||
}
|
||||
|
||||
return matchBytes
|
||||
}
|
||||
|
||||
|
@ -328,6 +319,7 @@ func (re *Regexp) FindAllString(s string, n int) []string {
|
|||
if matches == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
matchStrings := make([]string, 0, len(matches))
|
||||
for _, match := range matches {
|
||||
m := getCapture(b, match[0], match[1])
|
||||
|
@ -337,51 +329,50 @@ func (re *Regexp) FindAllString(s string, n int) []string {
|
|||
matchStrings = append(matchStrings, string(m))
|
||||
}
|
||||
}
|
||||
|
||||
return matchStrings
|
||||
|
||||
}
|
||||
|
||||
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
|
||||
b := []byte(s)
|
||||
return re.FindAllIndex(b, n)
|
||||
}
|
||||
|
||||
func (re *Regexp) findSubmatchIndex(b []byte) (match []int) {
|
||||
re.ClearMatchData()
|
||||
match = re.find(b, len(b), 0)
|
||||
return
|
||||
return re.FindAllIndex([]byte(s), n)
|
||||
}
|
||||
|
||||
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
|
||||
match := re.findSubmatchIndex(b)
|
||||
match := re.find(b, len(b), 0)
|
||||
if len(match) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
return match
|
||||
}
|
||||
|
||||
func (re *Regexp) FindSubmatch(b []byte) [][]byte {
|
||||
match := re.findSubmatchIndex(b)
|
||||
match := re.FindSubmatchIndex(b)
|
||||
if match == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
length := len(match) / 2
|
||||
if length == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
results := make([][]byte, 0, length)
|
||||
for i := 0; i < length; i++ {
|
||||
results = append(results, getCapture(b, match[2*i], match[2*i+1]))
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
func (re *Regexp) FindStringSubmatch(s string) []string {
|
||||
b := []byte(s)
|
||||
match := re.findSubmatchIndex(b)
|
||||
match := re.FindSubmatchIndex(b)
|
||||
if match == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
length := len(match) / 2
|
||||
if length == 0 {
|
||||
return nil
|
||||
|
@ -396,12 +387,12 @@ func (re *Regexp) FindStringSubmatch(s string) []string {
|
|||
results = append(results, string(cap))
|
||||
}
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
|
||||
b := []byte(s)
|
||||
return re.FindSubmatchIndex(b)
|
||||
return re.FindSubmatchIndex([]byte(s))
|
||||
}
|
||||
|
||||
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
|
||||
|
@ -409,6 +400,7 @@ func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
|
|||
if len(matches) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
return matches
|
||||
}
|
||||
|
||||
|
@ -417,6 +409,7 @@ func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
|
|||
if len(matches) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
allCapturedBytes := make([][][]byte, 0, len(matches))
|
||||
for _, match := range matches {
|
||||
length := len(match) / 2
|
||||
|
@ -424,6 +417,7 @@ func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
|
|||
for i := 0; i < length; i++ {
|
||||
capturedBytes = append(capturedBytes, getCapture(b, match[2*i], match[2*i+1]))
|
||||
}
|
||||
|
||||
allCapturedBytes = append(allCapturedBytes, capturedBytes)
|
||||
}
|
||||
|
||||
|
@ -432,10 +426,12 @@ func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
|
|||
|
||||
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
|
||||
b := []byte(s)
|
||||
|
||||
matches := re.findAll(b, n)
|
||||
if len(matches) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
allCapturedStrings := make([][]string, 0, len(matches))
|
||||
for _, match := range matches {
|
||||
length := len(match) / 2
|
||||
|
@ -448,14 +444,15 @@ func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
|
|||
capturedStrings = append(capturedStrings, string(cap))
|
||||
}
|
||||
}
|
||||
|
||||
allCapturedStrings = append(allCapturedStrings, capturedStrings)
|
||||
}
|
||||
|
||||
return allCapturedStrings
|
||||
}
|
||||
|
||||
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
|
||||
b := []byte(s)
|
||||
return re.FindAllSubmatchIndex(b, n)
|
||||
return re.FindAllSubmatchIndex([]byte(s), n)
|
||||
}
|
||||
|
||||
func (re *Regexp) Match(b []byte) bool {
|
||||
|
@ -463,44 +460,25 @@ func (re *Regexp) Match(b []byte) bool {
|
|||
}
|
||||
|
||||
func (re *Regexp) MatchString(s string) bool {
|
||||
b := []byte(s)
|
||||
return re.Match(b)
|
||||
return re.Match([]byte(s))
|
||||
}
|
||||
|
||||
func (re *Regexp) NumSubexp() int {
|
||||
return (int)(C.onig_number_of_captures(re.regex))
|
||||
}
|
||||
|
||||
func (re *Regexp) getNamedCapture(name []byte, capturedBytes [][]byte) []byte {
|
||||
nameStr := string(name)
|
||||
capNum := re.groupNameToId(nameStr)
|
||||
if capNum < 0 || capNum >= len(capturedBytes) {
|
||||
panic(fmt.Sprintf("capture group name (%q) has error\n", nameStr))
|
||||
}
|
||||
return capturedBytes[capNum]
|
||||
}
|
||||
|
||||
func (re *Regexp) getNumberedCapture(num int, capturedBytes [][]byte) []byte {
|
||||
//when named capture groups exist, numbered capture groups returns ""
|
||||
if re.namedGroupInfo == nil && num <= (len(capturedBytes)-1) && num >= 0 {
|
||||
return capturedBytes[num]
|
||||
}
|
||||
return ([]byte)("")
|
||||
}
|
||||
|
||||
func fillCapturedValues(repl []byte, _ []byte, capturedBytes map[string][]byte) []byte {
|
||||
replLen := len(repl)
|
||||
newRepl := make([]byte, 0, replLen*3)
|
||||
inEscapeMode := false
|
||||
inGroupNameMode := false
|
||||
groupName := make([]byte, 0, replLen)
|
||||
for index := 0; index < replLen; index += 1 {
|
||||
|
||||
var inGroupNameMode, inEscapeMode bool
|
||||
for index := 0; index < replLen; index++ {
|
||||
ch := repl[index]
|
||||
if inGroupNameMode && ch == byte('<') {
|
||||
} else if inGroupNameMode && ch == byte('>') {
|
||||
inGroupNameMode = false
|
||||
groupNameStr := string(groupName)
|
||||
capBytes := capturedBytes[groupNameStr]
|
||||
capBytes := capturedBytes[string(groupName)]
|
||||
newRepl = append(newRepl, capBytes...)
|
||||
groupName = groupName[:0] //reset the name
|
||||
} else if inGroupNameMode {
|
||||
|
@ -512,7 +490,7 @@ func fillCapturedValues(repl []byte, _ []byte, capturedBytes map[string][]byte)
|
|||
} else if inEscapeMode && ch == byte('k') && (index+1) < replLen && repl[index+1] == byte('<') {
|
||||
inGroupNameMode = true
|
||||
inEscapeMode = false
|
||||
index += 1 //bypass the next char '<'
|
||||
index++ //bypass the next char '<'
|
||||
} else if inEscapeMode {
|
||||
newRepl = append(newRepl, '\\')
|
||||
newRepl = append(newRepl, ch)
|
||||
|
@ -523,6 +501,7 @@ func fillCapturedValues(repl []byte, _ []byte, capturedBytes map[string][]byte)
|
|||
inEscapeMode = !inEscapeMode
|
||||
}
|
||||
}
|
||||
|
||||
return newRepl
|
||||
}
|
||||
|
||||
|
@ -532,10 +511,12 @@ func (re *Regexp) replaceAll(src, repl []byte, replFunc func([]byte, []byte, map
|
|||
if len(matches) == 0 {
|
||||
return src
|
||||
}
|
||||
|
||||
dest := make([]byte, 0, srcLen)
|
||||
for i, match := range matches {
|
||||
length := len(match) / 2
|
||||
capturedBytes := make(map[string][]byte)
|
||||
|
||||
if re.namedGroupInfo == nil {
|
||||
for j := 0; j < length; j++ {
|
||||
capturedBytes[strconv.Itoa(j)] = getCapture(src, match[2*j], match[2*j+1])
|
||||
|
@ -545,6 +526,7 @@ func (re *Regexp) replaceAll(src, repl []byte, replFunc func([]byte, []byte, map
|
|||
capturedBytes[name] = getCapture(src, match[2*j], match[2*j+1])
|
||||
}
|
||||
}
|
||||
|
||||
matchBytes := getCapture(src, match[0], match[1])
|
||||
newRepl := replFunc(repl, matchBytes, capturedBytes)
|
||||
prevEnd := 0
|
||||
|
@ -552,15 +534,19 @@ func (re *Regexp) replaceAll(src, repl []byte, replFunc func([]byte, []byte, map
|
|||
prevMatch := matches[i-1][:2]
|
||||
prevEnd = prevMatch[1]
|
||||
}
|
||||
|
||||
if match[0] > prevEnd && prevEnd >= 0 && match[0] <= srcLen {
|
||||
dest = append(dest, src[prevEnd:match[0]]...)
|
||||
}
|
||||
|
||||
dest = append(dest, newRepl...)
|
||||
}
|
||||
|
||||
lastEnd := matches[len(matches)-1][1]
|
||||
if lastEnd < srcLen && lastEnd >= 0 {
|
||||
dest = append(dest, src[lastEnd:]...)
|
||||
}
|
||||
|
||||
return dest
|
||||
}
|
||||
|
||||
|
@ -569,7 +555,7 @@ func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
|
|||
}
|
||||
|
||||
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
|
||||
return re.replaceAll(src, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte {
|
||||
return re.replaceAll(src, nil, func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte {
|
||||
return repl(matchBytes)
|
||||
})
|
||||
}
|
||||
|
@ -579,43 +565,44 @@ func (re *Regexp) ReplaceAllString(src, repl string) string {
|
|||
}
|
||||
|
||||
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
|
||||
srcB := []byte(src)
|
||||
destB := re.replaceAll(srcB, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte {
|
||||
return string(re.replaceAll([]byte(src), nil, func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte {
|
||||
return []byte(repl(string(matchBytes)))
|
||||
})
|
||||
return string(destB)
|
||||
}))
|
||||
}
|
||||
|
||||
// String returns the value of re.pattern.
func (re *Regexp) String() string {
|
||||
return re.pattern
|
||||
}
|
||||
|
||||
func grow_buffer(b []byte, offset int, n int) []byte {
|
||||
func growBuffer(b []byte, offset int, n int) []byte {
|
||||
if offset+n > cap(b) {
|
||||
buf := make([]byte, 2*cap(b)+n)
|
||||
copy(buf, b[:offset])
|
||||
return buf
|
||||
}
|
||||
|
||||
return b
|
||||
}
|
||||
|
||||
func fromReader(r io.RuneReader) []byte {
|
||||
b := make([]byte, numReadBufferStartSize)
|
||||
offset := 0
|
||||
var err error = nil
|
||||
for err == nil {
|
||||
|
||||
var offset int
|
||||
for {
|
||||
rune, runeWidth, err := r.ReadRune()
|
||||
if err == nil {
|
||||
b = grow_buffer(b, offset, runeWidth)
|
||||
writeWidth := utf8.EncodeRune(b[offset:], rune)
|
||||
if runeWidth != writeWidth {
|
||||
panic("reading rune width not equal to the written rune width")
|
||||
}
|
||||
offset += writeWidth
|
||||
} else {
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
b = growBuffer(b, offset, runeWidth)
|
||||
writeWidth := utf8.EncodeRune(b[offset:], rune)
|
||||
if runeWidth != writeWidth {
|
||||
panic("reading rune width not equal to the written rune width")
|
||||
}
|
||||
|
||||
offset += writeWidth
|
||||
}
|
||||
|
||||
return b[:offset]
|
||||
}
|
||||
|
||||
|
@ -644,25 +631,25 @@ func MatchString(pattern string, s string) (matched bool, error error) {
|
|||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
return re.MatchString(s), nil
|
||||
}
|
||||
|
||||
func (re *Regexp) Gsub(src, repl string) string {
|
||||
srcBytes := ([]byte)(src)
|
||||
replBytes := ([]byte)(repl)
|
||||
replaced := re.replaceAll(srcBytes, replBytes, fillCapturedValues)
|
||||
return string(replaced)
|
||||
return string(re.replaceAll([]byte(src), []byte(repl), fillCapturedValues))
|
||||
}
|
||||
|
||||
func (re *Regexp) GsubFunc(src string, replFunc func(string, map[string]string) string) string {
|
||||
srcBytes := ([]byte)(src)
|
||||
replaced := re.replaceAll(srcBytes, nil, func(_ []byte, matchBytes []byte, capturedBytes map[string][]byte) []byte {
|
||||
capturedStrings := make(map[string]string)
|
||||
for name, capBytes := range capturedBytes {
|
||||
capturedStrings[name] = string(capBytes)
|
||||
}
|
||||
matchString := string(matchBytes)
|
||||
return ([]byte)(replFunc(matchString, capturedStrings))
|
||||
})
|
||||
replaced := re.replaceAll([]byte(src), nil,
|
||||
func(_ []byte, matchBytes []byte, capturedBytes map[string][]byte) []byte {
|
||||
capturedStrings := make(map[string]string)
|
||||
for name, capBytes := range capturedBytes {
|
||||
capturedStrings[name] = string(capBytes)
|
||||
}
|
||||
matchString := string(matchBytes)
|
||||
return ([]byte)(replFunc(matchString, capturedStrings))
|
||||
},
|
||||
)
|
||||
|
||||
return string(replaced)
|
||||
}
|
||||
|
|
|
@ -1,22 +0,0 @@
|
|||
Copyright (c) 2013 Caleb Spare
|
||||
|
||||
MIT License
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@ -1,7 +0,0 @@
|
|||
# Trie
|
||||
|
||||
[![GoDoc](http://godoc.org/github.com/toqueteos/trie?status.png)](http://godoc.org/github.com/toqueteos/trie)
|
||||
|
||||
This is a fork of https://github.com/cespare/go-trie that adds the `PrefixIndex` method.
|
||||
|
||||
It's required for https://github.com/toqueteos/substring.
|
|
@ -1 +0,0 @@
|
|||
module github.com/toqueteos/trie
|
|
@ -1,102 +0,0 @@
|
|||
// Package trie is an implementation of a trie (prefix tree) data structure over byte slices. It provides a
|
||||
// small and simple API for usage as a set as well as a 'Node' API for walking the trie.
|
||||
package trie
|
||||
|
||||
// Trie is a prefix tree keyed on the individual bytes of its elements.
type Trie struct {
	root *Node
}

// New returns an empty Trie whose root node is already allocated.
func New() *Trie {
	t := new(Trie)
	t.root = new(Node)
	return t
}

// Insert adds b to the trie, creating any missing nodes along its path.
// It reports whether b was newly added; false means b was already present.
func (t *Trie) Insert(b []byte) bool {
	cur := t.root
	for i := 0; i < len(b); i++ {
		child := cur.branches[b[i]]
		if child == nil {
			child = new(Node)
			cur.branches[b[i]] = child
			cur.hasChildren = true
		}
		cur = child
	}
	added := !cur.terminal
	cur.terminal = true
	return added
}

// Contains reports whether b was previously inserted into t.
func (t *Trie) Contains(b []byte) bool {
	cur := t.root
	for i := 0; i < len(b); i++ {
		if cur = cur.branches[b[i]]; cur == nil {
			return false
		}
	}
	return cur.terminal
}

// PrefixIndex walks b from its first byte and stops at the first terminal
// node it reaches. It returns the zero-based index of the byte that reached
// that node, or -1 when the walk falls off the trie or b is exhausted first.
// For an empty b it returns 0 if the root itself is terminal.
func (t *Trie) PrefixIndex(b []byte) int {
	cur := t.root
	for i, c := range b {
		child, found := cur.Walk(c)
		switch {
		case !found:
			return -1
		case child.terminal:
			return i
		}
		cur = child
	}
	// Only reachable with a terminal cur when b is empty and the root is terminal.
	if cur.terminal {
		return len(b)
	}
	return -1
}

// Root returns the root node of t. A Trie built with New always has a
// non-nil root.
func (t *Trie) Root() *Node {
	return t.root
}

// Node is a single vertex of the trie, with one outgoing edge slot per byte value.
type Node struct {
	branches    [256]*Node
	terminal    bool
	hasChildren bool
}

// Walk follows the edge labelled c. ok reports whether such an edge exists;
// next is nil when it does not.
func (n *Node) Walk(c byte) (next *Node, ok bool) {
	next = n.branches[c]
	ok = next != nil
	return next, ok
}

// Terminal reports whether the path from the root to n spells an element of
// the set. If the root is terminal, the empty byte slice is an element.
func (n *Node) Terminal() bool {
	return n.terminal
}

// Leaf reports whether n has no children, i.e. the path to n is not a proper
// prefix of any other element.
func (n *Node) Leaf() bool {
	return !n.hasChildren
}
|
|
@ -1,24 +0,0 @@
|
|||
# Compiled Object files, Static and Dynamic libs (Shared Objects)
|
||||
*.o
|
||||
*.a
|
||||
*.so
|
||||
|
||||
# Folders
|
||||
_obj
|
||||
_test
|
||||
|
||||
# Architecture specific extensions/prefixes
|
||||
*.[568vq]
|
||||
[568vq].out
|
||||
|
||||
*.cgo1.go
|
||||
*.cgo2.c
|
||||
_cgo_defun.c
|
||||
_cgo_gotypes.go
|
||||
_cgo_export.*
|
||||
|
||||
_testmain.go
|
||||
|
||||
*.exe
|
||||
*.test
|
||||
*.prof
|
|
@ -1,11 +0,0 @@
|
|||
language: go
|
||||
|
||||
go:
|
||||
- 1.2
|
||||
- 1.3
|
||||
- 1.4
|
||||
- tip
|
||||
|
||||
script:
|
||||
- go get launchpad.net/gocheck
|
||||
- go test
|
|
@ -1,22 +0,0 @@
|
|||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Carlos Cobo
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
|
@ -1,80 +0,0 @@
|
|||
# substring [![Build Status](https://travis-ci.org/toqueteos/substring.png?branch=master)](https://travis-ci.org/toqueteos/substring) [![GoDoc](http://godoc.org/github.com/toqueteos/substring?status.png)](http://godoc.org/github.com/toqueteos/substring) [![GitHub release](https://img.shields.io/github/release/toqueteos/substring.svg)](https://github.com/toqueteos/substring/releases)
|
||||
|
||||
Simple and composable alternative to [regexp](http://golang.org/pkg/regexp/) package for fast substring searches.
|
||||
|
||||
## Installation
|
||||
|
||||
The recommended way to install substring:
|
||||
|
||||
```
|
||||
go get -t gopkg.in/toqueteos/substring.v1
|
||||
```
|
||||
|
||||
The `-t` flag is for fetching [gocheck](https://gopkg.in/check.v1), required for tests and benchmarks.
|
||||
|
||||
## Examples
|
||||
|
||||
A basic example with two matchers:
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
|
||||
"gopkg.in/toqueteos/substring.v1"
|
||||
)
|
||||
|
||||
func main() {
|
||||
m1 := substring.After("assets/", substring.Or(
|
||||
substring.Has("jquery"),
|
||||
substring.Has("angular"),
|
||||
substring.Suffixes(".js", ".css", ".html"),
|
||||
))
|
||||
fmt.Println(m1.Match("assets/angular/foo/bar")) //Prints: true
|
||||
fmt.Println(m1.Match("assets/js/file.js")) //Prints: true
|
||||
fmt.Println(m1.Match("assets/style/bar.css")) //Prints: true
|
||||
fmt.Println(m1.Match("assets/foo/bar.html")) //Prints: true
|
||||
fmt.Println(m1.Match("assets/js/qux.json")) //Prints: false
|
||||
fmt.Println(m1.Match("core/file.html")) //Prints: false
|
||||
fmt.Println(m1.Match("foobar/that.jsx")) //Prints: false
|
||||
|
||||
m2 := substring.After("vendor/", substring.Suffixes(".css", ".js", ".less"))
|
||||
|
||||
fmt.Println(m2.Match("foo/vendor/bar/qux.css")) //Prints: true
|
||||
fmt.Println(m2.Match("foo/var/qux.less")) //Prints: false
|
||||
|
||||
re := regexp.MustCompile(`vendor\/.*\.(css|js|less)$`)
|
||||
fmt.Println(re.MatchString("foo/vendor/bar/qux.css")) //Prints: true
|
||||
fmt.Println(re.MatchString("foo/var/qux.less")) //Prints: false
|
||||
}
|
||||
```
|
||||
|
||||
## How fast?
|
||||
|
||||
It may vary depending on your use case but 1~2 orders of magnitude faster than `regexp` is pretty common.
|
||||
|
||||
Test it out for yourself by running `go test -check.b`!
|
||||
|
||||
```
|
||||
$ go test -check.b
|
||||
PASS: lib_test.go:18: LibSuite.BenchmarkExample1 10000000 221 ns/op
|
||||
PASS: lib_test.go:23: LibSuite.BenchmarkExample2 10000000 229 ns/op
|
||||
PASS: lib_test.go:28: LibSuite.BenchmarkExample3 10000000 216 ns/op
|
||||
PASS: lib_test.go:33: LibSuite.BenchmarkExample4 10000000 208 ns/op
|
||||
PASS: lib_test.go:38: LibSuite.BenchmarkExample5 20000000 82.1 ns/op
|
||||
PASS: lib_test.go:48: LibSuite.BenchmarkExampleRe1 500000 4136 ns/op
|
||||
PASS: lib_test.go:53: LibSuite.BenchmarkExampleRe2 500000 5222 ns/op
|
||||
PASS: lib_test.go:58: LibSuite.BenchmarkExampleRe3 500000 5116 ns/op
|
||||
PASS: lib_test.go:63: LibSuite.BenchmarkExampleRe4 500000 4020 ns/op
|
||||
PASS: lib_test.go:68: LibSuite.BenchmarkExampleRe5 10000000 226 ns/op
|
||||
OK: 10 passed
|
||||
PASS
|
||||
ok gopkg.in/toqueteos/substring.v1 23.471s
|
||||
```
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
MIT, see [LICENSE](LICENSE)
|
|
@ -1,229 +0,0 @@
|
|||
package substring
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
|
||||
"github.com/toqueteos/trie"
|
||||
)
|
||||
|
||||
// BytesMatcher is the interface satisfied by the byte-slice matchers in this file.
type BytesMatcher interface {
|
||||
// Match reports whether b satisfies the matcher.
Match(b []byte) bool
|
||||
// MatchIndex returns an index for a match; the implementations in this file use -1 to signal no match.
MatchIndex(b []byte) int
|
||||
}
|
||||
|
||||
// regexpBytes matches byte slices against a compiled regular expression.
type regexpBytes struct{ re *regexp.Regexp }

// BytesRegexp compiles pat with regexp.MustCompile (which panics on an
// invalid pattern) and returns a matcher backed by it.
func BytesRegexp(pat string) *regexpBytes { return &regexpBytes{regexp.MustCompile(pat)} }

// Match reports whether the expression matches anywhere in b.
func (m *regexpBytes) Match(b []byte) bool { return m.re.Match(b) }

// MatchIndex returns the end offset of the leftmost match in b, or -1 when
// there is no match.
func (m *regexpBytes) MatchIndex(b []byte) int {
	if loc := m.re.FindIndex(b); loc != nil {
		return loc[1]
	}
	return -1
}
|
||||
|
||||
// exactBytes matches only input that is byte-for-byte equal to pat.
type exactBytes struct{ pat []byte }

// BytesExact returns a matcher that accepts exactly the bytes of pat.
func BytesExact(pat string) *exactBytes { return &exactBytes{[]byte(pat)} }

// Match reports whether b has the same length and contents as the pattern.
func (m *exactBytes) Match(b []byte) bool {
	return bytes.Equal(m.pat, b)
}

// MatchIndex returns len(b) when b equals the pattern and -1 otherwise.
func (m *exactBytes) MatchIndex(b []byte) int {
	if m.Match(b) {
		return len(b)
	}
	return -1
}
|
||||
|
||||
// anyBytes looks for the input inside the stored pattern (the reverse of
// hasBytes).
type anyBytes struct {
	pat []byte
}

// BytesAny returns a matcher that accepts any input occurring within pat.
func BytesAny(pat string) *anyBytes { return &anyBytes{pat: []byte(pat)} }

// Match reports whether b occurs somewhere in the pattern.
func (m *anyBytes) Match(b []byte) bool { return bytes.Contains(m.pat, b) }

// MatchIndex returns the offset within the pattern just past the first
// occurrence of b, or -1 when b does not occur in it.
func (m *anyBytes) MatchIndex(b []byte) int {
	at := bytes.Index(m.pat, b)
	if at < 0 {
		return -1
	}
	return at + len(b)
}
|
||||
|
||||
// hasBytes looks for the stored pattern inside the input.
type hasBytes struct {
	pat []byte
}

// BytesHas returns a matcher that accepts any input containing pat.
func BytesHas(pat string) *hasBytes { return &hasBytes{pat: []byte(pat)} }

// Match reports whether the pattern occurs somewhere in b.
func (m *hasBytes) Match(b []byte) bool { return bytes.Contains(b, m.pat) }

// MatchIndex returns the offset in b just past the first occurrence of the
// pattern, or -1 when the pattern does not occur.
func (m *hasBytes) MatchIndex(b []byte) int {
	at := bytes.Index(b, m.pat)
	if at < 0 {
		return -1
	}
	return at + len(m.pat)
}
|
||||
|
||||
// prefixBytes accepts input that begins with pat.
type prefixBytes struct{ pat []byte }

// BytesPrefix returns a matcher for input starting with pat.
func BytesPrefix(pat string) *prefixBytes { return &prefixBytes{pat: []byte(pat)} }

// Match reports whether b starts with the pattern.
func (m *prefixBytes) Match(b []byte) bool { return bytes.HasPrefix(b, m.pat) }

// MatchIndex returns the pattern length when b starts with the pattern and
// -1 otherwise.
func (m *prefixBytes) MatchIndex(b []byte) int {
	if !m.Match(b) {
		return -1
	}
	return len(m.pat)
}
|
||||
|
||||
// prefixes
|
||||
type prefixesBytes struct {
|
||||
t *trie.Trie
|
||||
}
|
||||
|
||||
func BytesPrefixes(pats ...string) *prefixesBytes {
|
||||
t := trie.New()
|
||||
for _, pat := range pats {
|
||||
t.Insert([]byte(pat))
|
||||
}
|
||||
return &prefixesBytes{t}
|
||||
}
|
||||
func (m *prefixesBytes) Match(b []byte) bool { return m.t.PrefixIndex(b) >= 0 }
|
||||
func (m *prefixesBytes) MatchIndex(b []byte) int {
|
||||
if idx := m.t.PrefixIndex(b); idx >= 0 {
|
||||
return idx
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// suffixBytes accepts input that ends with pat.
type suffixBytes struct{ pat []byte }

// BytesSuffix returns a matcher for input ending with pat.
func BytesSuffix(pat string) *suffixBytes { return &suffixBytes{pat: []byte(pat)} }

// Match reports whether b ends with the pattern.
func (m *suffixBytes) Match(b []byte) bool { return bytes.HasSuffix(b, m.pat) }

// MatchIndex returns the pattern length when b ends with the pattern and -1
// otherwise.
// NOTE(review): this is the pattern length, not an offset into b - confirm
// that callers expect that.
func (m *suffixBytes) MatchIndex(b []byte) int {
	if !m.Match(b) {
		return -1
	}
	return len(m.pat)
}
|
||||
|
||||
// suffixes
|
||||
type suffixesBytes struct {
|
||||
t *trie.Trie
|
||||
}
|
||||
|
||||
func BytesSuffixes(pats ...string) *suffixesBytes {
|
||||
t := trie.New()
|
||||
for _, pat := range pats {
|
||||
t.Insert(reverse([]byte(pat)))
|
||||
}
|
||||
return &suffixesBytes{t}
|
||||
}
|
||||
func (m *suffixesBytes) Match(b []byte) bool {
|
||||
return m.t.PrefixIndex(reverse(b)) >= 0
|
||||
}
|
||||
func (m *suffixesBytes) MatchIndex(b []byte) int {
|
||||
if idx := m.t.PrefixIndex(reverse(b)); idx >= 0 {
|
||||
return idx
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// after
|
||||
type afterBytes struct {
|
||||
first []byte
|
||||
matcher BytesMatcher
|
||||
}
|
||||
|
||||
func BytesAfter(first string, m BytesMatcher) *afterBytes { return &afterBytes{[]byte(first), m} }
|
||||
func (a *afterBytes) Match(b []byte) bool {
|
||||
if idx := bytes.Index(b, a.first); idx >= 0 {
|
||||
return a.matcher.Match(b[idx+len(a.first):])
|
||||
}
|
||||
return false
|
||||
}
|
||||
func (a *afterBytes) MatchIndex(b []byte) int {
|
||||
if idx := bytes.Index(b, a.first); idx >= 0 {
|
||||
return idx + a.matcher.MatchIndex(b[idx:])
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// and, returns true iff all matchers return true
|
||||
type andBytes struct{ matchers []BytesMatcher }
|
||||
|
||||
func BytesAnd(m ...BytesMatcher) *andBytes { return &andBytes{m} }
|
||||
func (a *andBytes) Match(b []byte) bool {
|
||||
for _, m := range a.matchers {
|
||||
if !m.Match(b) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
func (a *andBytes) MatchIndex(b []byte) int {
|
||||
longest := 0
|
||||
for _, m := range a.matchers {
|
||||
if idx := m.MatchIndex(b); idx < 0 {
|
||||
return -1
|
||||
} else if idx > longest {
|
||||
longest = idx
|
||||
}
|
||||
}
|
||||
return longest
|
||||
}
|
||||
|
||||
// or, returns true iff any matcher returns true
|
||||
type orBytes struct{ matchers []BytesMatcher }
|
||||
|
||||
func BytesOr(m ...BytesMatcher) *orBytes { return &orBytes{m} }
|
||||
func (o *orBytes) Match(b []byte) bool {
|
||||
for _, m := range o.matchers {
|
||||
if m.Match(b) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
func (o *orBytes) MatchIndex(b []byte) int {
|
||||
for _, m := range o.matchers {
|
||||
if idx := m.MatchIndex(b); idx >= 0 {
|
||||
return idx
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
type suffixGroupBytes struct {
|
||||
suffix BytesMatcher
|
||||
matchers []BytesMatcher
|
||||
}
|
||||
|
||||
func BytesSuffixGroup(s string, m ...BytesMatcher) *suffixGroupBytes {
|
||||
return &suffixGroupBytes{BytesSuffix(s), m}
|
||||
}
|
||||
func (sg *suffixGroupBytes) Match(b []byte) bool {
|
||||
if sg.suffix.Match(b) {
|
||||
return BytesOr(sg.matchers...).Match(b)
|
||||
}
|
||||
return false
|
||||
}
|
||||
func (sg *suffixGroupBytes) MatchIndex(b []byte) int {
|
||||
if sg.suffix.MatchIndex(b) >= 0 {
|
||||
return BytesOr(sg.matchers...).MatchIndex(b)
|
||||
}
|
||||
return -1
|
||||
}
|
|
@ -1,10 +0,0 @@
|
|||
package substring
|
||||
|
||||
// reverse is a helper fn for Suffixes. It returns a new slice holding the
// bytes of b in reverse order; b itself is not modified, so callers may pass
// slices they do not own.
func reverse(b []byte) []byte {
	n := len(b)
	out := make([]byte, n)
	for i, c := range b {
		out[n-1-i] = c
	}
	return out
}
|
|
@ -1,216 +0,0 @@
|
|||
package substring
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/toqueteos/trie"
|
||||
)
|
||||
|
||||
// StringsMatcher is the interface satisfied by the string matchers in this file.
type StringsMatcher interface {
|
||||
// Match reports whether s satisfies the matcher.
Match(s string) bool
|
||||
// MatchIndex returns an index for a match; the implementations in this file use -1 to signal no match.
MatchIndex(s string) int
|
||||
}
|
||||
|
||||
// regexpString matches strings against a compiled regular expression.
type regexpString struct{ re *regexp.Regexp }

// Regexp compiles pat with regexp.MustCompile (which panics on an invalid
// pattern) and returns a matcher backed by it.
func Regexp(pat string) *regexpString { return &regexpString{regexp.MustCompile(pat)} }

// Match reports whether the expression matches anywhere in s.
func (m *regexpString) Match(s string) bool { return m.re.MatchString(s) }

// MatchIndex returns the end offset of the leftmost match in s, or -1 when
// there is no match.
func (m *regexpString) MatchIndex(s string) int {
	if loc := m.re.FindStringIndex(s); loc != nil {
		return loc[1]
	}
	return -1
}
|
||||
|
||||
// exactString accepts only input equal to pat.
type exactString struct{ pat string }

// Exact returns a matcher that accepts exactly pat.
func Exact(pat string) *exactString { return &exactString{pat: pat} }

// Match reports whether s equals the pattern.
func (m *exactString) Match(s string) bool { return s == m.pat }

// MatchIndex returns len(s) when s equals the pattern and -1 otherwise.
func (m *exactString) MatchIndex(s string) int {
	if s != m.pat {
		return -1
	}
	return len(s)
}
|
||||
|
||||
// anyString looks for the input inside the stored pattern (the reverse of
// hasString).
type anyString struct{ pat string }

// Any returns a matcher that accepts any input occurring within pat.
func Any(pat string) *anyString { return &anyString{pat: pat} }

// Match reports whether s occurs somewhere in the pattern.
func (m *anyString) Match(s string) bool {
	return strings.Contains(m.pat, s)
}

// MatchIndex returns the offset within the pattern just past the first
// occurrence of s, or -1 when s does not occur in it.
func (m *anyString) MatchIndex(s string) int {
	at := strings.Index(m.pat, s)
	if at < 0 {
		return -1
	}
	return at + len(s)
}
|
||||
|
||||
// hasString looks for the stored pattern inside the input.
type hasString struct{ pat string }

// Has returns a matcher that accepts any input containing pat.
func Has(pat string) *hasString { return &hasString{pat: pat} }

// Match reports whether the pattern occurs somewhere in s.
func (m *hasString) Match(s string) bool {
	return strings.Contains(s, m.pat)
}

// MatchIndex returns the offset in s just past the first occurrence of the
// pattern, or -1 when the pattern does not occur.
func (m *hasString) MatchIndex(s string) int {
	at := strings.Index(s, m.pat)
	if at < 0 {
		return -1
	}
	return at + len(m.pat)
}
|
||||
|
||||
// prefixString accepts input that begins with pat.
type prefixString struct{ pat string }

// Prefix returns a matcher for input starting with pat.
func Prefix(pat string) *prefixString { return &prefixString{pat: pat} }

// Match reports whether s starts with the pattern.
func (m *prefixString) Match(s string) bool { return strings.HasPrefix(s, m.pat) }

// MatchIndex returns the pattern length when s starts with the pattern and
// -1 otherwise.
func (m *prefixString) MatchIndex(s string) int {
	if !m.Match(s) {
		return -1
	}
	return len(m.pat)
}
|
||||
|
||||
// prefixes
|
||||
type prefixesString struct{ t *trie.Trie }
|
||||
|
||||
func Prefixes(pats ...string) *prefixesString {
|
||||
t := trie.New()
|
||||
for _, pat := range pats {
|
||||
t.Insert([]byte(pat))
|
||||
}
|
||||
return &prefixesString{t}
|
||||
}
|
||||
func (m *prefixesString) Match(s string) bool { return m.t.PrefixIndex([]byte(s)) >= 0 }
|
||||
func (m *prefixesString) MatchIndex(s string) int {
|
||||
if idx := m.t.PrefixIndex([]byte(s)); idx >= 0 {
|
||||
return idx
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// suffixString accepts input that ends with pat.
type suffixString struct{ pat string }

// Suffix returns a matcher for input ending with pat.
func Suffix(pat string) *suffixString { return &suffixString{pat: pat} }

// Match reports whether s ends with the pattern.
func (m *suffixString) Match(s string) bool { return strings.HasSuffix(s, m.pat) }

// MatchIndex returns the pattern length when s ends with the pattern and -1
// otherwise.
// NOTE(review): this is the pattern length, not an offset into s - confirm
// that callers expect that.
func (m *suffixString) MatchIndex(s string) int {
	if !m.Match(s) {
		return -1
	}
	return len(m.pat)
}
|
||||
|
||||
// suffixes
|
||||
type suffixesString struct{ t *trie.Trie }
|
||||
|
||||
func Suffixes(pats ...string) *suffixesString {
|
||||
t := trie.New()
|
||||
for _, pat := range pats {
|
||||
t.Insert(reverse([]byte(pat)))
|
||||
}
|
||||
return &suffixesString{t}
|
||||
}
|
||||
func (m *suffixesString) Match(s string) bool {
|
||||
return m.t.PrefixIndex(reverse([]byte(s))) >= 0
|
||||
}
|
||||
func (m *suffixesString) MatchIndex(s string) int {
|
||||
if idx := m.t.PrefixIndex(reverse([]byte(s))); idx >= 0 {
|
||||
return idx
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// after
|
||||
type afterString struct {
|
||||
first string
|
||||
matcher StringsMatcher
|
||||
}
|
||||
|
||||
func After(first string, m StringsMatcher) *afterString {
|
||||
return &afterString{first, m}
|
||||
}
|
||||
func (a *afterString) Match(s string) bool {
|
||||
if idx := strings.Index(s, a.first); idx >= 0 {
|
||||
return a.matcher.Match(s[idx+len(a.first):])
|
||||
}
|
||||
return false
|
||||
}
|
||||
func (a *afterString) MatchIndex(s string) int {
|
||||
if idx := strings.Index(s, a.first); idx >= 0 {
|
||||
return idx + a.matcher.MatchIndex(s[idx+len(a.first):])
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// and, returns true iff all matchers return true
|
||||
type andString struct{ matchers []StringsMatcher }
|
||||
|
||||
func And(m ...StringsMatcher) *andString { return &andString{m} }
|
||||
func (a *andString) Match(s string) bool {
|
||||
for _, m := range a.matchers {
|
||||
if !m.Match(s) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
func (a *andString) MatchIndex(s string) int {
|
||||
longest := 0
|
||||
for _, m := range a.matchers {
|
||||
if idx := m.MatchIndex(s); idx < 0 {
|
||||
return -1
|
||||
} else if idx > longest {
|
||||
longest = idx
|
||||
}
|
||||
}
|
||||
return longest
|
||||
}
|
||||
|
||||
// or, returns true iff any matcher returns true
|
||||
type orString struct{ matchers []StringsMatcher }
|
||||
|
||||
func Or(m ...StringsMatcher) *orString { return &orString{m} }
|
||||
func (o *orString) Match(s string) bool {
|
||||
for _, m := range o.matchers {
|
||||
if m.Match(s) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
func (o *orString) MatchIndex(s string) int {
|
||||
for _, m := range o.matchers {
|
||||
if idx := m.MatchIndex(s); idx >= 0 {
|
||||
return idx
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
type suffixGroupString struct {
|
||||
suffix StringsMatcher
|
||||
matchers []StringsMatcher
|
||||
}
|
||||
|
||||
func SuffixGroup(s string, m ...StringsMatcher) *suffixGroupString {
|
||||
return &suffixGroupString{Suffix(s), m}
|
||||
}
|
||||
func (sg *suffixGroupString) Match(s string) bool {
|
||||
if sg.suffix.Match(s) {
|
||||
return Or(sg.matchers...).Match(s)
|
||||
}
|
||||
return false
|
||||
}
|
||||
func (sg *suffixGroupString) MatchIndex(s string) int {
|
||||
if sg.suffix.MatchIndex(s) >= 0 {
|
||||
return Or(sg.matchers...).MatchIndex(s)
|
||||
}
|
||||
return -1
|
||||
}
|
|
@ -202,7 +202,7 @@ github.com/gliderlabs/ssh
|
|||
# github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a
|
||||
## explicit
|
||||
github.com/glycerine/go-unsnap-stream
|
||||
# github.com/go-enry/go-enry/v2 v2.3.0
|
||||
# github.com/go-enry/go-enry/v2 v2.5.2
|
||||
## explicit
|
||||
github.com/go-enry/go-enry/v2
|
||||
github.com/go-enry/go-enry/v2/data
|
||||
|
@ -210,7 +210,7 @@ github.com/go-enry/go-enry/v2/data/rule
|
|||
github.com/go-enry/go-enry/v2/internal/tokenizer
|
||||
github.com/go-enry/go-enry/v2/internal/tokenizer/flex
|
||||
github.com/go-enry/go-enry/v2/regex
|
||||
# github.com/go-enry/go-oniguruma v1.2.0
|
||||
# github.com/go-enry/go-oniguruma v1.2.1
|
||||
github.com/go-enry/go-oniguruma
|
||||
# github.com/go-git/gcfg v1.5.0
|
||||
github.com/go-git/gcfg
|
||||
|
@ -614,8 +614,6 @@ github.com/syndtr/goleveldb/leveldb/util
|
|||
# github.com/tinylib/msgp v1.1.2
|
||||
## explicit
|
||||
github.com/tinylib/msgp/msgp
|
||||
# github.com/toqueteos/trie v1.0.0
|
||||
github.com/toqueteos/trie
|
||||
# github.com/toqueteos/webbrowser v1.2.0
|
||||
github.com/toqueteos/webbrowser
|
||||
# github.com/tstranex/u2f v1.0.0
|
||||
|
@ -836,8 +834,6 @@ gopkg.in/ldap.v3
|
|||
# gopkg.in/testfixtures.v2 v2.5.0
|
||||
## explicit
|
||||
gopkg.in/testfixtures.v2
|
||||
# gopkg.in/toqueteos/substring.v1 v1.0.2
|
||||
gopkg.in/toqueteos/substring.v1
|
||||
# gopkg.in/warnings.v0 v0.1.2
|
||||
gopkg.in/warnings.v0
|
||||
# gopkg.in/yaml.v2 v2.2.8
|
||||
|
|
Loading…
Reference in New Issue