open-vault/vendor/github.com/client9/misspell/mime.go

211 lines
5.7 KiB
Go
Raw Normal View History

package misspell
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"net/http"
"os"
"path/filepath"
"strings"
)
// The number of possible binary formats is very large
// items that might be checked into a repo or be an
// artifact of a build. Additions welcome.
//
// Golang's internal table is very small and can't be
// relied on. Even then things like ".js" have a mime
// type of "application/javascipt" which isn't very helpful.
// "[x]" means we have sniff test and suffix test should be eliminated
var binary = map[string]bool{
".a": true, // [ ] archive
".bin": true, // [ ] binary
".bz2": true, // [ ] compression
".class": true, // [x] Java class file
".dll": true, // [ ] shared library
".exe": true, // [ ] binary
".gif": true, // [ ] image
".gpg": true, // [x] text, but really all base64
".gz": true, // [ ] compression
".ico": true, // [ ] image
".jar": true, // [x] archive
".jpeg": true, // [ ] image
".jpg": true, // [ ] image
".mp3": true, // [ ] audio
".mp4": true, // [ ] video
".mpeg": true, // [ ] video
".o": true, // [ ] object file
".pdf": true, // [x] pdf
".png": true, // [x] image
".pyc": true, // [ ] Python bytecode
".pyo": true, // [ ] Python bytecode
".so": true, // [x] shared library
".swp": true, // [ ] vim swap file
".tar": true, // [ ] archive
".tiff": true, // [ ] image
".woff": true, // [ ] font
".woff2": true, // [ ] font
".xz": true, // [ ] compression
".z": true, // [ ] compression
".zip": true, // [x] archive
}
// isBinaryFilename returns true if the file is likely to be binary
//
// Better heuristics could be done here, in particular a binary
// file is unlikely to be UTF-8 encoded. However this is cheap
// and will solve the immediate need of making sure common
// binary formats are not corrupted by mistake.
func isBinaryFilename(s string) bool {
return binary[strings.ToLower(filepath.Ext(s))]
}
var scm = map[string]bool{
".bzr": true,
".git": true,
".hg": true,
".svn": true,
"CVS": true,
}
// isSCMPath returns true if the path is likely part of a (private) SCM
// directory. E.g. ./git/something = true
func isSCMPath(s string) bool {
// hack for .git/COMMIT_EDITMSG and .git/TAG_EDITMSG
// normally we don't look at anything in .git
// but COMMIT_EDITMSG and TAG_EDITMSG are used as
// temp files for git commits. Allowing misspell to inspect
// these files allows for commit-msg hooks
// https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks
if strings.Contains(filepath.Base(s), "EDITMSG") {
return false
}
parts := strings.Split(filepath.Clean(s), string(filepath.Separator))
for _, dir := range parts {
if scm[dir] {
return true
}
}
return false
}
var magicHeaders = [][]byte{
// Issue #68
// PGP messages and signatures are "text" but really just
// blobs of base64-text and should not be misspell-checked
[]byte("-----BEGIN PGP MESSAGE-----"),
[]byte("-----BEGIN PGP SIGNATURE-----"),
// ELF
{0x7f, 0x45, 0x4c, 0x46},
// Postscript
{0x25, 0x21, 0x50, 0x53},
// PDF
{0x25, 0x50, 0x44, 0x46},
// Java class file
// https://en.wikipedia.org/wiki/Java_class_file
{0xCA, 0xFE, 0xBA, 0xBE},
// PNG
// https://en.wikipedia.org/wiki/Portable_Network_Graphics
{0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a},
// ZIP, JAR, ODF, OOXML
{0x50, 0x4B, 0x03, 0x04},
{0x50, 0x4B, 0x05, 0x06},
{0x50, 0x4B, 0x07, 0x08},
}
func isTextFile(raw []byte) bool {
for _, magic := range magicHeaders {
if bytes.HasPrefix(raw, magic) {
return false
}
}
// allow any text/ type with utf-8 encoding
// DetectContentType sometimes returns charset=utf-16 for XML stuff
// in which case ignore.
mime := http.DetectContentType(raw)
return strings.HasPrefix(mime, "text/") && strings.HasSuffix(mime, "charset=utf-8")
}
// ReadTextFile returns the contents of a file, first testing if it is a text file
// returns ("", nil) if not a text file
// returns ("", error) if error
// returns (string, nil) if text
//
// unfortunately, in worse case, this does
// 1 stat
// 1 open,read,close of 512 bytes
// 1 more stat,open, read everything, close (via ioutil.ReadAll)
// This could be kinder to the filesystem.
//
// This uses some heuristics of the file's extension (e.g. .zip, .txt) and
// uses a sniffer to determine if the file is text or not.
// Using file extensions isn't great, but probably
// good enough for real-world use.
// Golang's built in sniffer is problematic for differnet reasons. It's
// optimized for HTML, and is very limited in detection. It would be good
// to explicitly add some tests for ELF/DWARF formats to make sure we never
// corrupt binary files.
func ReadTextFile(filename string) (string, error) {
if isBinaryFilename(filename) {
return "", nil
}
if isSCMPath(filename) {
return "", nil
}
fstat, err := os.Stat(filename)
if err != nil {
return "", fmt.Errorf("Unable to stat %q: %s", filename, err)
}
// directory: nothing to do.
if fstat.IsDir() {
return "", nil
}
// avoid reading in multi-gig files
// if input is large, read the first 512 bytes to sniff type
// if not-text, then exit
isText := false
if fstat.Size() > 50000 {
fin, err := os.Open(filename)
if err != nil {
return "", fmt.Errorf("Unable to open large file %q: %s", filename, err)
}
defer fin.Close()
buf := make([]byte, 512)
_, err = io.ReadFull(fin, buf)
if err != nil {
return "", fmt.Errorf("Unable to read 512 bytes from %q: %s", filename, err)
}
if !isTextFile(buf) {
return "", nil
}
// set so we don't double check this file
isText = true
}
// read in whole file
raw, err := ioutil.ReadFile(filename)
if err != nil {
return "", fmt.Errorf("Unable to read all %q: %s", filename, err)
}
if !isText && !isTextFile(raw) {
return "", nil
}
return string(raw), nil
}